1 /* 2 * Copyright (c) 2003-2020 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * Copyright (c) 1989, 1993, 1995 35 * The Regents of the University of California. All rights reserved. 36 * 37 * This code is derived from software contributed to Berkeley by 38 * Poul-Henning Kamp of the FreeBSD Project. 39 * 40 * Redistribution and use in source and binary forms, with or without 41 * modification, are permitted provided that the following conditions 42 * are met: 43 * 1. Redistributions of source code must retain the above copyright 44 * notice, this list of conditions and the following disclaimer. 45 * 2. Redistributions in binary form must reproduce the above copyright 46 * notice, this list of conditions and the following disclaimer in the 47 * documentation and/or other materials provided with the distribution. 48 * 3. Neither the name of the University nor the names of its contributors 49 * may be used to endorse or promote products derived from this software 50 * without specific prior written permission. 51 * 52 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 53 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 54 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 55 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 56 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 57 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 58 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 59 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 60 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 61 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 62 * SUCH DAMAGE. 
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/uio.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/sysmsg.h>
#include <sys/spinlock.h>
#include <sys/proc.h>
#include <sys/nlookup.h>
#include <sys/filedesc.h>
#include <sys/fnv_hash.h>
#include <sys/globaldata.h>
#include <sys/kern_syscall.h>
#include <sys/dirent.h>
#include <ddb/ddb.h>

#include <sys/spinlock2.h>

#define MAX_RECURSION_DEPTH	64

/*
 * Random lookups in the cache are accomplished with a hash table using
 * a hash key of (nc_src_vp, name).  Each hash chain has its own spin lock,
 * but we use the ncp->update counter trick to avoid acquiring any
 * contestable spin-locks during a lookup.
 *
 * Negative entries may exist and correspond to resolved namecache
 * structures where nc_vp is NULL.  In a negative entry, NCF_WHITEOUT
 * will be set if the entry corresponds to a whited-out directory entry
 * (versus simply not finding the entry at all).  pcpu_ncache[n].neg_list
 * is locked via pcpu_ncache[n].neg_spin.
 *
 * MPSAFE RULES:
 *
 * (1) ncp's typically have at least a nc_refs of 1, and usually 2.  One
 *     is applicable to direct lookups via the hash table nchpp or via
 *     nc_list (the two are added or removed together).  Removal of the ncp
 *     from the hash table drops this reference.  The second is applicable
 *     to vp->v_namecache linkages (or negative list linkages), and removal
 *     of the ncp from these lists drops this reference.
 *
 *     On the 1->0 transition of nc_refs the ncp can no longer be referenced
 *     and must be destroyed.  No other thread should have access to it at
 *     this point so it can be safely locked and freed without any deadlock
 *     fears.
 *
 *     The 1->0 transition can occur at almost any juncture and so
 *     cache_drop() deals with it directly.
 *
 * (2) Once the 1->0 transition occurs, the entity that caused the transition
 *     will be responsible for destroying the ncp.  The ncp cannot be on any
 *     list or hash at this time, or be held by anyone other than the caller
 *     responsible for the transition.
 *
 * (3) A ncp must be locked in order to modify it.
 *
 * (4) ncp locks are ordered, child-to-parent.  Child first, then parent.
 *     This may seem backwards but forward-scans use the hash table and thus
 *     can hold the parent unlocked while traversing downward.  Deletions,
 *     on the other hand, tend to propagate bottom-up since the ref on the
 *     parent is dropped as the children go away.
 *
 * (5) Both parent and child must be locked in order to enter the child onto
 *     the parent's nc_list.
 */

/*
 * Structures associated with name caching.
 */
#define NCHHASH(hash)		(&nchashtbl[(hash) & nchash])
#define MINNEG			1024
#define MINPOS			1024
#define NCMOUNT_NUMCACHE	(16384)	/* power of 2 */
#define NCMOUNT_SET		(8)	/* power of 2 */

MALLOC_DEFINE_OBJ(M_VFSCACHE, sizeof(struct namecache),
		  "namecache", "namecache entries");
MALLOC_DEFINE(M_VFSCACHEAUX, "namecachestr", "namecache strings");

TAILQ_HEAD(nchash_list, namecache);

/*
 * Don't cachealign, but at least pad to 32 bytes so entries
 * don't cross a cache line.
152 */ 153 struct nchash_head { 154 struct nchash_list list; /* 16 bytes */ 155 struct spinlock spin; /* 8 bytes */ 156 long pad01; /* 8 bytes */ 157 }; 158 159 struct ncmount_cache { 160 struct spinlock spin; 161 struct namecache *ncp; 162 struct mount *mp; 163 struct mount *mp_target; 164 int isneg; 165 int ticks; 166 int updating; 167 int unused01; 168 }; 169 170 struct pcpu_ncache { 171 struct spinlock umount_spin; /* cache_findmount/interlock */ 172 struct spinlock neg_spin; /* for neg_list and neg_count */ 173 struct namecache_list neg_list; 174 long neg_count; 175 long vfscache_negs; 176 long vfscache_count; 177 long vfscache_leafs; 178 long numdefered; 179 } __cachealign; 180 181 __read_mostly static struct nchash_head *nchashtbl; 182 __read_mostly static struct pcpu_ncache *pcpu_ncache; 183 static struct ncmount_cache ncmount_cache[NCMOUNT_NUMCACHE]; 184 185 /* 186 * ncvp_debug - debug cache_fromvp(). This is used by the NFS server 187 * to create the namecache infrastructure leading to a dangling vnode. 188 * 189 * 0 Only errors are reported 190 * 1 Successes are reported 191 * 2 Successes + the whole directory scan is reported 192 * 3 Force the directory scan code run as if the parent vnode did not 193 * have a namecache record, even if it does have one. 194 */ 195 __read_mostly static int ncvp_debug; 196 SYSCTL_INT(_debug, OID_AUTO, ncvp_debug, CTLFLAG_RW, &ncvp_debug, 0, 197 "Namecache debug level (0-3)"); 198 199 __read_mostly static u_long nchash; /* size of hash table */ 200 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, 201 "Size of namecache hash table"); 202 203 __read_mostly static int ncnegflush = 10; /* burst for negative flush */ 204 SYSCTL_INT(_debug, OID_AUTO, ncnegflush, CTLFLAG_RW, &ncnegflush, 0, 205 "Batch flush negative entries"); 206 207 __read_mostly static int ncposflush = 10; /* burst for positive flush */ 208 SYSCTL_INT(_debug, OID_AUTO, ncposflush, CTLFLAG_RW, &ncposflush, 0, 209 "Batch flush positive entries"); 210 211 __read_mostly static int ncnegfactor = 16; /* ratio of negative entries */ 212 SYSCTL_INT(_debug, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0, 213 "Ratio of namecache negative entries"); 214 215 __read_mostly static int nclockwarn; /* warn on locked entries in ticks */ 216 SYSCTL_INT(_debug, OID_AUTO, nclockwarn, CTLFLAG_RW, &nclockwarn, 0, 217 "Warn on locked namecache entries in ticks"); 218 219 __read_mostly static int ncposlimit; /* number of cache entries allocated */ 220 SYSCTL_INT(_debug, OID_AUTO, ncposlimit, CTLFLAG_RW, &ncposlimit, 0, 221 "Number of cache entries allocated"); 222 223 __read_mostly static int ncp_shared_lock_disable = 0; 224 SYSCTL_INT(_debug, OID_AUTO, ncp_shared_lock_disable, CTLFLAG_RW, 225 &ncp_shared_lock_disable, 0, "Disable shared namecache locks"); 226 227 SYSCTL_INT(_debug, OID_AUTO, vnsize, CTLFLAG_RD, 0, sizeof(struct vnode), 228 "sizeof(struct vnode)"); 229 SYSCTL_INT(_debug, OID_AUTO, ncsize, CTLFLAG_RD, 0, sizeof(struct namecache), 230 "sizeof(struct namecache)"); 231 232 __read_mostly static int ncmount_cache_enable = 1; 233 SYSCTL_INT(_debug, OID_AUTO, ncmount_cache_enable, CTLFLAG_RW, 234 &ncmount_cache_enable, 0, "mount point cache"); 235 236 static __inline void _cache_drop(struct namecache *ncp); 237 static int cache_resolve_mp(struct mount *mp); 238 static int cache_findmount_callback(struct mount *mp, void *data); 239 static void _cache_setunresolved(struct namecache *ncp); 240 static void _cache_cleanneg(long count); 241 static void _cache_cleanpos(long count); 242 
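/*
 * Illustrative sketch only (not a new interface): lookups key the
 * nchash_head buckets defined above with an FNV-1 hash of the component
 * name folded with the parent ncp pointer, and each bucket is protected
 * by its own spinlock.  Chain traversal looks roughly like the relink
 * done in cache_rename() further below; the exact match criteria shown
 * here are an assumption for illustration:
 *
 *	hash = fnv_32_buf(name, nlen, FNV1_32_INIT);
 *	hash = fnv_32_buf(&par, sizeof(par), hash);
 *	nchpp = NCHHASH(hash);
 *
 *	spin_lock(&nchpp->spin);
 *	TAILQ_FOREACH(ncp, &nchpp->list, nc_hash) {
 *		if (ncp->nc_parent == par && ncp->nc_nlen == nlen &&
 *		    bcmp(ncp->nc_name, name, nlen) == 0)
 *			break;
 *	}
 *	spin_unlock(&nchpp->spin);
 */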
static void _cache_cleandefered(void);
static void _cache_unlink(struct namecache *ncp);

/*
 * The new name cache statistics (these are rolled up globals and not
 * modified in the critical path, see struct pcpu_ncache).
 */
SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, "Name cache statistics");
static long vfscache_negs;
SYSCTL_LONG(_vfs_cache, OID_AUTO, numneg, CTLFLAG_RD, &vfscache_negs, 0,
    "Number of negative namecache entries");
static long vfscache_count;
SYSCTL_LONG(_vfs_cache, OID_AUTO, numcache, CTLFLAG_RD, &vfscache_count, 0,
    "Number of namecache entries");
static long vfscache_leafs;
SYSCTL_LONG(_vfs_cache, OID_AUTO, numleafs, CTLFLAG_RD, &vfscache_leafs, 0,
    "Number of namecache leaf entries");
static long numdefered;
SYSCTL_LONG(_debug, OID_AUTO, numdefered, CTLFLAG_RD, &numdefered, 0,
    "Number of deferred namecache zaps");


struct nchstats nchstats[SMP_MAXCPU];
/*
 * Export VFS cache effectiveness statistics to user-land.
 *
 * The statistics are left for aggregation to user-land so
 * neat things can be achieved, like observing per-CPU cache
 * distribution.
 */
static int
sysctl_nchstats(SYSCTL_HANDLER_ARGS)
{
	struct globaldata *gd;
	int i, error;

	error = 0;
	for (i = 0; i < ncpus; ++i) {
		gd = globaldata_find(i);
		if ((error = SYSCTL_OUT(req, (void *)&(*gd->gd_nchstats),
					sizeof(struct nchstats))))
			break;
	}

	return (error);
}
SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE|CTLFLAG_RD,
    0, 0, sysctl_nchstats, "S,nchstats", "VFS cache effectiveness statistics");

static int cache_zap(struct namecache *ncp);

/*
 * Cache mount points and namecache records in order to avoid unnecessary
 * atomic ops on mnt_refs and ncp->refs.  This improves concurrent SMP
 * performance and is particularly important on multi-socket systems to
 * reduce cache-line ping-ponging.
 *
 * Try to keep the pcpu structure within one cache line (~64 bytes).
300 */ 301 #define MNTCACHE_COUNT 32 /* power of 2, multiple of SET */ 302 #define MNTCACHE_SET 8 /* set associativity */ 303 304 struct mntcache_elm { 305 struct namecache *ncp; 306 struct mount *mp; 307 int ticks; 308 int unused01; 309 }; 310 311 struct mntcache { 312 struct mntcache_elm array[MNTCACHE_COUNT]; 313 } __cachealign; 314 315 static struct mntcache pcpu_mntcache[MAXCPU]; 316 317 static __inline 318 struct mntcache_elm * 319 _cache_mntcache_hash(void *ptr) 320 { 321 struct mntcache_elm *elm; 322 int hv; 323 324 hv = iscsi_crc32(&ptr, sizeof(ptr)) & (MNTCACHE_COUNT - 1); 325 elm = &pcpu_mntcache[mycpu->gd_cpuid].array[hv & ~(MNTCACHE_SET - 1)]; 326 327 return elm; 328 } 329 330 static 331 void 332 _cache_mntref(struct mount *mp) 333 { 334 struct mntcache_elm *elm; 335 struct mount *mpr; 336 int i; 337 338 elm = _cache_mntcache_hash(mp); 339 for (i = 0; i < MNTCACHE_SET; ++i) { 340 if (elm->mp == mp) { 341 mpr = atomic_swap_ptr((void *)&elm->mp, NULL); 342 if (__predict_true(mpr == mp)) 343 return; 344 if (mpr) 345 atomic_add_int(&mpr->mnt_refs, -1); 346 } 347 ++elm; 348 } 349 atomic_add_int(&mp->mnt_refs, 1); 350 } 351 352 static 353 void 354 _cache_mntrel(struct mount *mp) 355 { 356 struct mntcache_elm *elm; 357 struct mntcache_elm *best; 358 struct mount *mpr; 359 int delta1; 360 int delta2; 361 int i; 362 363 elm = _cache_mntcache_hash(mp); 364 best = elm; 365 for (i = 0; i < MNTCACHE_SET; ++i) { 366 if (elm->mp == NULL) { 367 mpr = atomic_swap_ptr((void *)&elm->mp, mp); 368 if (__predict_false(mpr != NULL)) { 369 atomic_add_int(&mpr->mnt_refs, -1); 370 } 371 elm->ticks = ticks; 372 return; 373 } 374 delta1 = ticks - best->ticks; 375 delta2 = ticks - elm->ticks; 376 if (delta2 > delta1 || delta1 < -1 || delta2 < -1) 377 best = elm; 378 ++elm; 379 } 380 mpr = atomic_swap_ptr((void *)&best->mp, mp); 381 best->ticks = ticks; 382 if (mpr) 383 atomic_add_int(&mpr->mnt_refs, -1); 384 } 385 386 /* 387 * Clears all cached mount points on all cpus. This routine should only 388 * be called when we are waiting for a mount to clear, e.g. so we can 389 * unmount. 390 */ 391 void 392 cache_clearmntcache(struct mount *target __unused) 393 { 394 int n; 395 396 for (n = 0; n < ncpus; ++n) { 397 struct mntcache *cache = &pcpu_mntcache[n]; 398 struct mntcache_elm *elm; 399 struct namecache *ncp; 400 struct mount *mp; 401 int i; 402 403 for (i = 0; i < MNTCACHE_COUNT; ++i) { 404 elm = &cache->array[i]; 405 if (elm->mp) { 406 mp = atomic_swap_ptr((void *)&elm->mp, NULL); 407 if (mp) 408 atomic_add_int(&mp->mnt_refs, -1); 409 } 410 if (elm->ncp) { 411 ncp = atomic_swap_ptr((void *)&elm->ncp, NULL); 412 if (ncp) 413 _cache_drop(ncp); 414 } 415 } 416 } 417 } 418 419 /* 420 * Namespace locking. The caller must already hold a reference to the 421 * namecache structure in order to lock/unlock it. The controlling entity 422 * in a 1->0 transition does not need to lock the ncp to dispose of it, 423 * as nobody else will have visibility to it at that point. 424 * 425 * Note that holding a locked namecache structure prevents other threads 426 * from making namespace changes (e.g. deleting or creating), prevents 427 * vnode association state changes by other threads, and prevents the 428 * namecache entry from being resolved or unresolved by other threads. 429 * 430 * An exclusive lock owner has full authority to associate/disassociate 431 * vnodes and resolve/unresolve the locked ncp. 432 * 433 * A shared lock owner only has authority to acquire the underlying vnode, 434 * if any. 
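 *
 * As an illustrative sketch only (nlookup() and the VFS syscall layer are
 * the real consumers), the usual pattern against the public nchandle API
 * is ref, lock, unlock, drop:
 *
 *	cache_hold(&nch);		ref the ncp and its mount
 *	cache_lock(&nch);		exclusive lock for namespace ops
 *	...
 *	cache_unlock(&nch);
 *	cache_drop(&nch);		may be the final 1->0 transition
 *
 * or equivalently cache_get(&nch, &target) / cache_put(&target), which
 * combine the ref+lock and unlock+drop steps and also re-resolve an ncp
 * whose vnode was reclaimed.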
435 * 436 * The primary lock field is nc_lockstatus. nc_locktd is set after the 437 * fact (when locking) or cleared prior to unlocking. 438 * 439 * WARNING! Holding a locked ncp will prevent a vnode from being destroyed 440 * or recycled, but it does NOT help you if the vnode had already 441 * initiated a recyclement. If this is important, use cache_get() 442 * rather then cache_lock() (and deal with the differences in the 443 * way the refs counter is handled). Or, alternatively, make an 444 * unconditional call to cache_validate() or cache_resolve() 445 * after cache_lock() returns. 446 */ 447 static __inline 448 void 449 _cache_lock(struct namecache *ncp) 450 { 451 int didwarn = 0; 452 int error; 453 454 error = lockmgr(&ncp->nc_lock, LK_EXCLUSIVE); 455 while (__predict_false(error == EWOULDBLOCK)) { 456 if (didwarn == 0) { 457 didwarn = ticks - nclockwarn; 458 kprintf("[diagnostic] cache_lock: " 459 "%s blocked on %p " 460 "\"%*.*s\"\n", 461 curthread->td_comm, ncp, 462 ncp->nc_nlen, ncp->nc_nlen, 463 ncp->nc_name); 464 } 465 error = lockmgr(&ncp->nc_lock, LK_EXCLUSIVE | LK_TIMELOCK); 466 } 467 if (__predict_false(didwarn)) { 468 kprintf("[diagnostic] cache_lock: " 469 "%s unblocked %*.*s after %d secs\n", 470 curthread->td_comm, 471 ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name, 472 (int)(ticks - didwarn) / hz); 473 } 474 } 475 476 /* 477 * Release a previously acquired lock. 478 * 479 * A concurrent shared-lock acquisition or acquisition/release can 480 * race bit 31 so only drop the ncp if bit 31 was set. 481 */ 482 static __inline 483 void 484 _cache_unlock(struct namecache *ncp) 485 { 486 lockmgr(&ncp->nc_lock, LK_RELEASE); 487 } 488 489 /* 490 * Lock ncp exclusively, non-blocking. Return 0 on success. 491 */ 492 static __inline 493 int 494 _cache_lock_nonblock(struct namecache *ncp) 495 { 496 int error; 497 498 error = lockmgr(&ncp->nc_lock, LK_EXCLUSIVE | LK_NOWAIT); 499 if (__predict_false(error != 0)) { 500 return(EWOULDBLOCK); 501 } 502 return 0; 503 } 504 505 /* 506 * This is a special form of _cache_lock() which only succeeds if 507 * it can get a pristine, non-recursive lock. The caller must have 508 * already ref'd the ncp. 509 * 510 * On success the ncp will be locked, on failure it will not. The 511 * ref count does not change either way. 512 * 513 * We want _cache_lock_special() (on success) to return a definitively 514 * usable vnode or a definitively unresolved ncp. 515 */ 516 static __inline 517 int 518 _cache_lock_special(struct namecache *ncp) 519 { 520 if (_cache_lock_nonblock(ncp) == 0) { 521 if (lockmgr_oneexcl(&ncp->nc_lock)) { 522 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) 523 _cache_setunresolved(ncp); 524 return 0; 525 } 526 _cache_unlock(ncp); 527 } 528 return EWOULDBLOCK; 529 } 530 531 /* 532 * Shared lock, guarantees vp held 533 * 534 * The shared lock holds vp on the 0->1 transition. It is possible to race 535 * another shared lock release, preventing the other release from dropping 536 * the vnode and clearing bit 31. 537 * 538 * If it is not set then we are responsible for setting it, and this 539 * responsibility does not race with anyone else. 
540 */ 541 static __inline 542 void 543 _cache_lock_shared(struct namecache *ncp) 544 { 545 int didwarn = 0; 546 int error; 547 548 error = lockmgr(&ncp->nc_lock, LK_SHARED | LK_TIMELOCK); 549 while (__predict_false(error == EWOULDBLOCK)) { 550 if (didwarn == 0) { 551 didwarn = ticks - nclockwarn; 552 kprintf("[diagnostic] cache_lock_shared: " 553 "%s blocked on %p " 554 "\"%*.*s\"\n", 555 curthread->td_comm, ncp, 556 ncp->nc_nlen, ncp->nc_nlen, 557 ncp->nc_name); 558 } 559 error = lockmgr(&ncp->nc_lock, LK_SHARED | LK_TIMELOCK); 560 } 561 if (__predict_false(didwarn)) { 562 kprintf("[diagnostic] cache_lock_shared: " 563 "%s unblocked %*.*s after %d secs\n", 564 curthread->td_comm, 565 ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name, 566 (int)(ticks - didwarn) / hz); 567 } 568 } 569 570 /* 571 * Shared lock, guarantees vp held. Non-blocking. Returns 0 on success 572 */ 573 static __inline 574 int 575 _cache_lock_shared_nonblock(struct namecache *ncp) 576 { 577 int error; 578 579 error = lockmgr(&ncp->nc_lock, LK_SHARED | LK_NOWAIT); 580 if (__predict_false(error != 0)) { 581 return(EWOULDBLOCK); 582 } 583 return 0; 584 } 585 586 /* 587 * This function tries to get a shared lock but will back-off to an 588 * exclusive lock if: 589 * 590 * (1) Some other thread is trying to obtain an exclusive lock 591 * (to prevent the exclusive requester from getting livelocked out 592 * by many shared locks). 593 * 594 * (2) The current thread already owns an exclusive lock (to avoid 595 * deadlocking). 596 * 597 * WARNING! On machines with lots of cores we really want to try hard to 598 * get a shared lock or concurrent path lookups can chain-react 599 * into a very high-latency exclusive lock. 600 * 601 * This is very evident in dsynth's initial scans. 602 */ 603 static __inline 604 int 605 _cache_lock_shared_special(struct namecache *ncp) 606 { 607 /* 608 * Only honor a successful shared lock (returning 0) if there is 609 * no exclusive request pending and the vnode, if present, is not 610 * in a reclaimed state. 611 */ 612 if (_cache_lock_shared_nonblock(ncp) == 0) { 613 if (__predict_true(!lockmgr_exclpending(&ncp->nc_lock))) { 614 if (ncp->nc_vp == NULL || 615 (ncp->nc_vp->v_flag & VRECLAIMED) == 0) { 616 return(0); 617 } 618 } 619 _cache_unlock(ncp); 620 return(EWOULDBLOCK); 621 } 622 623 /* 624 * Non-blocking shared lock failed. If we already own the exclusive 625 * lock just acquire another exclusive lock (instead of deadlocking). 626 * Otherwise acquire a shared lock. 627 */ 628 if (lockstatus(&ncp->nc_lock, curthread) == LK_EXCLUSIVE) { 629 _cache_lock(ncp); 630 return(0); 631 } 632 _cache_lock_shared(ncp); 633 return(0); 634 } 635 636 static __inline 637 int 638 _cache_lockstatus(struct namecache *ncp) 639 { 640 int status; 641 642 status = lockstatus(&ncp->nc_lock, curthread); 643 if (status == 0 || status == LK_EXCLOTHER) 644 status = -1; 645 return status; 646 } 647 648 /* 649 * cache_hold() and cache_drop() prevent the premature deletion of a 650 * namecache entry but do not prevent operations (such as zapping) on 651 * that namecache entry. 652 * 653 * This routine may only be called from outside this source module if 654 * nc_refs is already deterministically at least 1, such as being 655 * associated with e.g. a process, file descriptor, or some other entity. 656 * 657 * Only the above situations, similar situations within this module where 658 * the ref count is deterministically at least 1, or when the ncp is found 659 * via the nchpp (hash table) lookup, can bump nc_refs. 
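 *
 * A minimal sketch, assuming the caller already owns a deterministic ref
 * through an nchandle (for example one stored in a file descriptor); the
 * extra hold/drop pair can then never be the 1->0 transition:
 *
 *	struct namecache *ncp = nch->ncp;	(nch already holds a ref)
 *
 *	_cache_hold(ncp);			temporary extra ref
 *	...
 *	_cache_drop(ncp);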
 *
 * Very specifically, a ncp found via nc_list CANNOT bump nc_refs.  It
 * can still be removed from the nc_list, however, as long as the caller
 * can acquire its lock (in the wrong order).
 *
 * This is a rare case where callers are allowed to hold a spinlock,
 * so we can't ourselves.
 */
static __inline
struct namecache *
_cache_hold(struct namecache *ncp)
{
	KKASSERT(ncp->nc_refs > 0);
	atomic_add_int(&ncp->nc_refs, 1);

	return(ncp);
}

/*
 * Drop a cache entry.
 *
 * The 1->0 transition is special and requires the caller to destroy the
 * entry.  It means that the ncp is no longer on a nchpp list (since that
 * would mean there was still a ref).  The ncp could still be on a nc_list
 * but will not have any children of its own, again because nc_refs is now
 * 0 and children would have a ref to their parent.
 *
 * Once the 1->0 transition is made, nc_refs cannot be incremented again.
 */
static __inline
void
_cache_drop(struct namecache *ncp)
{
	if (atomic_fetchadd_int(&ncp->nc_refs, -1) == 1) {
		/*
		 * Executed unlocked (no need to lock on last drop)
		 */
		_cache_setunresolved(ncp);

		/*
		 * Scrap it.
		 */
		ncp->nc_refs = -1;	/* safety */
		if (ncp->nc_name)
			kfree(ncp->nc_name, M_VFSCACHEAUX);
		kfree_obj(ncp, M_VFSCACHE);
	}
}

/*
 * Link a new namecache entry to its parent and to the hash table.  Be
 * careful to avoid races if vhold() blocks in the future.
 *
 * Both ncp and par must be referenced and locked.  The reference is
 * transferred to the nchpp (and, most notably, NOT to the parent list).
 *
 * NOTE: The hash table spinlock is held across this call, we can't do
 *	 anything fancy.
 */
static void
_cache_link_parent(struct namecache *ncp, struct namecache *par,
		   struct nchash_head *nchpp)
{
	struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid];

	KKASSERT(ncp->nc_parent == NULL);
	ncp->nc_parent = par;
	ncp->nc_head = nchpp;

	/*
	 * Set inheritance flags.  Note that the parent flags may be
	 * stale due to getattr potentially not having been run yet
	 * (it gets run during nlookup()'s).
	 */
	ncp->nc_flag &= ~(NCF_SF_PNOCACHE | NCF_UF_PCACHE);
	if (par->nc_flag & (NCF_SF_NOCACHE | NCF_SF_PNOCACHE))
		ncp->nc_flag |= NCF_SF_PNOCACHE;
	if (par->nc_flag & (NCF_UF_CACHE | NCF_UF_PCACHE))
		ncp->nc_flag |= NCF_UF_PCACHE;

	/*
	 * Add to hash table and parent, adjust accounting
	 */
	TAILQ_INSERT_HEAD(&nchpp->list, ncp, nc_hash);
	atomic_add_long(&pn->vfscache_count, 1);
	if (TAILQ_EMPTY(&ncp->nc_list))
		atomic_add_long(&pn->vfscache_leafs, 1);

	if (TAILQ_EMPTY(&par->nc_list)) {
		TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry);
		atomic_add_long(&pn->vfscache_leafs, -1);
		/*
		 * Any vp associated with an ncp which has children must
		 * be held to prevent it from being recycled.
		 */
		if (par->nc_vp)
			vhold(par->nc_vp);
	} else {
		TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry);
	}
	_cache_hold(par);			/* add nc_parent ref */
}

/*
 * Remove the parent and hash associations from a namecache structure.
 * Drop the ref-count on the parent.  The caller receives the ref
 * from the ncp's nchpp linkage that was removed and may forward that
 * ref to a new linkage.
 *
 * The caller usually holds an additional ref on the ncp so the unlink
 * cannot be the final drop.
XXX should not be necessary now since the 771 * caller receives the ref from the nchpp linkage, assuming the ncp 772 * was linked in the first place. 773 * 774 * ncp must be locked, which means that there won't be any nc_parent 775 * removal races. This routine will acquire a temporary lock on 776 * the parent as well as the appropriate hash chain. 777 */ 778 static void 779 _cache_unlink_parent(struct namecache *ncp) 780 { 781 struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid]; 782 struct namecache *par; 783 struct vnode *dropvp; 784 struct nchash_head *nchpp; 785 786 if ((par = ncp->nc_parent) != NULL) { 787 cpu_ccfence(); 788 KKASSERT(ncp->nc_parent == par); 789 790 /* don't add a ref, we drop the nchpp ref later */ 791 _cache_lock(par); 792 nchpp = ncp->nc_head; 793 spin_lock(&nchpp->spin); 794 795 /* 796 * Remove from hash table and parent, adjust accounting 797 */ 798 TAILQ_REMOVE(&ncp->nc_head->list, ncp, nc_hash); 799 TAILQ_REMOVE(&par->nc_list, ncp, nc_entry); 800 atomic_add_long(&pn->vfscache_count, -1); 801 if (TAILQ_EMPTY(&ncp->nc_list)) 802 atomic_add_long(&pn->vfscache_leafs, -1); 803 804 dropvp = NULL; 805 if (TAILQ_EMPTY(&par->nc_list)) { 806 atomic_add_long(&pn->vfscache_leafs, 1); 807 if (par->nc_vp) 808 dropvp = par->nc_vp; 809 } 810 ncp->nc_parent = NULL; 811 ncp->nc_head = NULL; 812 spin_unlock(&nchpp->spin); 813 _cache_unlock(par); 814 _cache_drop(par); /* drop nc_parent ref */ 815 816 /* 817 * We can only safely vdrop with no spinlocks held. 818 */ 819 if (dropvp) 820 vdrop(dropvp); 821 } 822 } 823 824 /* 825 * Allocate a new namecache structure. Most of the code does not require 826 * zero-termination of the string but it makes vop_compat_ncreate() easier. 827 * 828 * The returned ncp will be locked and referenced. The ref is generally meant 829 * to be transfered to the nchpp linkage. 830 */ 831 static struct namecache * 832 cache_alloc(int nlen) 833 { 834 struct namecache *ncp; 835 836 ncp = kmalloc_obj(sizeof(*ncp), M_VFSCACHE, M_WAITOK|M_ZERO); 837 if (nlen) 838 ncp->nc_name = kmalloc(nlen + 1, M_VFSCACHEAUX, M_WAITOK); 839 ncp->nc_nlen = nlen; 840 ncp->nc_flag = NCF_UNRESOLVED; 841 ncp->nc_error = ENOTCONN; /* needs to be resolved */ 842 ncp->nc_refs = 1; 843 TAILQ_INIT(&ncp->nc_list); 844 lockinit(&ncp->nc_lock, "ncplk", hz, LK_CANRECURSE); 845 lockmgr(&ncp->nc_lock, LK_EXCLUSIVE); 846 847 return(ncp); 848 } 849 850 /* 851 * Can only be called for the case where the ncp has never been 852 * associated with anything (so no spinlocks are needed). 853 */ 854 static void 855 _cache_free(struct namecache *ncp) 856 { 857 KKASSERT(ncp->nc_refs == 1); 858 if (ncp->nc_name) 859 kfree(ncp->nc_name, M_VFSCACHEAUX); 860 kfree_obj(ncp, M_VFSCACHE); 861 } 862 863 /* 864 * [re]initialize a nchandle. 865 */ 866 void 867 cache_zero(struct nchandle *nch) 868 { 869 nch->ncp = NULL; 870 nch->mount = NULL; 871 } 872 873 /* 874 * Ref and deref a nchandle structure (ncp + mp) 875 * 876 * The caller must specify a stable ncp pointer, typically meaning the 877 * ncp is already referenced but this can also occur indirectly through 878 * e.g. holding a lock on a direct child. 879 * 880 * WARNING: Caller may hold an unrelated read spinlock, which means we can't 881 * use read spinlocks here. 882 */ 883 struct nchandle * 884 cache_hold(struct nchandle *nch) 885 { 886 _cache_hold(nch->ncp); 887 _cache_mntref(nch->mount); 888 return(nch); 889 } 890 891 /* 892 * Create a copy of a namecache handle for an already-referenced 893 * entry. 
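 *
 * A hedged usage sketch (the caller shown is illustrative): long-lived
 * handles such as the system root are copied and later returned through
 * the per-cpu element cache rather than paying global atomic ops on
 * every transition:
 *
 *	cache_copy(&rootnch, &nch);		adds a ref, possibly from cache
 *	...
 *	cache_drop_and_cache(&nch, 0);		ref parked in pcpu_mntcache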
894 */ 895 void 896 cache_copy(struct nchandle *nch, struct nchandle *target) 897 { 898 struct namecache *ncp; 899 struct mount *mp; 900 struct mntcache_elm *elm; 901 struct namecache *ncpr; 902 int i; 903 904 ncp = nch->ncp; 905 mp = nch->mount; 906 target->ncp = ncp; 907 target->mount = mp; 908 909 elm = _cache_mntcache_hash(ncp); 910 for (i = 0; i < MNTCACHE_SET; ++i) { 911 if (elm->ncp == ncp) { 912 ncpr = atomic_swap_ptr((void *)&elm->ncp, NULL); 913 if (ncpr == ncp) { 914 _cache_mntref(mp); 915 return; 916 } 917 if (ncpr) 918 _cache_drop(ncpr); 919 } 920 ++elm; 921 } 922 if (ncp) 923 _cache_hold(ncp); 924 _cache_mntref(mp); 925 } 926 927 /* 928 * Drop the nchandle, but try to cache the ref to avoid global atomic 929 * ops. This is typically done on the system root and jail root nchandles. 930 */ 931 void 932 cache_drop_and_cache(struct nchandle *nch, int elmno) 933 { 934 struct mntcache_elm *elm; 935 struct mntcache_elm *best; 936 struct namecache *ncpr; 937 int delta1; 938 int delta2; 939 int i; 940 941 if (elmno > 4) { 942 if (nch->ncp) { 943 _cache_drop(nch->ncp); 944 nch->ncp = NULL; 945 } 946 if (nch->mount) { 947 _cache_mntrel(nch->mount); 948 nch->mount = NULL; 949 } 950 return; 951 } 952 953 elm = _cache_mntcache_hash(nch->ncp); 954 best = elm; 955 for (i = 0; i < MNTCACHE_SET; ++i) { 956 if (elm->ncp == NULL) { 957 ncpr = atomic_swap_ptr((void *)&elm->ncp, nch->ncp); 958 _cache_mntrel(nch->mount); 959 elm->ticks = ticks; 960 nch->mount = NULL; 961 nch->ncp = NULL; 962 if (ncpr) 963 _cache_drop(ncpr); 964 return; 965 } 966 delta1 = ticks - best->ticks; 967 delta2 = ticks - elm->ticks; 968 if (delta2 > delta1 || delta1 < -1 || delta2 < -1) 969 best = elm; 970 ++elm; 971 } 972 ncpr = atomic_swap_ptr((void *)&best->ncp, nch->ncp); 973 _cache_mntrel(nch->mount); 974 best->ticks = ticks; 975 nch->mount = NULL; 976 nch->ncp = NULL; 977 if (ncpr) 978 _cache_drop(ncpr); 979 } 980 981 void 982 cache_changemount(struct nchandle *nch, struct mount *mp) 983 { 984 _cache_mntref(mp); 985 _cache_mntrel(nch->mount); 986 nch->mount = mp; 987 } 988 989 void 990 cache_drop(struct nchandle *nch) 991 { 992 _cache_mntrel(nch->mount); 993 _cache_drop(nch->ncp); 994 nch->ncp = NULL; 995 nch->mount = NULL; 996 } 997 998 int 999 cache_lockstatus(struct nchandle *nch) 1000 { 1001 return(_cache_lockstatus(nch->ncp)); 1002 } 1003 1004 void 1005 cache_lock(struct nchandle *nch) 1006 { 1007 _cache_lock(nch->ncp); 1008 } 1009 1010 void 1011 cache_lock_maybe_shared(struct nchandle *nch, int excl) 1012 { 1013 struct namecache *ncp = nch->ncp; 1014 1015 if (ncp_shared_lock_disable || excl || 1016 (ncp->nc_flag & NCF_UNRESOLVED)) { 1017 _cache_lock(ncp); 1018 } else { 1019 _cache_lock_shared(ncp); 1020 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 1021 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) { 1022 _cache_unlock(ncp); 1023 _cache_lock(ncp); 1024 } 1025 } else { 1026 _cache_unlock(ncp); 1027 _cache_lock(ncp); 1028 } 1029 } 1030 } 1031 1032 /* 1033 * Lock fncpd, fncp, tncpd, and tncp. tncp is already locked but may 1034 * have to be cycled to avoid deadlocks. Make sure all four are resolved. 1035 * 1036 * The caller is responsible for checking the validity upon return as 1037 * the records may have been flagged DESTROYED in the interim. 1038 * 1039 * Namecache lock ordering is leaf first, then parent. However, complex 1040 * interactions may occur between the source and target because there is 1041 * no ordering guarantee between (fncpd, fncp) and (tncpd and tncp). 
1042 */ 1043 void 1044 cache_lock4_tondlocked(struct nchandle *fncpd, struct nchandle *fncp, 1045 struct nchandle *tncpd, struct nchandle *tncp, 1046 struct ucred *fcred, struct ucred *tcred) 1047 { 1048 int tlocked = 1; 1049 1050 /* 1051 * Lock tncp and tncpd 1052 * 1053 * NOTE: Because these ncps are not locked to begin with, it is 1054 * possible for other rename races to cause the normal lock 1055 * order assumptions to fail. 1056 * 1057 * NOTE: Lock ordering assumptions are valid if a leaf's parent 1058 * matches after the leaf has been locked. However, ordering 1059 * between the 'from' and the 'to' is not and an overlapping 1060 * lock order reversal is still possible. 1061 */ 1062 again: 1063 if (__predict_false(tlocked == 0)) { 1064 cache_lock(tncp); 1065 } 1066 if (__predict_false(cache_lock_nonblock(tncpd) != 0)) { 1067 cache_unlock(tncp); 1068 cache_lock(tncpd); cache_unlock(tncpd); /* cycle */ 1069 tlocked = 0; 1070 goto again; 1071 } 1072 1073 /* 1074 * Lock fncp and fncpd 1075 * 1076 * NOTE: Because these ncps are not locked to begin with, it is 1077 * possible for other rename races to cause the normal lock 1078 * order assumptions to fail. 1079 * 1080 * NOTE: Lock ordering assumptions are valid if a leaf's parent 1081 * matches after the leaf has been locked. However, ordering 1082 * between the 'from' and the 'to' is not and an overlapping 1083 * lock order reversal is still possible. 1084 */ 1085 if (__predict_false(cache_lock_nonblock(fncp) != 0)) { 1086 cache_unlock(tncpd); 1087 cache_unlock(tncp); 1088 cache_lock(fncp); cache_unlock(fncp); /* cycle */ 1089 tlocked = 0; 1090 goto again; 1091 } 1092 if (__predict_false(cache_lock_nonblock(fncpd) != 0)) { 1093 cache_unlock(fncp); 1094 cache_unlock(tncpd); 1095 cache_unlock(tncp); 1096 cache_lock(fncpd); cache_unlock(fncpd); /* cycle */ 1097 tlocked = 0; 1098 goto again; 1099 } 1100 if (__predict_true((fncpd->ncp->nc_flag & NCF_DESTROYED) == 0)) 1101 cache_resolve(fncpd, fcred); 1102 if (__predict_true((tncpd->ncp->nc_flag & NCF_DESTROYED) == 0)) 1103 cache_resolve(tncpd, tcred); 1104 if (__predict_true((fncp->ncp->nc_flag & NCF_DESTROYED) == 0)) 1105 cache_resolve(fncp, fcred); 1106 if (__predict_true((tncp->ncp->nc_flag & NCF_DESTROYED) == 0)) 1107 cache_resolve(tncp, tcred); 1108 } 1109 1110 int 1111 cache_lock_nonblock(struct nchandle *nch) 1112 { 1113 return(_cache_lock_nonblock(nch->ncp)); 1114 } 1115 1116 void 1117 cache_unlock(struct nchandle *nch) 1118 { 1119 _cache_unlock(nch->ncp); 1120 } 1121 1122 /* 1123 * ref-and-lock, unlock-and-deref functions. 1124 * 1125 * This function is primarily used by nlookup. Even though cache_lock 1126 * holds the vnode, it is possible that the vnode may have already 1127 * initiated a recyclement. 1128 * 1129 * We want cache_get() to return a definitively usable vnode or a 1130 * definitively unresolved ncp. 1131 */ 1132 static 1133 struct namecache * 1134 _cache_get(struct namecache *ncp) 1135 { 1136 _cache_hold(ncp); 1137 _cache_lock(ncp); 1138 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) 1139 _cache_setunresolved(ncp); 1140 return(ncp); 1141 } 1142 1143 /* 1144 * Attempt to obtain a shared lock on the ncp. A shared lock will only 1145 * be obtained if the ncp is resolved and the vnode (if not ENOENT) is 1146 * valid. Otherwise an exclusive lock will be acquired instead. 
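 *
 * Illustrative sketch of the public wrapper as nlookup-style code might
 * use it; 'excl' here stands for whatever the caller computed (e.g.
 * whether the component is going to be modified):
 *
 *	cache_get_maybe_shared(&nch, &target, excl);
 *	... target.ncp is referenced and locked shared or exclusive ...
 *	cache_put(&target);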
1147 */ 1148 static 1149 struct namecache * 1150 _cache_get_maybe_shared(struct namecache *ncp, int excl) 1151 { 1152 if (ncp_shared_lock_disable || excl || 1153 (ncp->nc_flag & NCF_UNRESOLVED)) { 1154 return(_cache_get(ncp)); 1155 } 1156 _cache_hold(ncp); 1157 _cache_lock_shared(ncp); 1158 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 1159 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) { 1160 _cache_unlock(ncp); 1161 ncp = _cache_get(ncp); 1162 _cache_drop(ncp); 1163 } 1164 } else { 1165 _cache_unlock(ncp); 1166 ncp = _cache_get(ncp); 1167 _cache_drop(ncp); 1168 } 1169 return(ncp); 1170 } 1171 1172 /* 1173 * NOTE: The same nchandle can be passed for both arguments. 1174 */ 1175 void 1176 cache_get(struct nchandle *nch, struct nchandle *target) 1177 { 1178 KKASSERT(nch->ncp->nc_refs > 0); 1179 target->mount = nch->mount; 1180 target->ncp = _cache_get(nch->ncp); 1181 _cache_mntref(target->mount); 1182 } 1183 1184 void 1185 cache_get_maybe_shared(struct nchandle *nch, struct nchandle *target, int excl) 1186 { 1187 KKASSERT(nch->ncp->nc_refs > 0); 1188 target->mount = nch->mount; 1189 target->ncp = _cache_get_maybe_shared(nch->ncp, excl); 1190 _cache_mntref(target->mount); 1191 } 1192 1193 /* 1194 * Release a held and locked ncp 1195 */ 1196 static __inline 1197 void 1198 _cache_put(struct namecache *ncp) 1199 { 1200 _cache_unlock(ncp); 1201 _cache_drop(ncp); 1202 } 1203 1204 void 1205 cache_put(struct nchandle *nch) 1206 { 1207 _cache_mntrel(nch->mount); 1208 _cache_put(nch->ncp); 1209 nch->ncp = NULL; 1210 nch->mount = NULL; 1211 } 1212 1213 /* 1214 * Resolve an unresolved ncp by associating a vnode with it. If the 1215 * vnode is NULL, a negative cache entry is created. 1216 * 1217 * The ncp should be locked on entry and will remain locked on return. 1218 */ 1219 static 1220 void 1221 _cache_setvp(struct mount *mp, struct namecache *ncp, struct vnode *vp) 1222 { 1223 KKASSERT((ncp->nc_flag & NCF_UNRESOLVED) && 1224 (_cache_lockstatus(ncp) == LK_EXCLUSIVE) && 1225 ncp->nc_vp == NULL); 1226 1227 if (vp) { 1228 /* 1229 * Any vp associated with an ncp which has children must 1230 * be held. Any vp associated with a locked ncp must be held. 1231 */ 1232 if (!TAILQ_EMPTY(&ncp->nc_list)) 1233 vhold(vp); 1234 spin_lock(&vp->v_spin); 1235 ncp->nc_vp = vp; 1236 TAILQ_INSERT_HEAD(&vp->v_namecache, ncp, nc_vnode); 1237 ++vp->v_namecache_count; 1238 _cache_hold(ncp); /* v_namecache assoc */ 1239 spin_unlock(&vp->v_spin); 1240 vhold(vp); /* nc_vp */ 1241 1242 /* 1243 * Set auxiliary flags 1244 */ 1245 switch(vp->v_type) { 1246 case VDIR: 1247 ncp->nc_flag |= NCF_ISDIR; 1248 break; 1249 case VLNK: 1250 ncp->nc_flag |= NCF_ISSYMLINK; 1251 /* XXX cache the contents of the symlink */ 1252 break; 1253 default: 1254 break; 1255 } 1256 1257 ncp->nc_error = 0; 1258 1259 /* 1260 * XXX: this is a hack to work-around the lack of a real pfs vfs 1261 * implementation 1262 */ 1263 if (mp) { 1264 if (strncmp(mp->mnt_stat.f_fstypename, "null", 5) == 0) 1265 vp->v_pfsmp = mp; 1266 } 1267 } else { 1268 /* 1269 * When creating a negative cache hit we set the 1270 * namecache_gen. A later resolve will clean out the 1271 * negative cache hit if the mount point's namecache_gen 1272 * has changed. Used by devfs, could also be used by 1273 * other remote FSs. 
1274 */ 1275 struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid]; 1276 1277 ncp->nc_vp = NULL; 1278 ncp->nc_negcpu = mycpu->gd_cpuid; 1279 spin_lock(&pn->neg_spin); 1280 TAILQ_INSERT_TAIL(&pn->neg_list, ncp, nc_vnode); 1281 _cache_hold(ncp); /* neg_list assoc */ 1282 ++pn->neg_count; 1283 spin_unlock(&pn->neg_spin); 1284 atomic_add_long(&pn->vfscache_negs, 1); 1285 1286 ncp->nc_error = ENOENT; 1287 if (mp) 1288 VFS_NCPGEN_SET(mp, ncp); 1289 } 1290 ncp->nc_flag &= ~(NCF_UNRESOLVED | NCF_DEFEREDZAP); 1291 } 1292 1293 void 1294 cache_setvp(struct nchandle *nch, struct vnode *vp) 1295 { 1296 _cache_setvp(nch->mount, nch->ncp, vp); 1297 } 1298 1299 /* 1300 * Used for NFS 1301 */ 1302 void 1303 cache_settimeout(struct nchandle *nch, int nticks) 1304 { 1305 struct namecache *ncp = nch->ncp; 1306 1307 if ((ncp->nc_timeout = ticks + nticks) == 0) 1308 ncp->nc_timeout = 1; 1309 } 1310 1311 /* 1312 * Disassociate the vnode or negative-cache association and mark a 1313 * namecache entry as unresolved again. Note that the ncp is still 1314 * left in the hash table and still linked to its parent. 1315 * 1316 * The ncp should be locked and refd on entry and will remain locked and refd 1317 * on return. 1318 * 1319 * This routine is normally never called on a directory containing children. 1320 * However, NFS often does just that in its rename() code as a cop-out to 1321 * avoid complex namespace operations. This disconnects a directory vnode 1322 * from its namecache and can cause the OLDAPI and NEWAPI to get out of 1323 * sync. 1324 * 1325 */ 1326 static 1327 void 1328 _cache_setunresolved(struct namecache *ncp) 1329 { 1330 struct vnode *vp; 1331 1332 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 1333 ncp->nc_flag |= NCF_UNRESOLVED; 1334 ncp->nc_timeout = 0; 1335 ncp->nc_error = ENOTCONN; 1336 if ((vp = ncp->nc_vp) != NULL) { 1337 spin_lock(&vp->v_spin); 1338 ncp->nc_vp = NULL; 1339 TAILQ_REMOVE(&vp->v_namecache, ncp, nc_vnode); 1340 --vp->v_namecache_count; 1341 spin_unlock(&vp->v_spin); 1342 1343 /* 1344 * Any vp associated with an ncp with children is 1345 * held by that ncp. Any vp associated with ncp 1346 * is held by that ncp. These conditions must be 1347 * undone when the vp is cleared out from the ncp. 1348 */ 1349 if (!TAILQ_EMPTY(&ncp->nc_list)) 1350 vdrop(vp); 1351 vdrop(vp); 1352 } else { 1353 struct pcpu_ncache *pn; 1354 1355 pn = &pcpu_ncache[ncp->nc_negcpu]; 1356 1357 atomic_add_long(&pn->vfscache_negs, -1); 1358 spin_lock(&pn->neg_spin); 1359 TAILQ_REMOVE(&pn->neg_list, ncp, nc_vnode); 1360 --pn->neg_count; 1361 spin_unlock(&pn->neg_spin); 1362 } 1363 ncp->nc_flag &= ~(NCF_WHITEOUT|NCF_ISDIR|NCF_ISSYMLINK); 1364 _cache_drop(ncp); /* from v_namecache or neg_list */ 1365 } 1366 } 1367 1368 /* 1369 * The cache_nresolve() code calls this function to automatically 1370 * set a resolved cache element to unresolved if it has timed out 1371 * or if it is a negative cache hit and the mount point namecache_gen 1372 * has changed. 1373 */ 1374 static __inline int 1375 _cache_auto_unresolve_test(struct mount *mp, struct namecache *ncp) 1376 { 1377 /* 1378 * Try to zap entries that have timed out. We have 1379 * to be careful here because locked leafs may depend 1380 * on the vnode remaining intact in a parent, so only 1381 * do this under very specific conditions. 
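 *
 * Note that the timeout test below compares the signed difference
 * (int)(ncp->nc_timeout - ticks) against zero rather than comparing the
 * raw values, so the check keeps working when the tick counter wraps.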
1382 */ 1383 if (ncp->nc_timeout && (int)(ncp->nc_timeout - ticks) < 0 && 1384 TAILQ_EMPTY(&ncp->nc_list)) { 1385 return 1; 1386 } 1387 1388 /* 1389 * If a resolved negative cache hit is invalid due to 1390 * the mount's namecache generation being bumped, zap it. 1391 */ 1392 if (ncp->nc_vp == NULL && VFS_NCPGEN_TEST(mp, ncp)) { 1393 return 1; 1394 } 1395 1396 /* 1397 * Otherwise we are good 1398 */ 1399 return 0; 1400 } 1401 1402 static __inline void 1403 _cache_auto_unresolve(struct mount *mp, struct namecache *ncp) 1404 { 1405 /* 1406 * Already in an unresolved state, nothing to do. 1407 */ 1408 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 1409 if (_cache_auto_unresolve_test(mp, ncp)) 1410 _cache_setunresolved(ncp); 1411 } 1412 } 1413 1414 void 1415 cache_setunresolved(struct nchandle *nch) 1416 { 1417 _cache_setunresolved(nch->ncp); 1418 } 1419 1420 /* 1421 * Determine if we can clear NCF_ISMOUNTPT by scanning the mountlist 1422 * looking for matches. This flag tells the lookup code when it must 1423 * check for a mount linkage and also prevents the directories in question 1424 * from being deleted or renamed. 1425 */ 1426 static 1427 int 1428 cache_clrmountpt_callback(struct mount *mp, void *data) 1429 { 1430 struct nchandle *nch = data; 1431 1432 if (mp->mnt_ncmounton.ncp == nch->ncp) 1433 return(1); 1434 if (mp->mnt_ncmountpt.ncp == nch->ncp) 1435 return(1); 1436 return(0); 1437 } 1438 1439 /* 1440 * Clear NCF_ISMOUNTPT on nch->ncp if it is no longer associated 1441 * with a mount point. 1442 */ 1443 void 1444 cache_clrmountpt(struct nchandle *nch) 1445 { 1446 int count; 1447 1448 count = mountlist_scan(cache_clrmountpt_callback, nch, 1449 MNTSCAN_FORWARD | MNTSCAN_NOBUSY | 1450 MNTSCAN_NOUNLOCK); 1451 if (count == 0) 1452 nch->ncp->nc_flag &= ~NCF_ISMOUNTPT; 1453 } 1454 1455 /* 1456 * Invalidate portions of the namecache topology given a starting entry. 1457 * The passed ncp is set to an unresolved state and: 1458 * 1459 * The passed ncp must be referenced and locked. The routine may unlock 1460 * and relock ncp several times, and will recheck the children and loop 1461 * to catch races. When done the passed ncp will be returned with the 1462 * reference and lock intact. 1463 * 1464 * CINV_DESTROY - Set a flag in the passed ncp entry indicating 1465 * that the physical underlying nodes have been 1466 * destroyed... as in deleted. For example, when 1467 * a directory is removed. This will cause record 1468 * lookups on the name to no longer be able to find 1469 * the record and tells the resolver to return failure 1470 * rather then trying to resolve through the parent. 1471 * 1472 * The topology itself, including ncp->nc_name, 1473 * remains intact. 1474 * 1475 * This only applies to the passed ncp, if CINV_CHILDREN 1476 * is specified the children are not flagged. 1477 * 1478 * CINV_CHILDREN - Set all children (recursively) to an unresolved 1479 * state as well. 1480 * 1481 * Note that this will also have the side effect of 1482 * cleaning out any unreferenced nodes in the topology 1483 * from the leaves up as the recursion backs out. 1484 * 1485 * Note that the topology for any referenced nodes remains intact, but 1486 * the nodes will be marked as having been destroyed and will be set 1487 * to an unresolved state. 1488 * 1489 * It is possible for cache_inval() to race a cache_resolve(), meaning that 1490 * the namecache entry may not actually be invalidated on return if it was 1491 * revalidated while recursing down into its children. 
This code guarentees 1492 * that the node(s) will go through an invalidation cycle, but does not 1493 * guarentee that they will remain in an invalidated state. 1494 * 1495 * Returns non-zero if a revalidation was detected during the invalidation 1496 * recursion, zero otherwise. Note that since only the original ncp is 1497 * locked the revalidation ultimately can only indicate that the original ncp 1498 * *MIGHT* no have been reresolved. 1499 * 1500 * DEEP RECURSION HANDLING - If a recursive invalidation recurses deeply we 1501 * have to avoid blowing out the kernel stack. We do this by saving the 1502 * deep namecache node and aborting the recursion, then re-recursing at that 1503 * node using a depth-first algorithm in order to allow multiple deep 1504 * recursions to chain through each other, then we restart the invalidation 1505 * from scratch. 1506 */ 1507 1508 struct cinvtrack { 1509 struct namecache *resume_ncp; 1510 int depth; 1511 }; 1512 1513 static int _cache_inval_internal(struct namecache *, int, struct cinvtrack *); 1514 1515 static 1516 int 1517 _cache_inval(struct namecache *ncp, int flags) 1518 { 1519 struct cinvtrack track; 1520 struct namecache *ncp2; 1521 int r; 1522 1523 track.depth = 0; 1524 track.resume_ncp = NULL; 1525 1526 for (;;) { 1527 r = _cache_inval_internal(ncp, flags, &track); 1528 if (track.resume_ncp == NULL) 1529 break; 1530 _cache_unlock(ncp); 1531 while ((ncp2 = track.resume_ncp) != NULL) { 1532 track.resume_ncp = NULL; 1533 _cache_lock(ncp2); 1534 _cache_inval_internal(ncp2, flags & ~CINV_DESTROY, 1535 &track); 1536 /*_cache_put(ncp2);*/ 1537 cache_zap(ncp2); 1538 } 1539 _cache_lock(ncp); 1540 } 1541 return(r); 1542 } 1543 1544 int 1545 cache_inval(struct nchandle *nch, int flags) 1546 { 1547 return(_cache_inval(nch->ncp, flags)); 1548 } 1549 1550 /* 1551 * Helper for _cache_inval(). The passed ncp is refd and locked and 1552 * remains that way on return, but may be unlocked/relocked multiple 1553 * times by the routine. 1554 */ 1555 static int 1556 _cache_inval_internal(struct namecache *ncp, int flags, struct cinvtrack *track) 1557 { 1558 struct namecache *nextkid; 1559 int rcnt = 0; 1560 1561 KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE); 1562 1563 _cache_setunresolved(ncp); 1564 if (flags & CINV_DESTROY) { 1565 ncp->nc_flag |= NCF_DESTROYED; 1566 ++ncp->nc_generation; 1567 } 1568 1569 while ((flags & CINV_CHILDREN) && 1570 (nextkid = TAILQ_FIRST(&ncp->nc_list)) != NULL 1571 ) { 1572 struct namecache *kid; 1573 int restart; 1574 1575 restart = 0; 1576 _cache_hold(nextkid); 1577 if (++track->depth > MAX_RECURSION_DEPTH) { 1578 track->resume_ncp = ncp; 1579 _cache_hold(ncp); 1580 ++rcnt; 1581 } 1582 while ((kid = nextkid) != NULL) { 1583 /* 1584 * Parent (ncp) must be locked for the iteration. 1585 */ 1586 nextkid = NULL; 1587 if (kid->nc_parent != ncp) { 1588 _cache_drop(kid); 1589 kprintf("cache_inval_internal restartA %s\n", 1590 ncp->nc_name); 1591 restart = 1; 1592 break; 1593 } 1594 if ((nextkid = TAILQ_NEXT(kid, nc_entry)) != NULL) 1595 _cache_hold(nextkid); 1596 1597 /* 1598 * Parent unlocked for this section to avoid 1599 * deadlocks. Then lock the kid and check for 1600 * races. 
1601 */ 1602 _cache_unlock(ncp); 1603 if (track->resume_ncp) { 1604 _cache_drop(kid); 1605 _cache_lock(ncp); 1606 break; 1607 } 1608 _cache_lock(kid); 1609 if (kid->nc_parent != ncp) { 1610 kprintf("cache_inval_internal " 1611 "restartB %s\n", 1612 ncp->nc_name); 1613 restart = 1; 1614 _cache_unlock(kid); 1615 _cache_drop(kid); 1616 _cache_lock(ncp); 1617 break; 1618 } 1619 if ((kid->nc_flag & NCF_UNRESOLVED) == 0 || 1620 TAILQ_FIRST(&kid->nc_list) 1621 ) { 1622 1623 rcnt += _cache_inval_internal(kid, 1624 flags & ~CINV_DESTROY, track); 1625 /*_cache_unlock(kid);*/ 1626 /*_cache_drop(kid);*/ 1627 cache_zap(kid); 1628 } else { 1629 cache_zap(kid); 1630 } 1631 1632 /* 1633 * Relock parent to continue scan 1634 */ 1635 _cache_lock(ncp); 1636 } 1637 if (nextkid) 1638 _cache_drop(nextkid); 1639 --track->depth; 1640 if (restart == 0) 1641 break; 1642 } 1643 1644 /* 1645 * Someone could have gotten in there while ncp was unlocked, 1646 * retry if so. 1647 */ 1648 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) 1649 ++rcnt; 1650 return (rcnt); 1651 } 1652 1653 /* 1654 * Invalidate a vnode's namecache associations. To avoid races against 1655 * the resolver we do not invalidate a node which we previously invalidated 1656 * but which was then re-resolved while we were in the invalidation loop. 1657 * 1658 * Returns non-zero if any namecache entries remain after the invalidation 1659 * loop completed. 1660 * 1661 * NOTE: Unlike the namecache topology which guarentees that ncp's will not 1662 * be ripped out of the topology while held, the vnode's v_namecache 1663 * list has no such restriction. NCP's can be ripped out of the list 1664 * at virtually any time if not locked, even if held. 1665 * 1666 * In addition, the v_namecache list itself must be locked via 1667 * the vnode's spinlock. 1668 */ 1669 int 1670 cache_inval_vp(struct vnode *vp, int flags) 1671 { 1672 struct namecache *ncp; 1673 struct namecache *next; 1674 1675 restart: 1676 spin_lock(&vp->v_spin); 1677 ncp = TAILQ_FIRST(&vp->v_namecache); 1678 if (ncp) 1679 _cache_hold(ncp); 1680 while (ncp) { 1681 /* loop entered with ncp held and vp spin-locked */ 1682 if ((next = TAILQ_NEXT(ncp, nc_vnode)) != NULL) 1683 _cache_hold(next); 1684 spin_unlock(&vp->v_spin); 1685 _cache_lock(ncp); 1686 if (ncp->nc_vp != vp) { 1687 kprintf("Warning: cache_inval_vp: race-A detected on " 1688 "%s\n", ncp->nc_name); 1689 _cache_put(ncp); 1690 if (next) 1691 _cache_drop(next); 1692 goto restart; 1693 } 1694 _cache_inval(ncp, flags); 1695 _cache_put(ncp); /* also releases reference */ 1696 ncp = next; 1697 spin_lock(&vp->v_spin); 1698 if (ncp && ncp->nc_vp != vp) { 1699 spin_unlock(&vp->v_spin); 1700 kprintf("Warning: cache_inval_vp: race-B detected on " 1701 "%s\n", ncp->nc_name); 1702 _cache_drop(ncp); 1703 goto restart; 1704 } 1705 } 1706 spin_unlock(&vp->v_spin); 1707 return(TAILQ_FIRST(&vp->v_namecache) != NULL); 1708 } 1709 1710 /* 1711 * This routine is used instead of the normal cache_inval_vp() when we 1712 * are trying to recycle otherwise good vnodes. 1713 * 1714 * Return 0 on success, non-zero if not all namecache records could be 1715 * disassociated from the vnode (for various reasons). 
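 *
 * A minimal sketch of the intended call site, assuming a non-blocking
 * vnode recycling path (the actual caller is not shown in this file):
 *
 *	if (cache_inval_vp_nonblock(vp)) {
 *		... some records remain associated, leave the vnode
 *		    alone for now ...
 *	}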
1716 */ 1717 int 1718 cache_inval_vp_nonblock(struct vnode *vp) 1719 { 1720 struct namecache *ncp; 1721 struct namecache *next; 1722 1723 spin_lock(&vp->v_spin); 1724 ncp = TAILQ_FIRST(&vp->v_namecache); 1725 if (ncp) 1726 _cache_hold(ncp); 1727 while (ncp) { 1728 /* loop entered with ncp held */ 1729 if ((next = TAILQ_NEXT(ncp, nc_vnode)) != NULL) 1730 _cache_hold(next); 1731 spin_unlock(&vp->v_spin); 1732 if (_cache_lock_nonblock(ncp)) { 1733 _cache_drop(ncp); 1734 if (next) 1735 _cache_drop(next); 1736 goto done; 1737 } 1738 if (ncp->nc_vp != vp) { 1739 kprintf("Warning: cache_inval_vp: race-A detected on " 1740 "%s\n", ncp->nc_name); 1741 _cache_put(ncp); 1742 if (next) 1743 _cache_drop(next); 1744 goto done; 1745 } 1746 _cache_inval(ncp, 0); 1747 _cache_put(ncp); /* also releases reference */ 1748 ncp = next; 1749 spin_lock(&vp->v_spin); 1750 if (ncp && ncp->nc_vp != vp) { 1751 spin_unlock(&vp->v_spin); 1752 kprintf("Warning: cache_inval_vp: race-B detected on " 1753 "%s\n", ncp->nc_name); 1754 _cache_drop(ncp); 1755 goto done; 1756 } 1757 } 1758 spin_unlock(&vp->v_spin); 1759 done: 1760 return(TAILQ_FIRST(&vp->v_namecache) != NULL); 1761 } 1762 1763 /* 1764 * Clears the universal directory search 'ok' flag. This flag allows 1765 * nlookup() to bypass normal vnode checks. This flag is a cached flag 1766 * so clearing it simply forces revalidation. 1767 */ 1768 void 1769 cache_inval_wxok(struct vnode *vp) 1770 { 1771 struct namecache *ncp; 1772 1773 spin_lock(&vp->v_spin); 1774 TAILQ_FOREACH(ncp, &vp->v_namecache, nc_vnode) { 1775 if (ncp->nc_flag & (NCF_WXOK | NCF_NOTX)) 1776 atomic_clear_short(&ncp->nc_flag, NCF_WXOK | NCF_NOTX); 1777 } 1778 spin_unlock(&vp->v_spin); 1779 } 1780 1781 /* 1782 * The source ncp has been renamed to the target ncp. All elements have been 1783 * locked, including the parent ncp's. 1784 * 1785 * The target ncp is destroyed (as a normal rename-over would destroy the 1786 * target file or directory). 1787 * 1788 * Because there may be references to the source ncp we cannot copy its 1789 * contents to the target. Instead the source ncp is relinked as the target 1790 * and the target ncp is removed from the namecache topology. 
1791 */ 1792 void 1793 cache_rename(struct nchandle *fnch, struct nchandle *tnch) 1794 { 1795 struct namecache *fncp = fnch->ncp; 1796 struct namecache *tncp = tnch->ncp; 1797 struct namecache *tncp_par; 1798 struct nchash_head *nchpp; 1799 u_int32_t hash; 1800 char *oname; 1801 char *nname; 1802 1803 ++fncp->nc_generation; 1804 ++tncp->nc_generation; 1805 if (tncp->nc_nlen) { 1806 nname = kmalloc(tncp->nc_nlen + 1, M_VFSCACHEAUX, M_WAITOK); 1807 bcopy(tncp->nc_name, nname, tncp->nc_nlen); 1808 nname[tncp->nc_nlen] = 0; 1809 } else { 1810 nname = NULL; 1811 } 1812 1813 /* 1814 * Rename fncp (unlink) 1815 */ 1816 _cache_unlink_parent(fncp); 1817 oname = fncp->nc_name; 1818 fncp->nc_name = nname; 1819 fncp->nc_nlen = tncp->nc_nlen; 1820 if (oname) 1821 kfree(oname, M_VFSCACHEAUX); 1822 1823 tncp_par = tncp->nc_parent; 1824 KKASSERT(tncp_par->nc_lock.lk_lockholder == curthread); 1825 1826 /* 1827 * Rename fncp (relink) 1828 */ 1829 hash = fnv_32_buf(fncp->nc_name, fncp->nc_nlen, FNV1_32_INIT); 1830 hash = fnv_32_buf(&tncp_par, sizeof(tncp_par), hash); 1831 nchpp = NCHHASH(hash); 1832 1833 spin_lock(&nchpp->spin); 1834 _cache_link_parent(fncp, tncp_par, nchpp); 1835 spin_unlock(&nchpp->spin); 1836 1837 /* 1838 * Get rid of the overwritten tncp (unlink) 1839 */ 1840 _cache_unlink(tncp); 1841 } 1842 1843 /* 1844 * Perform actions consistent with unlinking a file. The passed-in ncp 1845 * must be locked. 1846 * 1847 * The ncp is marked DESTROYED so it no longer shows up in searches, 1848 * and will be physically deleted when the vnode goes away. 1849 * 1850 * If the related vnode has no refs then we cycle it through vget()/vput() 1851 * to (possibly if we don't have a ref race) trigger a deactivation, 1852 * allowing the VFS to trivially detect and recycle the deleted vnode 1853 * via VOP_INACTIVE(). 1854 * 1855 * NOTE: _cache_rename() will automatically call _cache_unlink() on the 1856 * target ncp. 1857 */ 1858 void 1859 cache_unlink(struct nchandle *nch) 1860 { 1861 _cache_unlink(nch->ncp); 1862 } 1863 1864 static void 1865 _cache_unlink(struct namecache *ncp) 1866 { 1867 struct vnode *vp; 1868 1869 /* 1870 * Causes lookups to fail and allows another ncp with the same 1871 * name to be created under ncp->nc_parent. 1872 */ 1873 ncp->nc_flag |= NCF_DESTROYED; 1874 ++ncp->nc_generation; 1875 1876 /* 1877 * Attempt to trigger a deactivation. Set VREF_FINALIZE to 1878 * force action on the 1->0 transition. Do not destroy the 1879 * vp association if a vp is present (leave the destroyed ncp 1880 * resolved through the vp finalization). 1881 * 1882 * Cleanup the refs in the resolved-not-found case by setting 1883 * the ncp to an unresolved state. This improves our ability 1884 * to get rid of dead ncp elements in other cache_*() routines. 1885 */ 1886 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 1887 vp = ncp->nc_vp; 1888 if (vp) { 1889 atomic_set_int(&vp->v_refcnt, VREF_FINALIZE); 1890 if (VREFCNT(vp) <= 0) { 1891 if (vget(vp, LK_SHARED) == 0) 1892 vput(vp); 1893 } 1894 } else { 1895 _cache_setunresolved(ncp); 1896 } 1897 } 1898 } 1899 1900 /* 1901 * Return non-zero if the nch might be associated with an open and/or mmap()'d 1902 * file. The easy solution is to just return non-zero if the vnode has refs. 1903 * Used to interlock hammer2 reclaims (VREF_FINALIZE should already be set to 1904 * force the reclaim). 
1905 */ 1906 int 1907 cache_isopen(struct nchandle *nch) 1908 { 1909 struct vnode *vp; 1910 struct namecache *ncp = nch->ncp; 1911 1912 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0 && 1913 (vp = ncp->nc_vp) != NULL && 1914 VREFCNT(vp)) { 1915 return 1; 1916 } 1917 return 0; 1918 } 1919 1920 1921 /* 1922 * vget the vnode associated with the namecache entry. Resolve the namecache 1923 * entry if necessary. The passed ncp must be referenced and locked. If 1924 * the ncp is resolved it might be locked shared. 1925 * 1926 * lk_type may be LK_SHARED, LK_EXCLUSIVE. A ref'd, possibly locked 1927 * (depending on the passed lk_type) will be returned in *vpp with an error 1928 * of 0, or NULL will be returned in *vpp with a non-0 error code. The 1929 * most typical error is ENOENT, meaning that the ncp represents a negative 1930 * cache hit and there is no vnode to retrieve, but other errors can occur 1931 * too. 1932 * 1933 * The vget() can race a reclaim. If this occurs we re-resolve the 1934 * namecache entry. 1935 * 1936 * There are numerous places in the kernel where vget() is called on a 1937 * vnode while one or more of its namecache entries is locked. Releasing 1938 * a vnode never deadlocks against locked namecache entries (the vnode 1939 * will not get recycled while referenced ncp's exist). This means we 1940 * can safely acquire the vnode. In fact, we MUST NOT release the ncp 1941 * lock when acquiring the vp lock or we might cause a deadlock. 1942 * 1943 * NOTE: The passed-in ncp must be locked exclusively if it is initially 1944 * unresolved. If a reclaim race occurs the passed-in ncp will be 1945 * relocked exclusively before being re-resolved. 1946 */ 1947 int 1948 cache_vget(struct nchandle *nch, struct ucred *cred, 1949 int lk_type, struct vnode **vpp) 1950 { 1951 struct namecache *ncp; 1952 struct vnode *vp; 1953 int error; 1954 1955 ncp = nch->ncp; 1956 again: 1957 vp = NULL; 1958 if (ncp->nc_flag & NCF_UNRESOLVED) 1959 error = cache_resolve(nch, cred); 1960 else 1961 error = 0; 1962 1963 if (error == 0 && (vp = ncp->nc_vp) != NULL) { 1964 error = vget(vp, lk_type); 1965 if (error) { 1966 /* 1967 * VRECLAIM race 1968 * 1969 * The ncp may have been locked shared, we must relock 1970 * it exclusively before we can set it to unresolved. 1971 */ 1972 if (error == ENOENT) { 1973 kprintf("Warning: vnode reclaim race detected " 1974 "in cache_vget on %p (%s)\n", 1975 vp, ncp->nc_name); 1976 _cache_unlock(ncp); 1977 _cache_lock(ncp); 1978 _cache_setunresolved(ncp); 1979 goto again; 1980 } 1981 1982 /* 1983 * Not a reclaim race, some other error. 1984 */ 1985 KKASSERT(ncp->nc_vp == vp); 1986 vp = NULL; 1987 } else { 1988 KKASSERT(ncp->nc_vp == vp); 1989 KKASSERT((vp->v_flag & VRECLAIMED) == 0); 1990 } 1991 } 1992 if (error == 0 && vp == NULL) 1993 error = ENOENT; 1994 *vpp = vp; 1995 return(error); 1996 } 1997 1998 /* 1999 * Similar to cache_vget() but only acquires a ref on the vnode. The vnode 2000 * is already held by virtuue of the ncp being locked, but it might not be 2001 * referenced and while it is not referenced it can transition into the 2002 * VRECLAIMED state. 2003 * 2004 * NOTE: The passed-in ncp must be locked exclusively if it is initially 2005 * unresolved. If a reclaim race occurs the passed-in ncp will be 2006 * relocked exclusively before being re-resolved. 2007 * 2008 * NOTE: At the moment we have to issue a vget() on the vnode, even though 2009 * we are going to immediately release the lock, in order to resolve 2010 * potential reclamation races. 
Once we have a solid vnode ref that 2011 * was (at some point) interlocked via a vget(), the vnode will not 2012 * be reclaimed. 2013 * 2014 * NOTE: vhold counts (v_auxrefs) do not prevent reclamation. 2015 */ 2016 int 2017 cache_vref(struct nchandle *nch, struct ucred *cred, struct vnode **vpp) 2018 { 2019 struct namecache *ncp; 2020 struct vnode *vp; 2021 int error; 2022 int v; 2023 2024 ncp = nch->ncp; 2025 again: 2026 vp = NULL; 2027 if (ncp->nc_flag & NCF_UNRESOLVED) 2028 error = cache_resolve(nch, cred); 2029 else 2030 error = 0; 2031 2032 while (error == 0 && (vp = ncp->nc_vp) != NULL) { 2033 /* 2034 * Try a lockless ref of the vnode. VRECLAIMED transitions 2035 * use the vx_lock state and update-counter mechanism so we 2036 * can detect if one is in-progress or occurred. 2037 * 2038 * If we can successfully ref the vnode and interlock against 2039 * the update-counter mechanism, and VRECLAIMED is found to 2040 * not be set after that, we should be good. 2041 */ 2042 v = spin_access_start_only(&vp->v_spin); 2043 if (__predict_true(spin_access_check_inprog(v) == 0)) { 2044 vref_special(vp); 2045 if (__predict_false( 2046 spin_access_end_only(&vp->v_spin, v))) { 2047 vrele(vp); 2048 continue; 2049 } 2050 if (__predict_true((vp->v_flag & VRECLAIMED) == 0)) { 2051 break; 2052 } 2053 vrele(vp); 2054 kprintf("CACHE_VREF: IN-RECLAIM\n"); 2055 } 2056 2057 /* 2058 * Do it the slow way 2059 */ 2060 error = vget(vp, LK_SHARED); 2061 if (error) { 2062 /* 2063 * VRECLAIM race 2064 */ 2065 if (error == ENOENT) { 2066 kprintf("Warning: vnode reclaim race detected " 2067 "in cache_vget on %p (%s)\n", 2068 vp, ncp->nc_name); 2069 _cache_unlock(ncp); 2070 _cache_lock(ncp); 2071 _cache_setunresolved(ncp); 2072 goto again; 2073 } 2074 2075 /* 2076 * Not a reclaim race, some other error. 2077 */ 2078 KKASSERT(ncp->nc_vp == vp); 2079 vp = NULL; 2080 } else { 2081 KKASSERT(ncp->nc_vp == vp); 2082 KKASSERT((vp->v_flag & VRECLAIMED) == 0); 2083 /* caller does not want a lock */ 2084 vn_unlock(vp); 2085 } 2086 break; 2087 } 2088 if (error == 0 && vp == NULL) 2089 error = ENOENT; 2090 *vpp = vp; 2091 2092 return(error); 2093 } 2094 2095 /* 2096 * Return a referenced vnode representing the parent directory of 2097 * ncp. 2098 * 2099 * Because the caller has locked the ncp it should not be possible for 2100 * the parent ncp to go away. However, the parent can unresolve its 2101 * dvp at any time so we must be able to acquire a lock on the parent 2102 * to safely access nc_vp. 2103 * 2104 * We have to leave par unlocked when vget()ing dvp to avoid a deadlock, 2105 * so use vhold()/vdrop() while holding the lock to prevent dvp from 2106 * getting destroyed. 2107 * 2108 * NOTE: vhold() is allowed when dvp has 0 refs if we hold a 2109 * lock on the ncp in question.. 2110 */ 2111 struct vnode * 2112 cache_dvpref(struct namecache *ncp) 2113 { 2114 struct namecache *par; 2115 struct vnode *dvp; 2116 2117 dvp = NULL; 2118 if ((par = ncp->nc_parent) != NULL) { 2119 _cache_hold(par); 2120 _cache_lock(par); 2121 if ((par->nc_flag & NCF_UNRESOLVED) == 0) { 2122 if ((dvp = par->nc_vp) != NULL) 2123 vhold(dvp); 2124 } 2125 _cache_unlock(par); 2126 if (dvp) { 2127 if (vget(dvp, LK_SHARED) == 0) { 2128 vn_unlock(dvp); 2129 vdrop(dvp); 2130 /* return refd, unlocked dvp */ 2131 } else { 2132 vdrop(dvp); 2133 dvp = NULL; 2134 } 2135 } 2136 _cache_drop(par); 2137 } 2138 return(dvp); 2139 } 2140 2141 /* 2142 * Convert a directory vnode to a namecache record without any other 2143 * knowledge of the topology. 
This ONLY works with directory vnodes and 2144 * is ONLY used by the NFS server. dvp must be refd but unlocked, and the 2145 * returned ncp (if not NULL) will be held and unlocked. 2146 * 2147 * If 'makeit' is 0 and dvp has no existing namecache record, NULL is returned. 2148 * If 'makeit' is 1 we attempt to track-down and create the namecache topology 2149 * for dvp. This will fail only if the directory has been deleted out from 2150 * under the caller. 2151 * 2152 * Callers must always check for a NULL return no matter the value of 'makeit'. 2153 * 2154 * To avoid underflowing the kernel stack each recursive call increments 2155 * the makeit variable. 2156 */ 2157 2158 static int cache_inefficient_scan(struct nchandle *nch, struct ucred *cred, 2159 struct vnode *dvp, char *fakename); 2160 static int cache_fromdvp_try(struct vnode *dvp, struct ucred *cred, 2161 struct vnode **saved_dvp); 2162 2163 int 2164 cache_fromdvp(struct vnode *dvp, struct ucred *cred, int makeit, 2165 struct nchandle *nch) 2166 { 2167 struct vnode *saved_dvp; 2168 struct vnode *pvp; 2169 char *fakename; 2170 int error; 2171 2172 nch->ncp = NULL; 2173 nch->mount = dvp->v_mount; 2174 saved_dvp = NULL; 2175 fakename = NULL; 2176 2177 /* 2178 * Handle the makeit == 0 degenerate case 2179 */ 2180 if (makeit == 0) { 2181 spin_lock_shared(&dvp->v_spin); 2182 nch->ncp = TAILQ_FIRST(&dvp->v_namecache); 2183 if (nch->ncp) 2184 cache_hold(nch); 2185 spin_unlock_shared(&dvp->v_spin); 2186 } 2187 2188 /* 2189 * Loop until resolution, inside code will break out on error. 2190 */ 2191 while (makeit) { 2192 /* 2193 * Break out if we successfully acquire a working ncp. 2194 */ 2195 spin_lock_shared(&dvp->v_spin); 2196 nch->ncp = TAILQ_FIRST(&dvp->v_namecache); 2197 if (nch->ncp) { 2198 cache_hold(nch); 2199 spin_unlock_shared(&dvp->v_spin); 2200 break; 2201 } 2202 spin_unlock_shared(&dvp->v_spin); 2203 2204 /* 2205 * If dvp is the root of its filesystem it should already 2206 * have a namecache pointer associated with it as a side 2207 * effect of the mount, but it may have been disassociated. 2208 */ 2209 if (dvp->v_flag & VROOT) { 2210 nch->ncp = _cache_get(nch->mount->mnt_ncmountpt.ncp); 2211 error = cache_resolve_mp(nch->mount); 2212 _cache_put(nch->ncp); 2213 if (ncvp_debug) { 2214 kprintf("cache_fromdvp: resolve root of mount %p error %d", 2215 dvp->v_mount, error); 2216 } 2217 if (error) { 2218 if (ncvp_debug) 2219 kprintf(" failed\n"); 2220 nch->ncp = NULL; 2221 break; 2222 } 2223 if (ncvp_debug) 2224 kprintf(" succeeded\n"); 2225 continue; 2226 } 2227 2228 /* 2229 * If we are recursed too deeply resort to an O(n^2) 2230 * algorithm to resolve the namecache topology. The 2231 * resolved pvp is left referenced in saved_dvp to 2232 * prevent the tree from being destroyed while we loop. 2233 */ 2234 if (makeit > 20) { 2235 error = cache_fromdvp_try(dvp, cred, &saved_dvp); 2236 if (error) { 2237 kprintf("lookupdotdot(longpath) failed %d " 2238 "dvp %p\n", error, dvp); 2239 nch->ncp = NULL; 2240 break; 2241 } 2242 continue; 2243 } 2244 2245 /* 2246 * Get the parent directory and resolve its ncp. 2247 */ 2248 if (fakename) { 2249 kfree(fakename, M_TEMP); 2250 fakename = NULL; 2251 } 2252 error = vop_nlookupdotdot(*dvp->v_ops, dvp, &pvp, cred, 2253 &fakename); 2254 if (error) { 2255 kprintf("lookupdotdot failed %d dvp %p\n", error, dvp); 2256 break; 2257 } 2258 vn_unlock(pvp); 2259 2260 /* 2261 * Reuse makeit as a recursion depth counter. On success 2262 * nch will be fully referenced. 
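 * The recursion depth is bounded: once makeit exceeds 20 the check
 * above switches to the cache_fromdvp_try() fallback instead of
 * recursing deeper.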
2263 */ 2264 cache_fromdvp(pvp, cred, makeit + 1, nch); 2265 vrele(pvp); 2266 if (nch->ncp == NULL) 2267 break; 2268 2269 /* 2270 * Do an inefficient scan of pvp (embodied by ncp) to look 2271 * for dvp. This will create a namecache record for dvp on 2272 * success. We loop up to recheck on success. 2273 * 2274 * ncp and dvp are both held but not locked. 2275 */ 2276 error = cache_inefficient_scan(nch, cred, dvp, fakename); 2277 if (error) { 2278 kprintf("cache_fromdvp: scan %p (%s) failed on dvp=%p\n", 2279 pvp, nch->ncp->nc_name, dvp); 2280 cache_drop(nch); 2281 /* nch was NULLed out, reload mount */ 2282 nch->mount = dvp->v_mount; 2283 break; 2284 } 2285 if (ncvp_debug) { 2286 kprintf("cache_fromdvp: scan %p (%s) succeeded\n", 2287 pvp, nch->ncp->nc_name); 2288 } 2289 cache_drop(nch); 2290 /* nch was NULLed out, reload mount */ 2291 nch->mount = dvp->v_mount; 2292 } 2293 2294 /* 2295 * If nch->ncp is non-NULL it will have been held already. 2296 */ 2297 if (fakename) 2298 kfree(fakename, M_TEMP); 2299 if (saved_dvp) 2300 vrele(saved_dvp); 2301 if (nch->ncp) 2302 return (0); 2303 return (EINVAL); 2304 } 2305 2306 /* 2307 * Go up the chain of parent directories until we find something 2308 * we can resolve into the namecache. This is very inefficient. 2309 */ 2310 static 2311 int 2312 cache_fromdvp_try(struct vnode *dvp, struct ucred *cred, 2313 struct vnode **saved_dvp) 2314 { 2315 struct nchandle nch; 2316 struct vnode *pvp; 2317 int error; 2318 static time_t last_fromdvp_report; 2319 char *fakename; 2320 2321 /* 2322 * Loop getting the parent directory vnode until we get something we 2323 * can resolve in the namecache. 2324 */ 2325 vref(dvp); 2326 nch.mount = dvp->v_mount; 2327 nch.ncp = NULL; 2328 fakename = NULL; 2329 2330 for (;;) { 2331 if (fakename) { 2332 kfree(fakename, M_TEMP); 2333 fakename = NULL; 2334 } 2335 error = vop_nlookupdotdot(*dvp->v_ops, dvp, &pvp, cred, 2336 &fakename); 2337 if (error) { 2338 vrele(dvp); 2339 break; 2340 } 2341 vn_unlock(pvp); 2342 spin_lock_shared(&pvp->v_spin); 2343 if ((nch.ncp = TAILQ_FIRST(&pvp->v_namecache)) != NULL) { 2344 _cache_hold(nch.ncp); 2345 spin_unlock_shared(&pvp->v_spin); 2346 vrele(pvp); 2347 break; 2348 } 2349 spin_unlock_shared(&pvp->v_spin); 2350 if (pvp->v_flag & VROOT) { 2351 nch.ncp = _cache_get(pvp->v_mount->mnt_ncmountpt.ncp); 2352 error = cache_resolve_mp(nch.mount); 2353 _cache_unlock(nch.ncp); 2354 vrele(pvp); 2355 if (error) { 2356 _cache_drop(nch.ncp); 2357 nch.ncp = NULL; 2358 vrele(dvp); 2359 } 2360 break; 2361 } 2362 vrele(dvp); 2363 dvp = pvp; 2364 } 2365 if (error == 0) { 2366 if (last_fromdvp_report != time_uptime) { 2367 last_fromdvp_report = time_uptime; 2368 kprintf("Warning: extremely inefficient path " 2369 "resolution on %s\n", 2370 nch.ncp->nc_name); 2371 } 2372 error = cache_inefficient_scan(&nch, cred, dvp, fakename); 2373 2374 /* 2375 * Hopefully dvp now has a namecache record associated with 2376 * it. Leave it referenced to prevent the kernel from 2377 * recycling the vnode. Otherwise extremely long directory 2378 * paths could result in endless recycling. 2379 */ 2380 if (*saved_dvp) 2381 vrele(*saved_dvp); 2382 *saved_dvp = dvp; 2383 _cache_drop(nch.ncp); 2384 } 2385 if (fakename) 2386 kfree(fakename, M_TEMP); 2387 return (error); 2388 } 2389 2390 /* 2391 * Do an inefficient scan of the directory represented by ncp looking for 2392 * the directory vnode dvp. ncp must be held but not locked on entry and 2393 * will be held on return. 
dvp must be refd but not locked on entry and 2394 * will remain refd on return. 2395 * 2396 * Why do this at all? Well, due to its stateless nature the NFS server 2397 * converts file handles directly to vnodes without necessarily going through 2398 * the namecache ops that would otherwise create the namecache topology 2399 * leading to the vnode. We could either (1) Change the namecache algorithms 2400 * to allow disconnected namecache records that are re-merged opportunistically, 2401 * or (2) Make the NFS server backtrack and scan to recover a connected 2402 * namecache topology in order to then be able to issue new API lookups. 2403 * 2404 * It turns out that (1) is a huge mess. It takes a nice clean set of 2405 * namecache algorithms and introduces a lot of complication in every subsystem 2406 * that calls into the namecache to deal with the re-merge case, especially 2407 * since we are using the namecache to placehold negative lookups and the 2408 * vnode might not be immediately assigned. (2) is certainly far less 2409 * efficient than (1), but since we are only talking about directories here 2410 * (which are likely to remain cached), the case does not actually run all 2411 * that often and has the supreme advantage of not polluting the namecache 2412 * algorithms. 2413 * 2414 * If a fakename is supplied just construct a namecache entry using the 2415 * fake name. 2416 */ 2417 static int 2418 cache_inefficient_scan(struct nchandle *nch, struct ucred *cred, 2419 struct vnode *dvp, char *fakename) 2420 { 2421 struct nlcomponent nlc; 2422 struct nchandle rncp; 2423 struct dirent *den; 2424 struct vnode *pvp; 2425 struct vattr vat; 2426 struct iovec iov; 2427 struct uio uio; 2428 int blksize; 2429 int eofflag; 2430 int bytes; 2431 char *rbuf; 2432 int error; 2433 2434 vat.va_blocksize = 0; 2435 if ((error = VOP_GETATTR(dvp, &vat)) != 0) 2436 return (error); 2437 cache_lock(nch); 2438 error = cache_vref(nch, cred, &pvp); 2439 cache_unlock(nch); 2440 if (error) 2441 return (error); 2442 if (ncvp_debug) { 2443 kprintf("inefficient_scan of (%p,%s): directory iosize %ld " 2444 "vattr fileid = %lld\n", 2445 nch->ncp, nch->ncp->nc_name, 2446 vat.va_blocksize, 2447 (long long)vat.va_fileid); 2448 } 2449 2450 /* 2451 * Use the supplied fakename if not NULL. Fake names are typically 2452 * not in the actual filesystem hierarchy. This is used by HAMMER 2453 * to glue @@timestamp recursions together.
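 * Otherwise fall through to the VOP_READDIR() scan below, which matches
 * directory entries against dvp's va_fileid.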
2454 */ 2455 if (fakename) { 2456 nlc.nlc_nameptr = fakename; 2457 nlc.nlc_namelen = strlen(fakename); 2458 rncp = cache_nlookup(nch, &nlc); 2459 goto done; 2460 } 2461 2462 if ((blksize = vat.va_blocksize) == 0) 2463 blksize = DEV_BSIZE; 2464 rbuf = kmalloc(blksize, M_TEMP, M_WAITOK); 2465 rncp.ncp = NULL; 2466 2467 eofflag = 0; 2468 uio.uio_offset = 0; 2469 again: 2470 iov.iov_base = rbuf; 2471 iov.iov_len = blksize; 2472 uio.uio_iov = &iov; 2473 uio.uio_iovcnt = 1; 2474 uio.uio_resid = blksize; 2475 uio.uio_segflg = UIO_SYSSPACE; 2476 uio.uio_rw = UIO_READ; 2477 uio.uio_td = curthread; 2478 2479 if (ncvp_debug >= 2) 2480 kprintf("cache_inefficient_scan: readdir @ %08x\n", (int)uio.uio_offset); 2481 error = VOP_READDIR(pvp, &uio, cred, &eofflag, NULL, NULL); 2482 if (error == 0) { 2483 den = (struct dirent *)rbuf; 2484 bytes = blksize - uio.uio_resid; 2485 2486 while (bytes > 0) { 2487 if (ncvp_debug >= 2) { 2488 kprintf("cache_inefficient_scan: %*.*s\n", 2489 den->d_namlen, den->d_namlen, 2490 den->d_name); 2491 } 2492 if (den->d_type != DT_WHT && 2493 den->d_ino == vat.va_fileid) { 2494 if (ncvp_debug) { 2495 kprintf("cache_inefficient_scan: " 2496 "MATCHED inode %lld path %s/%*.*s\n", 2497 (long long)vat.va_fileid, 2498 nch->ncp->nc_name, 2499 den->d_namlen, den->d_namlen, 2500 den->d_name); 2501 } 2502 nlc.nlc_nameptr = den->d_name; 2503 nlc.nlc_namelen = den->d_namlen; 2504 rncp = cache_nlookup(nch, &nlc); 2505 KKASSERT(rncp.ncp != NULL); 2506 break; 2507 } 2508 bytes -= _DIRENT_DIRSIZ(den); 2509 den = _DIRENT_NEXT(den); 2510 } 2511 if (rncp.ncp == NULL && eofflag == 0 && uio.uio_resid != blksize) 2512 goto again; 2513 } 2514 kfree(rbuf, M_TEMP); 2515 done: 2516 vrele(pvp); 2517 if (rncp.ncp) { 2518 if (rncp.ncp->nc_flag & NCF_UNRESOLVED) { 2519 _cache_setvp(rncp.mount, rncp.ncp, dvp); 2520 if (ncvp_debug >= 2) { 2521 kprintf("cache_inefficient_scan: setvp %s/%s = %p\n", 2522 nch->ncp->nc_name, rncp.ncp->nc_name, dvp); 2523 } 2524 } else { 2525 if (ncvp_debug >= 2) { 2526 kprintf("cache_inefficient_scan: setvp %s/%s already set %p/%p\n", 2527 nch->ncp->nc_name, rncp.ncp->nc_name, dvp, 2528 rncp.ncp->nc_vp); 2529 } 2530 } 2531 if (rncp.ncp->nc_vp == NULL) 2532 error = rncp.ncp->nc_error; 2533 /* 2534 * Release rncp after a successful nlookup. rncp was fully 2535 * referenced. 2536 */ 2537 cache_put(&rncp); 2538 } else { 2539 kprintf("cache_inefficient_scan: dvp %p NOT FOUND in %s\n", 2540 dvp, nch->ncp->nc_name); 2541 error = ENOENT; 2542 } 2543 return (error); 2544 } 2545 2546 /* 2547 * This function must be called with the ncp held and locked and will unlock 2548 * and drop it during zapping. 2549 * 2550 * Zap a namecache entry. The ncp is unconditionally set to an unresolved 2551 * state, which disassociates it from its vnode or pcpu_ncache[n].neg_list 2552 * and removes the related reference. If the ncp can be removed, and the 2553 * parent can be zapped non-blocking, this function loops up. 2554 * 2555 * There will be one ref from the caller (which we now own). The only 2556 * remaining autonomous refs to the ncp will then be due to nc_parent->nc_list, 2557 * so possibly 2 refs left. Taking this into account, if there are no 2558 * additional refs and no children, the ncp will be removed from the topology 2559 * and destroyed. 2560 * 2561 * References and/or children may exist if the ncp is in the middle of the 2562 * topology, preventing the ncp from being destroyed. 2563 * 2564 * If nonblock is non-zero and the parent ncp cannot be locked we give up. 
2565 * 2566 * This function may return a held (but NOT locked) parent node which the 2567 * caller must drop in a loop. Looping is one way to avoid unbounded recursion 2568 * due to deep namecache trees. 2569 * 2570 * WARNING! For MPSAFE operation this routine must acquire up to three 2571 * spin locks to be able to safely test nc_refs. Lock order is 2572 * very important. 2573 * 2574 * hash spinlock if on hash list 2575 * parent spinlock if child of parent 2576 * (the ncp is unresolved so there is no vnode association) 2577 */ 2578 static int 2579 cache_zap(struct namecache *ncp) 2580 { 2581 struct namecache *par; 2582 struct vnode *dropvp; 2583 struct nchash_head *nchpp; 2584 int refcmp; 2585 int nonblock = 1; /* XXX cleanup */ 2586 int res = 0; 2587 2588 again: 2589 /* 2590 * Disassociate the vnode or negative cache ref and set NCF_UNRESOLVED. 2591 * This gets rid of any vp->v_namecache list or negative list and 2592 * the related ref. 2593 */ 2594 _cache_setunresolved(ncp); 2595 2596 /* 2597 * Try to scrap the entry and possibly tail-recurse on its parent. 2598 * We only scrap unref'd (other then our ref) unresolved entries, 2599 * we do not scrap 'live' entries. 2600 * 2601 * If nc_parent is non NULL we expect 2 references, else just 1. 2602 * If there are more, someone else also holds the ncp and we cannot 2603 * destroy it. 2604 */ 2605 KKASSERT(ncp->nc_flag & NCF_UNRESOLVED); 2606 KKASSERT(ncp->nc_refs > 0); 2607 2608 /* 2609 * If the ncp is linked to its parent it will also be in the hash 2610 * table. We have to be able to lock the parent and the hash table. 2611 * 2612 * Acquire locks. Note that the parent can't go away while we hold 2613 * a child locked. If nc_parent is present, expect 2 refs instead 2614 * of 1. 2615 */ 2616 nchpp = NULL; 2617 if ((par = ncp->nc_parent) != NULL) { 2618 if (nonblock) { 2619 if (_cache_lock_nonblock(par)) { 2620 /* lock failed */ 2621 ncp->nc_flag |= NCF_DEFEREDZAP; 2622 atomic_add_long( 2623 &pcpu_ncache[mycpu->gd_cpuid].numdefered, 2624 1); 2625 _cache_unlock(ncp); 2626 _cache_drop(ncp); /* caller's ref */ 2627 return res; 2628 } 2629 _cache_hold(par); 2630 } else { 2631 _cache_hold(par); 2632 _cache_lock(par); 2633 } 2634 nchpp = ncp->nc_head; 2635 spin_lock(&nchpp->spin); 2636 } 2637 2638 /* 2639 * With the parent and nchpp locked, and the vnode removed 2640 * (no vp->v_namecache), we expect 1 or 2 refs. If there are 2641 * more someone else has a ref and we cannot zap the entry. 2642 * 2643 * one for our hold 2644 * one for our parent link (parent also has one from the linkage) 2645 */ 2646 if (par) 2647 refcmp = 2; 2648 else 2649 refcmp = 1; 2650 2651 /* 2652 * On failure undo the work we've done so far and drop the 2653 * caller's ref and ncp. 2654 */ 2655 if (ncp->nc_refs != refcmp || TAILQ_FIRST(&ncp->nc_list)) { 2656 if (par) { 2657 spin_unlock(&nchpp->spin); 2658 _cache_put(par); 2659 } 2660 _cache_unlock(ncp); 2661 _cache_drop(ncp); 2662 return res; 2663 } 2664 2665 /* 2666 * We own all the refs and with the spinlocks held no further 2667 * refs can be acquired by others. 2668 * 2669 * Remove us from the hash list and parent list. We have to 2670 * drop a ref on the parent's vp if the parent's list becomes 2671 * empty. 
2672 */ 2673 dropvp = NULL; 2674 if (par) { 2675 struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid]; 2676 2677 KKASSERT(nchpp == ncp->nc_head); 2678 TAILQ_REMOVE(&ncp->nc_head->list, ncp, nc_hash); 2679 TAILQ_REMOVE(&par->nc_list, ncp, nc_entry); 2680 atomic_add_long(&pn->vfscache_count, -1); 2681 if (TAILQ_EMPTY(&ncp->nc_list)) 2682 atomic_add_long(&pn->vfscache_leafs, -1); 2683 2684 if (TAILQ_EMPTY(&par->nc_list)) { 2685 atomic_add_long(&pn->vfscache_leafs, 1); 2686 if (par->nc_vp) 2687 dropvp = par->nc_vp; 2688 } 2689 ncp->nc_parent = NULL; 2690 ncp->nc_head = NULL; 2691 spin_unlock(&nchpp->spin); 2692 _cache_drop(par); /* removal of ncp from par->nc_list */ 2693 /*_cache_unlock(par);*/ 2694 } else { 2695 KKASSERT(ncp->nc_head == NULL); 2696 } 2697 2698 /* 2699 * ncp should not have picked up any refs. Physically 2700 * destroy the ncp. 2701 */ 2702 if (ncp->nc_refs != refcmp) { 2703 panic("cache_zap: %p bad refs %d (expected %d)\n", 2704 ncp, ncp->nc_refs, refcmp); 2705 } 2706 /* _cache_unlock(ncp) not required */ 2707 ncp->nc_refs = -1; /* safety */ 2708 if (ncp->nc_name) 2709 kfree(ncp->nc_name, M_VFSCACHEAUX); 2710 kfree_obj(ncp, M_VFSCACHE); 2711 res = 1; 2712 2713 /* 2714 * Delayed drop (we had to release our spinlocks) 2715 */ 2716 if (dropvp) 2717 vdrop(dropvp); 2718 2719 /* 2720 * Loop up if we can recursively clean out the parent. 2721 */ 2722 if (par) { 2723 refcmp = 1; /* ref on parent */ 2724 if (par->nc_parent) /* par->par */ 2725 ++refcmp; 2726 par->nc_flag &= ~NCF_DEFEREDZAP; 2727 if ((par->nc_flag & NCF_UNRESOLVED) && 2728 par->nc_refs == refcmp && 2729 TAILQ_EMPTY(&par->nc_list)) 2730 { 2731 ncp = par; 2732 goto again; 2733 } 2734 _cache_unlock(par); 2735 _cache_drop(par); 2736 } 2737 return 1; 2738 } 2739 2740 /* 2741 * Clean up dangling negative cache and defered-drop entries in the 2742 * namecache. 2743 * 2744 * This routine is called in the critical path and also called from 2745 * vnlru(). When called from vnlru we use a lower limit to try to 2746 * deal with the negative cache before the critical path has to start 2747 * dealing with it. 2748 */ 2749 typedef enum { CHI_LOW, CHI_HIGH } cache_hs_t; 2750 2751 static cache_hs_t neg_cache_hysteresis_state[2] = { CHI_LOW, CHI_LOW }; 2752 static cache_hs_t pos_cache_hysteresis_state[2] = { CHI_LOW, CHI_LOW }; 2753 2754 void 2755 cache_hysteresis(int critpath) 2756 { 2757 long poslimit; 2758 long neglimit = maxvnodes / ncnegfactor; 2759 long xnumcache = vfscache_leafs; 2760 2761 if (critpath == 0) 2762 neglimit = neglimit * 8 / 10; 2763 2764 /* 2765 * Don't cache too many negative hits. We use hysteresis to reduce 2766 * the impact on the critical path. 2767 */ 2768 switch(neg_cache_hysteresis_state[critpath]) { 2769 case CHI_LOW: 2770 if (vfscache_negs > MINNEG && vfscache_negs > neglimit) { 2771 if (critpath) 2772 _cache_cleanneg(ncnegflush); 2773 else 2774 _cache_cleanneg(ncnegflush + 2775 vfscache_negs - neglimit); 2776 neg_cache_hysteresis_state[critpath] = CHI_HIGH; 2777 } 2778 break; 2779 case CHI_HIGH: 2780 if (vfscache_negs > MINNEG * 9 / 10 && 2781 vfscache_negs * 9 / 10 > neglimit 2782 ) { 2783 if (critpath) 2784 _cache_cleanneg(ncnegflush); 2785 else 2786 _cache_cleanneg(ncnegflush + 2787 vfscache_negs * 9 / 10 - 2788 neglimit); 2789 } else { 2790 neg_cache_hysteresis_state[critpath] = CHI_LOW; 2791 } 2792 break; 2793 } 2794 2795 /* 2796 * Don't cache too many positive hits. We use hysteresis to reduce 2797 * the impact on the critical path. 
2798 * 2799 * Excessive positive hits can accumulate due to large numbers of 2800 * hardlinks (the vnode cache will not prevent hardlinked ncps from growing 2801 * into infinity). 2802 */ 2803 if ((poslimit = ncposlimit) == 0) 2804 poslimit = maxvnodes * 2; 2805 if (critpath == 0) 2806 poslimit = poslimit * 8 / 10; 2807 2808 switch(pos_cache_hysteresis_state[critpath]) { 2809 case CHI_LOW: 2810 if (xnumcache > poslimit && xnumcache > MINPOS) { 2811 if (critpath) 2812 _cache_cleanpos(ncposflush); 2813 else 2814 _cache_cleanpos(ncposflush + 2815 xnumcache - poslimit); 2816 pos_cache_hysteresis_state[critpath] = CHI_HIGH; 2817 } 2818 break; 2819 case CHI_HIGH: 2820 if (xnumcache > poslimit * 5 / 6 && xnumcache > MINPOS) { 2821 if (critpath) 2822 _cache_cleanpos(ncposflush); 2823 else 2824 _cache_cleanpos(ncposflush + 2825 xnumcache - poslimit * 5 / 6); 2826 } else { 2827 pos_cache_hysteresis_state[critpath] = CHI_LOW; 2828 } 2829 break; 2830 } 2831 2832 /* 2833 * Clean out dangling deferred-zap ncps (which could not be cleanly 2834 * dropped) if too many build up. Note that numdefered is 2835 * heuristic. Make sure we are real-time for the current cpu, 2836 * plus the global rollup. 2837 */ 2838 if (pcpu_ncache[mycpu->gd_cpuid].numdefered + numdefered > neglimit) { 2839 _cache_cleandefered(); 2840 } 2841 } 2842 2843 /* 2844 * NEW NAMECACHE LOOKUP API 2845 * 2846 * Lookup an entry in the namecache. The passed par_nch must be referenced 2847 * and unlocked. A referenced and locked nchandle with a non-NULL nch.ncp 2848 * is ALWAYS returned, even if the supplied component is illegal. 2849 * 2850 * The resulting namecache entry should be returned to the system with 2851 * cache_put() or cache_unlock() + cache_drop(). 2852 * 2853 * namecache locks are recursive but care must be taken to avoid lock order 2854 * reversals (hence why the passed par_nch must be unlocked). Locking 2855 * rules are to order for parent traversals, not for child traversals. 2856 * 2857 * Nobody else will be able to manipulate the associated namespace (e.g. 2858 * create, delete, rename, rename-target) until the caller unlocks the 2859 * entry. 2860 * 2861 * The returned entry will be in one of three states: positive hit (non-null 2862 * vnode), negative hit (null vnode), or unresolved (NCF_UNRESOLVED is set). 2863 * Unresolved entries must be resolved through the filesystem to associate the 2864 * vnode and/or determine whether a positive or negative hit has occurred. 2865 * 2866 * It is not necessary to lock a directory in order to lock namespace under 2867 * that directory. In fact, it is explicitly not allowed to do that. A 2868 * directory is typically only locked when being created, renamed, or 2869 * destroyed. 2870 * 2871 * The directory (par) may be unresolved, in which case any returned child 2872 * will likely also be marked unresolved. Likely but not guaranteed. Since 2873 * the filesystem lookup requires a resolved directory vnode the caller is 2874 * responsible for resolving the namecache chain top-down. This API 2875 * specifically allows whole chains to be created in an unresolved state.
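 *
 * Illustrative usage sketch (an assumption, not taken from this file):
 *
 *	struct nlcomponent nlc;
 *	struct nchandle nch;
 *
 *	nlc.nlc_nameptr = name;
 *	nlc.nlc_namelen = strlen(name);
 *	nch = cache_nlookup(&par_nch, &nlc);
 *	if (nch.ncp->nc_flag & NCF_UNRESOLVED)
 *		... resolve via cache_resolve() (requires an exclusive
 *		    lock on the ncp) before using nch.ncp->nc_vp ...
 *	cache_put(&nch);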
2876 */ 2877 struct nchandle 2878 cache_nlookup(struct nchandle *par_nch, struct nlcomponent *nlc) 2879 { 2880 struct nchandle nch; 2881 struct namecache *ncp; 2882 struct namecache *new_ncp; 2883 struct namecache *rep_ncp; /* reuse a destroyed ncp */ 2884 struct nchash_head *nchpp; 2885 struct mount *mp; 2886 u_int32_t hash; 2887 globaldata_t gd; 2888 int par_locked; 2889 int use_excl; 2890 2891 gd = mycpu; 2892 mp = par_nch->mount; 2893 par_locked = 0; 2894 2895 /* 2896 * This is a good time to call it, no ncp's are locked by 2897 * the caller or us. 2898 */ 2899 cache_hysteresis(1); 2900 2901 /* 2902 * Try to locate an existing entry 2903 */ 2904 hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT); 2905 hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash); 2906 new_ncp = NULL; 2907 use_excl = 0; 2908 nchpp = NCHHASH(hash); 2909 restart: 2910 rep_ncp = NULL; 2911 if (use_excl) 2912 spin_lock(&nchpp->spin); 2913 else 2914 spin_lock_shared(&nchpp->spin); 2915 2916 /* 2917 * Do a reverse scan to collect any DESTROYED ncps prior to matching 2918 * an existing entry. 2919 */ 2920 TAILQ_FOREACH_REVERSE(ncp, &nchpp->list, nchash_list, nc_hash) { 2921 /* 2922 * Break out if we find a matching entry. Note that 2923 * UNRESOLVED entries may match, but DESTROYED entries 2924 * do not. 2925 * 2926 * We may be able to reuse DESTROYED entries that we come 2927 * across, even if the name does not match, as long as 2928 * nc_nlen is correct and the only hold ref is from the nchpp 2929 * list itself. 2930 */ 2931 if (ncp->nc_parent == par_nch->ncp && 2932 ncp->nc_nlen == nlc->nlc_namelen) { 2933 if (ncp->nc_flag & NCF_DESTROYED) { 2934 if (ncp->nc_refs == 1 && rep_ncp == NULL) 2935 rep_ncp = ncp; 2936 continue; 2937 } 2938 if (bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen)) 2939 continue; 2940 2941 /* 2942 * Matched ncp 2943 */ 2944 _cache_hold(ncp); 2945 if (rep_ncp) 2946 _cache_hold(rep_ncp); 2947 2948 if (use_excl) 2949 spin_unlock(&nchpp->spin); 2950 else 2951 spin_unlock_shared(&nchpp->spin); 2952 2953 if (par_locked) { 2954 _cache_unlock(par_nch->ncp); 2955 par_locked = 0; 2956 } 2957 2958 /* 2959 * Really try to destroy rep_ncp if encountered. 2960 * Various edge cases can build up more than one, 2961 * so loop if we succeed. This isn't perfect, but 2962 * we can't afford to have tons of entries build 2963 * up on a single nhcpp list due to rename-over 2964 * operations. If that were to happen, the system 2965 * would bog down quickly. 2966 */ 2967 if (rep_ncp) { 2968 if (_cache_lock_nonblock(rep_ncp) == 0) { 2969 if (rep_ncp->nc_flag & NCF_DESTROYED) { 2970 if (cache_zap(rep_ncp)) { 2971 _cache_drop(ncp); 2972 goto restart; 2973 } 2974 } else { 2975 _cache_unlock(rep_ncp); 2976 _cache_drop(rep_ncp); 2977 } 2978 } else { 2979 _cache_drop(rep_ncp); 2980 } 2981 } 2982 2983 /* 2984 * Continue processing the matched entry 2985 */ 2986 if (_cache_lock_special(ncp) == 0) { 2987 /* 2988 * Successfully locked but we must re-test 2989 * conditions that might have changed since 2990 * we did not have the lock before. 
2991 */ 2992 if (ncp->nc_parent != par_nch->ncp || 2993 ncp->nc_nlen != nlc->nlc_namelen || 2994 bcmp(ncp->nc_name, nlc->nlc_nameptr, 2995 ncp->nc_nlen) || 2996 (ncp->nc_flag & NCF_DESTROYED)) { 2997 _cache_put(ncp); 2998 goto restart; 2999 } 3000 _cache_auto_unresolve(mp, ncp); 3001 if (new_ncp) { 3002 _cache_free(new_ncp); 3003 new_ncp = NULL; /* safety */ 3004 } 3005 goto found; 3006 } 3007 _cache_get(ncp); /* cycle the lock to block */ 3008 _cache_put(ncp); 3009 _cache_drop(ncp); 3010 goto restart; 3011 } 3012 } 3013 3014 /* 3015 * We failed to locate the entry, try to resurrect a destroyed 3016 * entry that we did find that is already correctly linked into 3017 * nchpp and the parent. We must re-test conditions after 3018 * successfully locking rep_ncp. 3019 * 3020 * This case can occur under heavy loads due to not being able 3021 * to safely lock the parent in cache_zap(). Nominally a repeated 3022 * create/unlink load, but only the namelen needs to match. 3023 * 3024 * An exclusive lock on the nchpp is required to process this case, 3025 * otherwise a race can cause duplicate entries to be created with 3026 * one cpu reusing a DESTROYED ncp while another creates a new_ncp. 3027 */ 3028 if (rep_ncp && use_excl) { 3029 if (_cache_lock_nonblock(rep_ncp) == 0) { 3030 _cache_hold(rep_ncp); 3031 if (rep_ncp->nc_parent == par_nch->ncp && 3032 rep_ncp->nc_nlen == nlc->nlc_namelen && 3033 (rep_ncp->nc_flag & NCF_DESTROYED) && 3034 rep_ncp->nc_refs == 2) { 3035 /* 3036 * Update nc_name. 3037 */ 3038 ncp = rep_ncp; 3039 bcopy(nlc->nlc_nameptr, ncp->nc_name, 3040 nlc->nlc_namelen); 3041 3042 /* 3043 * This takes some care. We must clear the 3044 * NCF_DESTROYED flag before unlocking the 3045 * hash chain so other concurrent searches 3046 * do not skip this element. 3047 * 3048 * We must also unlock the hash chain before 3049 * unresolving the ncp to avoid deadlocks. 3050 * We hold the lock on the ncp so we can safely 3051 * reinitialize nc_flag after that. 3052 */ 3053 ncp->nc_flag &= ~NCF_DESTROYED; 3054 spin_unlock(&nchpp->spin); /* use_excl */ 3055 3056 _cache_setunresolved(ncp); 3057 ncp->nc_flag = NCF_UNRESOLVED; 3058 ncp->nc_error = ENOTCONN; 3059 if (par_locked) { 3060 _cache_unlock(par_nch->ncp); 3061 par_locked = 0; 3062 } 3063 if (new_ncp) { 3064 _cache_free(new_ncp); 3065 new_ncp = NULL; /* safety */ 3066 } 3067 goto found; 3068 } 3069 _cache_put(rep_ncp); 3070 } 3071 } 3072 3073 /* 3074 * Otherwise create a new entry and add it to the cache. The parent 3075 * ncp must also be locked so we can link into it. 3076 * 3077 * We have to relookup after possibly blocking in kmalloc or 3078 * when locking par_nch. 3079 * 3080 * NOTE: nlc_namelen can be 0 and nlc_nameptr NULL as a special 3081 * mount case, in which case nc_name will be NULL. 3082 * 3083 * NOTE: In the rep_ncp != NULL case we are trying to reuse 3084 * a DESTROYED entry, but didn't have an exclusive lock. 3085 * In this situation we do not create a new_ncp. 3086 */ 3087 if (new_ncp == NULL) { 3088 if (use_excl) 3089 spin_unlock(&nchpp->spin); 3090 else 3091 spin_unlock_shared(&nchpp->spin); 3092 if (rep_ncp == NULL) { 3093 new_ncp = cache_alloc(nlc->nlc_namelen); 3094 if (nlc->nlc_namelen) { 3095 bcopy(nlc->nlc_nameptr, new_ncp->nc_name, 3096 nlc->nlc_namelen); 3097 new_ncp->nc_name[nlc->nlc_namelen] = 0; 3098 } 3099 } 3100 use_excl = 1; 3101 goto restart; 3102 } 3103 3104 /* 3105 * NOTE! The spinlock is held exclusively here because new_ncp 3106 * is non-NULL. 
3107 */ 3108 if (par_locked == 0) { 3109 spin_unlock(&nchpp->spin); 3110 _cache_lock(par_nch->ncp); 3111 par_locked = 1; 3112 goto restart; 3113 } 3114 3115 /* 3116 * Link to parent (requires another ref, the one already in new_ncp 3117 * is what we wil lreturn). 3118 * 3119 * WARNING! We still hold the spinlock. We have to set the hash 3120 * table entry atomically. 3121 */ 3122 ncp = new_ncp; 3123 ++ncp->nc_refs; 3124 _cache_link_parent(ncp, par_nch->ncp, nchpp); 3125 spin_unlock(&nchpp->spin); 3126 _cache_unlock(par_nch->ncp); 3127 /* par_locked = 0 - not used */ 3128 found: 3129 /* 3130 * stats and namecache size management 3131 */ 3132 if (ncp->nc_flag & NCF_UNRESOLVED) 3133 ++gd->gd_nchstats->ncs_miss; 3134 else if (ncp->nc_vp) 3135 ++gd->gd_nchstats->ncs_goodhits; 3136 else 3137 ++gd->gd_nchstats->ncs_neghits; 3138 nch.mount = mp; 3139 nch.ncp = ncp; 3140 _cache_mntref(nch.mount); 3141 3142 return(nch); 3143 } 3144 3145 /* 3146 * Attempt to lookup a namecache entry and return with a shared namecache 3147 * lock. This operates non-blocking. EWOULDBLOCK is returned if excl is 3148 * set or we are unable to lock. 3149 */ 3150 int 3151 cache_nlookup_maybe_shared(struct nchandle *par_nch, 3152 struct nlcomponent *nlc, 3153 int excl, struct nchandle *res_nch) 3154 { 3155 struct namecache *ncp; 3156 struct nchash_head *nchpp; 3157 struct mount *mp; 3158 u_int32_t hash; 3159 globaldata_t gd; 3160 3161 /* 3162 * If exclusive requested or shared namecache locks are disabled, 3163 * return failure. 3164 */ 3165 if (ncp_shared_lock_disable || excl) 3166 return(EWOULDBLOCK); 3167 3168 gd = mycpu; 3169 mp = par_nch->mount; 3170 3171 /* 3172 * This is a good time to call it, no ncp's are locked by 3173 * the caller or us. 3174 */ 3175 cache_hysteresis(1); 3176 3177 /* 3178 * Try to locate an existing entry 3179 */ 3180 hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT); 3181 hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash); 3182 nchpp = NCHHASH(hash); 3183 3184 spin_lock_shared(&nchpp->spin); 3185 3186 TAILQ_FOREACH(ncp, &nchpp->list, nc_hash) { 3187 /* 3188 * Break out if we find a matching entry. Note that 3189 * UNRESOLVED entries may match, but DESTROYED entries 3190 * do not. 3191 */ 3192 if (ncp->nc_parent == par_nch->ncp && 3193 ncp->nc_nlen == nlc->nlc_namelen && 3194 bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 && 3195 (ncp->nc_flag & NCF_DESTROYED) == 0 3196 ) { 3197 _cache_hold(ncp); 3198 spin_unlock_shared(&nchpp->spin); 3199 3200 if (_cache_lock_shared_special(ncp) == 0) { 3201 if (ncp->nc_parent == par_nch->ncp && 3202 ncp->nc_nlen == nlc->nlc_namelen && 3203 bcmp(ncp->nc_name, nlc->nlc_nameptr, 3204 ncp->nc_nlen) == 0 && 3205 (ncp->nc_flag & NCF_DESTROYED) == 0 && 3206 (ncp->nc_flag & NCF_UNRESOLVED) == 0 && 3207 _cache_auto_unresolve_test(mp, ncp) == 0) { 3208 goto found; 3209 } 3210 _cache_unlock(ncp); 3211 } 3212 _cache_drop(ncp); 3213 return(EWOULDBLOCK); 3214 } 3215 } 3216 3217 /* 3218 * Failure 3219 */ 3220 spin_unlock_shared(&nchpp->spin); 3221 return(EWOULDBLOCK); 3222 3223 /* 3224 * Success 3225 * 3226 * Note that nc_error might be non-zero (e.g ENOENT). 3227 */ 3228 found: 3229 res_nch->mount = mp; 3230 res_nch->ncp = ncp; 3231 ++gd->gd_nchstats->ncs_goodhits; 3232 _cache_mntref(res_nch->mount); 3233 3234 KKASSERT(ncp->nc_error != EWOULDBLOCK); 3235 return(ncp->nc_error); 3236 } 3237 3238 /* 3239 * This is a non-blocking verison of cache_nlookup() used by 3240 * nfs_readdirplusrpc_uio(). 
It can fail for any reason and 3241 * will return nch.ncp == NULL in that case. 3242 */ 3243 struct nchandle 3244 cache_nlookup_nonblock(struct nchandle *par_nch, struct nlcomponent *nlc) 3245 { 3246 struct nchandle nch; 3247 struct namecache *ncp; 3248 struct namecache *new_ncp; 3249 struct nchash_head *nchpp; 3250 struct mount *mp; 3251 u_int32_t hash; 3252 globaldata_t gd; 3253 int par_locked; 3254 3255 gd = mycpu; 3256 mp = par_nch->mount; 3257 par_locked = 0; 3258 3259 /* 3260 * Try to locate an existing entry 3261 */ 3262 hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT); 3263 hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash); 3264 new_ncp = NULL; 3265 nchpp = NCHHASH(hash); 3266 restart: 3267 spin_lock(&nchpp->spin); 3268 TAILQ_FOREACH(ncp, &nchpp->list, nc_hash) { 3269 /* 3270 * Break out if we find a matching entry. Note that 3271 * UNRESOLVED entries may match, but DESTROYED entries 3272 * do not. 3273 */ 3274 if (ncp->nc_parent == par_nch->ncp && 3275 ncp->nc_nlen == nlc->nlc_namelen && 3276 bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 && 3277 (ncp->nc_flag & NCF_DESTROYED) == 0 3278 ) { 3279 _cache_hold(ncp); 3280 spin_unlock(&nchpp->spin); 3281 if (par_locked) { 3282 _cache_unlock(par_nch->ncp); 3283 par_locked = 0; 3284 } 3285 if (_cache_lock_special(ncp) == 0) { 3286 if (ncp->nc_parent != par_nch->ncp || 3287 ncp->nc_nlen != nlc->nlc_namelen || 3288 bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) || 3289 (ncp->nc_flag & NCF_DESTROYED)) { 3290 kprintf("cache_lookup_nonblock: " 3291 "ncp-race %p %*.*s\n", 3292 ncp, 3293 nlc->nlc_namelen, 3294 nlc->nlc_namelen, 3295 nlc->nlc_nameptr); 3296 _cache_unlock(ncp); 3297 _cache_drop(ncp); 3298 goto failed; 3299 } 3300 _cache_auto_unresolve(mp, ncp); 3301 if (new_ncp) { 3302 _cache_free(new_ncp); 3303 new_ncp = NULL; 3304 } 3305 goto found; 3306 } 3307 _cache_drop(ncp); 3308 goto failed; 3309 } 3310 } 3311 3312 /* 3313 * We failed to locate an entry, create a new entry and add it to 3314 * the cache. The parent ncp must also be locked so we 3315 * can link into it. 3316 * 3317 * We have to relookup after possibly blocking in kmalloc or 3318 * when locking par_nch. 3319 * 3320 * NOTE: nlc_namelen can be 0 and nlc_nameptr NULL as a special 3321 * mount case, in which case nc_name will be NULL. 3322 */ 3323 if (new_ncp == NULL) { 3324 spin_unlock(&nchpp->spin); 3325 new_ncp = cache_alloc(nlc->nlc_namelen); 3326 if (nlc->nlc_namelen) { 3327 bcopy(nlc->nlc_nameptr, new_ncp->nc_name, 3328 nlc->nlc_namelen); 3329 new_ncp->nc_name[nlc->nlc_namelen] = 0; 3330 } 3331 goto restart; 3332 } 3333 if (par_locked == 0) { 3334 spin_unlock(&nchpp->spin); 3335 if (_cache_lock_nonblock(par_nch->ncp) == 0) { 3336 par_locked = 1; 3337 goto restart; 3338 } 3339 goto failed; 3340 } 3341 3342 /* 3343 * Link to parent (requires another ref, the one already in new_ncp 3344 * is what we wil lreturn). 3345 * 3346 * WARNING! We still hold the spinlock. We have to set the hash 3347 * table entry atomically. 
3348 */ 3349 ncp = new_ncp; 3350 ++ncp->nc_refs; 3351 _cache_link_parent(ncp, par_nch->ncp, nchpp); 3352 spin_unlock(&nchpp->spin); 3353 _cache_unlock(par_nch->ncp); 3354 /* par_locked = 0 - not used */ 3355 found: 3356 /* 3357 * stats and namecache size management 3358 */ 3359 if (ncp->nc_flag & NCF_UNRESOLVED) 3360 ++gd->gd_nchstats->ncs_miss; 3361 else if (ncp->nc_vp) 3362 ++gd->gd_nchstats->ncs_goodhits; 3363 else 3364 ++gd->gd_nchstats->ncs_neghits; 3365 nch.mount = mp; 3366 nch.ncp = ncp; 3367 _cache_mntref(nch.mount); 3368 3369 return(nch); 3370 failed: 3371 if (new_ncp) { 3372 _cache_free(new_ncp); 3373 new_ncp = NULL; 3374 } 3375 nch.mount = NULL; 3376 nch.ncp = NULL; 3377 return(nch); 3378 } 3379 3380 /* 3381 * This version is non-locking. The caller must validate the result 3382 * for parent-to-child continuity. 3383 * 3384 * It can fail for any reason and will return nch.ncp == NULL in that case. 3385 */ 3386 struct nchandle 3387 cache_nlookup_nonlocked(struct nchandle *par_nch, struct nlcomponent *nlc) 3388 { 3389 struct nchandle nch; 3390 struct namecache *ncp; 3391 struct nchash_head *nchpp; 3392 struct mount *mp; 3393 u_int32_t hash; 3394 globaldata_t gd; 3395 3396 gd = mycpu; 3397 mp = par_nch->mount; 3398 3399 /* 3400 * Try to locate an existing entry 3401 */ 3402 hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT); 3403 hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash); 3404 nchpp = NCHHASH(hash); 3405 3406 spin_lock_shared(&nchpp->spin); 3407 TAILQ_FOREACH(ncp, &nchpp->list, nc_hash) { 3408 /* 3409 * Break out if we find a matching entry. Note that 3410 * UNRESOLVED entries may match, but DESTROYED entries 3411 * do not. 3412 * 3413 * Resolved NFS entries which have timed out fail so the 3414 * caller can rerun with normal locking. 3415 */ 3416 if (ncp->nc_parent == par_nch->ncp && 3417 ncp->nc_nlen == nlc->nlc_namelen && 3418 bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 && 3419 (ncp->nc_flag & NCF_DESTROYED) == 0 3420 ) { 3421 if (_cache_auto_unresolve_test(par_nch->mount, ncp)) 3422 break; 3423 _cache_hold(ncp); 3424 spin_unlock_shared(&nchpp->spin); 3425 goto found; 3426 } 3427 } 3428 spin_unlock_shared(&nchpp->spin); 3429 nch.mount = NULL; 3430 nch.ncp = NULL; 3431 return nch; 3432 found: 3433 /* 3434 * stats and namecache size management 3435 */ 3436 if (ncp->nc_flag & NCF_UNRESOLVED) 3437 ++gd->gd_nchstats->ncs_miss; 3438 else if (ncp->nc_vp) 3439 ++gd->gd_nchstats->ncs_goodhits; 3440 else 3441 ++gd->gd_nchstats->ncs_neghits; 3442 nch.mount = mp; 3443 nch.ncp = ncp; 3444 _cache_mntref(nch.mount); 3445 3446 return(nch); 3447 } 3448 3449 /* 3450 * The namecache entry is marked as being used as a mount point. 3451 * Locate the mount if it is visible to the caller. The DragonFly 3452 * mount system allows arbitrary loops in the topology and disentangles 3453 * those loops by matching against (mp, ncp) rather than just (ncp). 3454 * This means any given ncp can dive any number of mounts, depending 3455 * on the relative mount (e.g. nullfs) the caller is at in the topology. 3456 * 3457 * We use a very simple frontend cache to reduce SMP conflicts, 3458 * which we have to do because the mountlist scan needs an exclusive 3459 * lock around its ripout info list. Not to mention that there might 3460 * be a lot of mounts. 3461 * 3462 * Because all mounts can potentially be accessed by all cpus, break the cpu's 3463 * down a bit to allow some contention rather than making the cache 3464 * excessively huge. 
3465 * 3466 * The hash table is split into per-cpu areas, is 4-way set-associative. 3467 */ 3468 struct findmount_info { 3469 struct mount *result; 3470 struct mount *nch_mount; 3471 struct namecache *nch_ncp; 3472 }; 3473 3474 static __inline 3475 struct ncmount_cache * 3476 ncmount_cache_lookup4(struct mount *mp, struct namecache *ncp) 3477 { 3478 uint32_t hash; 3479 3480 hash = iscsi_crc32(&mp, sizeof(mp)); 3481 hash = iscsi_crc32_ext(&ncp, sizeof(ncp), hash); 3482 hash ^= hash >> 16; 3483 hash = hash & ((NCMOUNT_NUMCACHE - 1) & ~(NCMOUNT_SET - 1)); 3484 3485 return (&ncmount_cache[hash]); 3486 } 3487 3488 static 3489 struct ncmount_cache * 3490 ncmount_cache_lookup(struct mount *mp, struct namecache *ncp) 3491 { 3492 struct ncmount_cache *ncc; 3493 struct ncmount_cache *best; 3494 int delta; 3495 int best_delta; 3496 int i; 3497 3498 ncc = ncmount_cache_lookup4(mp, ncp); 3499 3500 /* 3501 * NOTE: When checking for a ticks overflow implement a slop of 3502 * 2 ticks just to be safe, because ticks is accessed 3503 * non-atomically one CPU can increment it while another 3504 * is still using the old value. 3505 */ 3506 if (ncc->ncp == ncp && ncc->mp == mp) /* 0 */ 3507 return ncc; 3508 delta = (int)(ticks - ncc->ticks); /* beware GCC opts */ 3509 if (delta < -2) /* overflow reset */ 3510 ncc->ticks = ticks; 3511 best = ncc; 3512 best_delta = delta; 3513 3514 for (i = 1; i < NCMOUNT_SET; ++i) { /* 1, 2, 3 */ 3515 ++ncc; 3516 if (ncc->ncp == ncp && ncc->mp == mp) 3517 return ncc; 3518 delta = (int)(ticks - ncc->ticks); 3519 if (delta < -2) 3520 ncc->ticks = ticks; 3521 if (delta > best_delta) { 3522 best_delta = delta; 3523 best = ncc; 3524 } 3525 } 3526 return best; 3527 } 3528 3529 /* 3530 * pcpu-optimized mount search. Locate the recursive mountpoint, avoid 3531 * doing an expensive mountlist_scan*() if possible. 3532 * 3533 * (mp, ncp) -> mountonpt.k 3534 * 3535 * Returns a referenced mount pointer or NULL 3536 * 3537 * General SMP operation uses a per-cpu umount_spin to interlock unmount 3538 * operations (that is, where the mp_target can be freed out from under us). 3539 * 3540 * Lookups use the ncc->updating counter to validate the contents in order 3541 * to avoid having to obtain the per cache-element spin-lock. In addition, 3542 * the ticks field is only updated when it changes. However, if our per-cpu 3543 * lock fails due to an unmount-in-progress, we fall-back to the 3544 * cache-element's spin-lock. 3545 */ 3546 struct mount * 3547 cache_findmount(struct nchandle *nch) 3548 { 3549 struct findmount_info info; 3550 struct ncmount_cache *ncc; 3551 struct ncmount_cache ncc_copy; 3552 struct mount *target; 3553 struct pcpu_ncache *pcpu; 3554 struct spinlock *spinlk; 3555 int update; 3556 3557 pcpu = pcpu_ncache; 3558 if (ncmount_cache_enable == 0 || pcpu == NULL) { 3559 ncc = NULL; 3560 goto skip; 3561 } 3562 pcpu += mycpu->gd_cpuid; 3563 3564 again: 3565 ncc = ncmount_cache_lookup(nch->mount, nch->ncp); 3566 if (ncc->ncp == nch->ncp && ncc->mp == nch->mount) { 3567 found: 3568 /* 3569 * This is a bit messy for now because we do not yet have 3570 * safe disposal of mount structures. We have to ref 3571 * ncc->mp_target but the 'update' counter only tell us 3572 * whether the cache has changed after the fact. 3573 * 3574 * For now get a per-cpu spinlock that will only contend 3575 * against umount's. This is the best path. If it fails, 3576 * instead of waiting on the umount we fall-back to a 3577 * shared ncc->spin lock, which will generally only cost a 3578 * cache ping-pong. 
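 *
 * The 'updating' counter behaves like a sequence lock: it is bumped to
 * an odd value before an element is modified and bumped again when the
 * update completes, so observing the same even value around our copy
 * means the copy is consistent.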
3579 */ 3580 update = ncc->updating; 3581 if (__predict_true(spin_trylock(&pcpu->umount_spin))) { 3582 spinlk = &pcpu->umount_spin; 3583 } else { 3584 spinlk = &ncc->spin; 3585 spin_lock_shared(spinlk); 3586 } 3587 if (update & 1) { /* update in progress */ 3588 spin_unlock_any(spinlk); 3589 goto skip; 3590 } 3591 ncc_copy = *ncc; 3592 cpu_lfence(); 3593 if (ncc->updating != update) { /* content changed */ 3594 spin_unlock_any(spinlk); 3595 goto again; 3596 } 3597 if (ncc_copy.ncp != nch->ncp || ncc_copy.mp != nch->mount) { 3598 spin_unlock_any(spinlk); 3599 goto again; 3600 } 3601 if (ncc_copy.isneg == 0) { 3602 target = ncc_copy.mp_target; 3603 if (target->mnt_ncmounton.mount == nch->mount && 3604 target->mnt_ncmounton.ncp == nch->ncp) { 3605 /* 3606 * Cache hit (positive) (avoid dirtying 3607 * the cache line if possible) 3608 */ 3609 if (ncc->ticks != (int)ticks) 3610 ncc->ticks = (int)ticks; 3611 _cache_mntref(target); 3612 } 3613 } else { 3614 /* 3615 * Cache hit (negative) (avoid dirtying 3616 * the cache line if possible) 3617 */ 3618 if (ncc->ticks != (int)ticks) 3619 ncc->ticks = (int)ticks; 3620 target = NULL; 3621 } 3622 spin_unlock_any(spinlk); 3623 3624 return target; 3625 } 3626 skip: 3627 3628 /* 3629 * Slow 3630 */ 3631 info.result = NULL; 3632 info.nch_mount = nch->mount; 3633 info.nch_ncp = nch->ncp; 3634 mountlist_scan(cache_findmount_callback, &info, 3635 MNTSCAN_FORWARD | MNTSCAN_NOBUSY | MNTSCAN_NOUNLOCK); 3636 3637 /* 3638 * To reduce multi-re-entry on the cache, relookup in the cache. 3639 * This can still race, obviously, but that's ok. 3640 */ 3641 ncc = ncmount_cache_lookup(nch->mount, nch->ncp); 3642 if (ncc->ncp == nch->ncp && ncc->mp == nch->mount) { 3643 if (info.result) 3644 atomic_add_int(&info.result->mnt_refs, -1); 3645 goto found; 3646 } 3647 3648 /* 3649 * Cache the result. 3650 */ 3651 if ((info.result == NULL || 3652 (info.result->mnt_kern_flag & MNTK_UNMOUNT) == 0)) { 3653 spin_lock(&ncc->spin); 3654 atomic_add_int_nonlocked(&ncc->updating, 1); 3655 cpu_sfence(); 3656 KKASSERT(ncc->updating & 1); 3657 if (ncc->mp != nch->mount) { 3658 if (ncc->mp) 3659 atomic_add_int(&ncc->mp->mnt_refs, -1); 3660 atomic_add_int(&nch->mount->mnt_refs, 1); 3661 ncc->mp = nch->mount; 3662 } 3663 ncc->ncp = nch->ncp; /* ptr compares only, not refd*/ 3664 ncc->ticks = (int)ticks; 3665 3666 if (info.result) { 3667 ncc->isneg = 0; 3668 if (ncc->mp_target != info.result) { 3669 if (ncc->mp_target) 3670 atomic_add_int(&ncc->mp_target->mnt_refs, -1); 3671 ncc->mp_target = info.result; 3672 atomic_add_int(&info.result->mnt_refs, 1); 3673 } 3674 } else { 3675 ncc->isneg = 1; 3676 if (ncc->mp_target) { 3677 atomic_add_int(&ncc->mp_target->mnt_refs, -1); 3678 ncc->mp_target = NULL; 3679 } 3680 } 3681 cpu_sfence(); 3682 atomic_add_int_nonlocked(&ncc->updating, 1); 3683 spin_unlock(&ncc->spin); 3684 } 3685 return(info.result); 3686 } 3687 3688 static 3689 int 3690 cache_findmount_callback(struct mount *mp, void *data) 3691 { 3692 struct findmount_info *info = data; 3693 3694 /* 3695 * Check the mount's mounted-on point against the passed nch. 3696 */ 3697 if (mp->mnt_ncmounton.mount == info->nch_mount && 3698 mp->mnt_ncmounton.ncp == info->nch_ncp 3699 ) { 3700 info->result = mp; 3701 _cache_mntref(mp); 3702 return(-1); 3703 } 3704 return(0); 3705 } 3706 3707 void 3708 cache_dropmount(struct mount *mp) 3709 { 3710 _cache_mntrel(mp); 3711 } 3712 3713 /* 3714 * mp is being mounted, scrap entries matching mp->mnt_ncmounton (positive 3715 * or negative). 
3716 * 3717 * A full scan is not required, but for now just do it anyway. 3718 */ 3719 void 3720 cache_ismounting(struct mount *mp) 3721 { 3722 struct ncmount_cache *ncc; 3723 struct mount *ncc_mp; 3724 int i; 3725 3726 if (pcpu_ncache == NULL) 3727 return; 3728 3729 for (i = 0; i < NCMOUNT_NUMCACHE; ++i) { 3730 ncc = &ncmount_cache[i]; 3731 if (ncc->mp != mp->mnt_ncmounton.mount || 3732 ncc->ncp != mp->mnt_ncmounton.ncp) { 3733 continue; 3734 } 3735 spin_lock(&ncc->spin); 3736 atomic_add_int_nonlocked(&ncc->updating, 1); 3737 cpu_sfence(); 3738 KKASSERT(ncc->updating & 1); 3739 if (ncc->mp != mp->mnt_ncmounton.mount || 3740 ncc->ncp != mp->mnt_ncmounton.ncp) { 3741 cpu_sfence(); 3742 ++ncc->updating; 3743 spin_unlock(&ncc->spin); 3744 continue; 3745 } 3746 ncc_mp = ncc->mp; 3747 ncc->ncp = NULL; 3748 ncc->mp = NULL; 3749 if (ncc_mp) 3750 atomic_add_int(&ncc_mp->mnt_refs, -1); 3751 ncc_mp = ncc->mp_target; 3752 ncc->mp_target = NULL; 3753 if (ncc_mp) 3754 atomic_add_int(&ncc_mp->mnt_refs, -1); 3755 ncc->ticks = (int)ticks - hz * 120; 3756 3757 cpu_sfence(); 3758 atomic_add_int_nonlocked(&ncc->updating, 1); 3759 spin_unlock(&ncc->spin); 3760 } 3761 3762 /* 3763 * Pre-cache the mount point 3764 */ 3765 ncc = ncmount_cache_lookup(mp->mnt_ncmounton.mount, 3766 mp->mnt_ncmounton.ncp); 3767 3768 spin_lock(&ncc->spin); 3769 atomic_add_int_nonlocked(&ncc->updating, 1); 3770 cpu_sfence(); 3771 KKASSERT(ncc->updating & 1); 3772 3773 if (ncc->mp) 3774 atomic_add_int(&ncc->mp->mnt_refs, -1); 3775 atomic_add_int(&mp->mnt_ncmounton.mount->mnt_refs, 1); 3776 ncc->mp = mp->mnt_ncmounton.mount; 3777 ncc->ncp = mp->mnt_ncmounton.ncp; /* ptr compares only */ 3778 ncc->ticks = (int)ticks; 3779 3780 ncc->isneg = 0; 3781 if (ncc->mp_target != mp) { 3782 if (ncc->mp_target) 3783 atomic_add_int(&ncc->mp_target->mnt_refs, -1); 3784 ncc->mp_target = mp; 3785 atomic_add_int(&mp->mnt_refs, 1); 3786 } 3787 cpu_sfence(); 3788 atomic_add_int_nonlocked(&ncc->updating, 1); 3789 spin_unlock(&ncc->spin); 3790 } 3791 3792 /* 3793 * Scrap any ncmount_cache entries related to mp. Not only do we need to 3794 * scrap entries matching mp->mnt_ncmounton, but we also need to scrap any 3795 * negative hits involving (mp, <any>). 3796 * 3797 * A full scan is required. 
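 *
 * All per-cpu umount_spin locks are held across the scan, which also
 * interlocks against the cache_findmount() fast path so it cannot hand
 * out a reference to the dying mount while entries are being scrapped.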
3798 */ 3799 void 3800 cache_unmounting(struct mount *mp) 3801 { 3802 struct ncmount_cache *ncc; 3803 struct pcpu_ncache *pcpu; 3804 struct mount *ncc_mp; 3805 int i; 3806 3807 pcpu = pcpu_ncache; 3808 if (pcpu == NULL) 3809 return; 3810 3811 for (i = 0; i < ncpus; ++i) 3812 spin_lock(&pcpu[i].umount_spin); 3813 3814 for (i = 0; i < NCMOUNT_NUMCACHE; ++i) { 3815 ncc = &ncmount_cache[i]; 3816 if (ncc->mp != mp && ncc->mp_target != mp) 3817 continue; 3818 spin_lock(&ncc->spin); 3819 atomic_add_int_nonlocked(&ncc->updating, 1); 3820 cpu_sfence(); 3821 3822 if (ncc->mp != mp && ncc->mp_target != mp) { 3823 atomic_add_int_nonlocked(&ncc->updating, 1); 3824 cpu_sfence(); 3825 spin_unlock(&ncc->spin); 3826 continue; 3827 } 3828 ncc_mp = ncc->mp; 3829 ncc->ncp = NULL; 3830 ncc->mp = NULL; 3831 if (ncc_mp) 3832 atomic_add_int(&ncc_mp->mnt_refs, -1); 3833 ncc_mp = ncc->mp_target; 3834 ncc->mp_target = NULL; 3835 if (ncc_mp) 3836 atomic_add_int(&ncc_mp->mnt_refs, -1); 3837 ncc->ticks = (int)ticks - hz * 120; 3838 3839 cpu_sfence(); 3840 atomic_add_int_nonlocked(&ncc->updating, 1); 3841 spin_unlock(&ncc->spin); 3842 } 3843 3844 for (i = 0; i < ncpus; ++i) 3845 spin_unlock(&pcpu[i].umount_spin); 3846 } 3847 3848 /* 3849 * Resolve an unresolved namecache entry, generally by looking it up. 3850 * The passed ncp must be locked and refd. 3851 * 3852 * Theoretically since a vnode cannot be recycled while held, and since 3853 * the nc_parent chain holds its vnode as long as children exist, the 3854 * direct parent of the cache entry we are trying to resolve should 3855 * have a valid vnode. If not then generate an error that we can 3856 * determine is related to a resolver bug. 3857 * 3858 * However, if a vnode was in the middle of a recyclement when the NCP 3859 * got locked, ncp->nc_vp might point to a vnode that is about to become 3860 * invalid. cache_resolve() handles this case by unresolving the entry 3861 * and then re-resolving it. 3862 * 3863 * Note that successful resolution does not necessarily return an error 3864 * code of 0. If the ncp resolves to a negative cache hit then ENOENT 3865 * will be returned. 3866 */ 3867 int 3868 cache_resolve(struct nchandle *nch, struct ucred *cred) 3869 { 3870 struct namecache *par_tmp; 3871 struct namecache *par; 3872 struct namecache *ncp; 3873 struct nchandle nctmp; 3874 struct mount *mp; 3875 struct vnode *dvp; 3876 int error; 3877 3878 ncp = nch->ncp; 3879 mp = nch->mount; 3880 KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE); 3881 restart: 3882 /* 3883 * If the ncp is already resolved we have nothing to do. However, 3884 * we do want to guarentee that a usable vnode is returned when 3885 * a vnode is present, so make sure it hasn't been reclaimed. 3886 */ 3887 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 3888 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) 3889 _cache_setunresolved(ncp); 3890 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) 3891 return (ncp->nc_error); 3892 } 3893 3894 /* 3895 * If the ncp was destroyed it will never resolve again. This 3896 * can basically only happen when someone is chdir'd into an 3897 * empty directory which is then rmdir'd. We want to catch this 3898 * here and not dive the VFS because the VFS might actually 3899 * have a way to re-resolve the disconnected ncp, which will 3900 * result in inconsistencies in the cdir/nch for proc->p_fd. 
3901 	 */
3902 	if (ncp->nc_flag & NCF_DESTROYED)
3903 		return(EINVAL);
3904 
3905 	/*
3906 	 * Mount points need special handling because the parent does not
3907 	 * belong to the same filesystem as the ncp.
3908 	 */
3909 	if (ncp == mp->mnt_ncmountpt.ncp)
3910 		return (cache_resolve_mp(mp));
3911 
3912 	/*
3913 	 * We expect an unbroken chain of ncps to at least the mount point,
3914 	 * and even all the way to root (but this code doesn't have to go
3915 	 * past the mount point).
3916 	 */
3917 	if (ncp->nc_parent == NULL) {
3918 		kprintf("EXDEV case 1 %p %*.*s\n", ncp,
3919 			ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name);
3920 		ncp->nc_error = EXDEV;
3921 		return(ncp->nc_error);
3922 	}
3923 
3924 	/*
3925 	 * The vp's of the parent directories in the chain are held via vhold()
3926 	 * due to the existence of the child, and should not disappear.
3927 	 * However, there are cases where they can disappear:
3928 	 *
3929 	 *	- due to filesystem I/O errors.
3930 	 *	- due to NFS being stupid about tracking the namespace and
3931 	 *	  destroying the namespace for entire directories quite often.
3932 	 *	- due to forced unmounts.
3933 	 *	- due to an rmdir (parent will be marked DESTROYED)
3934 	 *
3935 	 * When this occurs we have to track the chain backwards and resolve
3936 	 * it, looping until the resolver catches up to the current node.  We
3937 	 * could recurse here but we might run ourselves out of kernel stack
3938 	 * so we do it in a more painful manner.  This situation really should
3939 	 * not occur all that often, and when it does it usually does not
3940 	 * have to go back very many nodes to resolve the ncp.
3941 	 */
3942 	while ((dvp = cache_dvpref(ncp)) == NULL) {
3943 		/*
3944 		 * This case can occur if a process is CD'd into a
3945 		 * directory which is then rmdir'd.  If the parent is marked
3946 		 * destroyed there is no point trying to resolve it.
3947 		 */
3948 		if (ncp->nc_parent->nc_flag & NCF_DESTROYED)
3949 			return(ENOENT);
3950 		par = ncp->nc_parent;
3951 		_cache_hold(par);
3952 		_cache_lock(par);
3953 		while ((par_tmp = par->nc_parent) != NULL &&
3954 		       par_tmp->nc_vp == NULL) {
3955 			_cache_hold(par_tmp);
3956 			_cache_lock(par_tmp);
3957 			_cache_put(par);
3958 			par = par_tmp;
3959 		}
3960 		if (par->nc_parent == NULL) {
3961 			kprintf("EXDEV case 2 %*.*s\n",
3962 				par->nc_nlen, par->nc_nlen, par->nc_name);
3963 			_cache_put(par);
3964 			return (EXDEV);
3965 		}
3966 		/*
3967 		 * The parent is not set in stone, ref and lock it to prevent
3968 		 * it from disappearing.  Also note that due to renames it
3969 		 * is possible for our ncp to move and for par to no longer
3970 		 * be one of its parents.  We resolve it anyway, the loop
3971 		 * will handle any moves.
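 *
 * (Illustration: a concurrent rename can reparent our ncp, e.g. moving
 * "a/b" to "c/b" while we are resolving "a".  Resolving the old parent
 * is still harmless; the outer while loop simply re-evaluates
 * cache_dvpref(ncp) against whatever the topology looks like now.)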
3972 */ 3973 _cache_get(par); /* additional hold/lock */ 3974 _cache_put(par); /* from earlier hold/lock */ 3975 if (par == nch->mount->mnt_ncmountpt.ncp) { 3976 cache_resolve_mp(nch->mount); 3977 } else if ((dvp = cache_dvpref(par)) == NULL) { 3978 kprintf("[diagnostic] cache_resolve: raced on %*.*s\n", 3979 par->nc_nlen, par->nc_nlen, par->nc_name); 3980 _cache_put(par); 3981 continue; 3982 } else { 3983 if (par->nc_flag & NCF_UNRESOLVED) { 3984 nctmp.mount = mp; 3985 nctmp.ncp = par; 3986 par->nc_error = VOP_NRESOLVE(&nctmp, dvp, cred); 3987 } 3988 vrele(dvp); 3989 } 3990 if ((error = par->nc_error) != 0) { 3991 if (par->nc_error != EAGAIN) { 3992 kprintf("EXDEV case 3 %*.*s error %d\n", 3993 par->nc_nlen, par->nc_nlen, par->nc_name, 3994 par->nc_error); 3995 _cache_put(par); 3996 return(error); 3997 } 3998 kprintf("[diagnostic] cache_resolve: EAGAIN par %p %*.*s\n", 3999 par, par->nc_nlen, par->nc_nlen, par->nc_name); 4000 } 4001 _cache_put(par); 4002 /* loop */ 4003 } 4004 4005 /* 4006 * Call VOP_NRESOLVE() to get the vp, then scan for any disconnected 4007 * ncp's and reattach them. If this occurs the original ncp is marked 4008 * EAGAIN to force a relookup. 4009 * 4010 * NOTE: in order to call VOP_NRESOLVE(), the parent of the passed 4011 * ncp must already be resolved. 4012 */ 4013 if (dvp) { 4014 nctmp.mount = mp; 4015 nctmp.ncp = ncp; 4016 ncp->nc_error = VOP_NRESOLVE(&nctmp, dvp, cred); 4017 vrele(dvp); 4018 } else { 4019 ncp->nc_error = EPERM; 4020 } 4021 if (ncp->nc_error == EAGAIN) { 4022 kprintf("[diagnostic] cache_resolve: EAGAIN ncp %p %*.*s\n", 4023 ncp, ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name); 4024 goto restart; 4025 } 4026 return(ncp->nc_error); 4027 } 4028 4029 /* 4030 * Resolve the ncp associated with a mount point. Such ncp's almost always 4031 * remain resolved and this routine is rarely called. NFS MPs tends to force 4032 * re-resolution more often due to its mac-truck-smash-the-namecache 4033 * method of tracking namespace changes. 4034 * 4035 * The semantics for this call is that the passed ncp must be locked on 4036 * entry and will be locked on return. However, if we actually have to 4037 * resolve the mount point we temporarily unlock the entry in order to 4038 * avoid race-to-root deadlocks due to e.g. dead NFS mounts. Because of 4039 * the unlock we have to recheck the flags after we relock. 4040 */ 4041 static int 4042 cache_resolve_mp(struct mount *mp) 4043 { 4044 struct namecache *ncp = mp->mnt_ncmountpt.ncp; 4045 struct vnode *vp; 4046 int error; 4047 4048 KKASSERT(mp != NULL); 4049 4050 /* 4051 * If the ncp is already resolved we have nothing to do. However, 4052 * we do want to guarentee that a usable vnode is returned when 4053 * a vnode is present, so make sure it hasn't been reclaimed. 4054 */ 4055 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 4056 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) 4057 _cache_setunresolved(ncp); 4058 } 4059 4060 if (ncp->nc_flag & NCF_UNRESOLVED) { 4061 _cache_unlock(ncp); 4062 while (vfs_busy(mp, 0)) 4063 ; 4064 error = VFS_ROOT(mp, &vp); 4065 _cache_lock(ncp); 4066 4067 /* 4068 * recheck the ncp state after relocking. 
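 * Another thread may have resolved (or re-unresolved) the ncp while we
 * held it unlocked across vfs_busy()/VFS_ROOT(), which is why the
 * NCF_UNRESOLVED test is repeated below.  VFS_ROOT() returns a locked,
 * referenced vnode, so vp is released with vput() whether we consume
 * it via _cache_setvp() or not.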
4069 */ 4070 if (ncp->nc_flag & NCF_UNRESOLVED) { 4071 ncp->nc_error = error; 4072 if (error == 0) { 4073 _cache_setvp(mp, ncp, vp); 4074 vput(vp); 4075 } else { 4076 kprintf("[diagnostic] cache_resolve_mp: failed" 4077 " to resolve mount %p err=%d ncp=%p\n", 4078 mp, error, ncp); 4079 _cache_setvp(mp, ncp, NULL); 4080 } 4081 } else if (error == 0) { 4082 vput(vp); 4083 } 4084 vfs_unbusy(mp); 4085 } 4086 return(ncp->nc_error); 4087 } 4088 4089 /* 4090 * Resolve the parent vnode 4091 */ 4092 int 4093 cache_resolve_dvp(struct nchandle *nch, struct ucred *cred, struct vnode **dvpp) 4094 { 4095 struct namecache *par_tmp; 4096 struct namecache *par; 4097 struct namecache *ncp; 4098 struct nchandle nctmp; 4099 struct mount *mp; 4100 struct vnode *dvp; 4101 int error; 4102 4103 *dvpp = NULL; 4104 ncp = nch->ncp; 4105 mp = nch->mount; 4106 KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE); 4107 4108 /* 4109 * Treat this as a mount point even if it has a parent (e.g. 4110 * null-mount). Return a NULL dvp and no error. 4111 */ 4112 if (ncp == mp->mnt_ncmountpt.ncp) 4113 return 0; 4114 4115 /* 4116 * If the ncp was destroyed there is no parent directory, return 4117 * EINVAL. 4118 */ 4119 if (ncp->nc_flag & NCF_DESTROYED) 4120 return(EINVAL); 4121 4122 /* 4123 * No parent if at the root of a filesystem, no error. Typically 4124 * not applicable to null-mounts. This case should have been caught 4125 * in the above ncmountpt check. 4126 */ 4127 if (ncp->nc_parent == NULL) 4128 return 0; 4129 4130 /* 4131 * Resolve the parent dvp. 4132 * 4133 * The vp's of the parent directories in the chain are held via vhold() 4134 * due to the existance of the child, and should not disappear. 4135 * However, there are cases where they can disappear: 4136 * 4137 * - due to filesystem I/O errors. 4138 * - due to NFS being stupid about tracking the namespace and 4139 * destroys the namespace for entire directories quite often. 4140 * - due to forced unmounts. 4141 * - due to an rmdir (parent will be marked DESTROYED) 4142 * 4143 * When this occurs we have to track the chain backwards and resolve 4144 * it, looping until the resolver catches up to the current node. We 4145 * could recurse here but we might run ourselves out of kernel stack 4146 * so we do it in a more painful manner. This situation really should 4147 * not occur all that often, or if it does not have to go back too 4148 * many nodes to resolve the ncp. 4149 */ 4150 while ((dvp = cache_dvpref(ncp)) == NULL) { 4151 /* 4152 * This case can occur if a process is CD'd into a 4153 * directory which is then rmdir'd. If the parent is marked 4154 * destroyed there is no point trying to resolve it. 4155 */ 4156 if (ncp->nc_parent->nc_flag & NCF_DESTROYED) 4157 return(ENOENT); 4158 par = ncp->nc_parent; 4159 _cache_hold(par); 4160 _cache_lock(par); 4161 while ((par_tmp = par->nc_parent) != NULL && 4162 par_tmp->nc_vp == NULL) { 4163 _cache_hold(par_tmp); 4164 _cache_lock(par_tmp); 4165 _cache_put(par); 4166 par = par_tmp; 4167 } 4168 if (par->nc_parent == NULL) { 4169 kprintf("EXDEV case 2 %*.*s\n", 4170 par->nc_nlen, par->nc_nlen, par->nc_name); 4171 _cache_put(par); 4172 return (EXDEV); 4173 } 4174 4175 /* 4176 * The parent is not set in stone, ref and lock it to prevent 4177 * it from disappearing. Also note that due to renames it 4178 * is possible for our ncp to move and for par to no longer 4179 * be one of its parents. We resolve it anyway, the loop 4180 * will handle any moves. 
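 *
 * (This recovery loop is the same as the one in cache_resolve() above.
 * On success the caller of cache_resolve_dvp() receives a referenced
 * dvp in *dvpp and is responsible for vrele()ing it; a NULL *dvpp with
 * a 0 return means "mount point / no parent", not an error.)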
4181 */ 4182 _cache_get(par); /* additional hold/lock */ 4183 _cache_put(par); /* from earlier hold/lock */ 4184 if (par == nch->mount->mnt_ncmountpt.ncp) { 4185 cache_resolve_mp(nch->mount); 4186 } else if ((dvp = cache_dvpref(par)) == NULL) { 4187 kprintf("[diagnostic] cache_resolve: raced on %*.*s\n", 4188 par->nc_nlen, par->nc_nlen, par->nc_name); 4189 _cache_put(par); 4190 continue; 4191 } else { 4192 if (par->nc_flag & NCF_UNRESOLVED) { 4193 nctmp.mount = mp; 4194 nctmp.ncp = par; 4195 par->nc_error = VOP_NRESOLVE(&nctmp, dvp, cred); 4196 } 4197 vrele(dvp); 4198 } 4199 if ((error = par->nc_error) != 0) { 4200 if (par->nc_error != EAGAIN) { 4201 kprintf("EXDEV case 3 %*.*s error %d\n", 4202 par->nc_nlen, par->nc_nlen, par->nc_name, 4203 par->nc_error); 4204 _cache_put(par); 4205 return(error); 4206 } 4207 kprintf("[diagnostic] cache_resolve: EAGAIN par %p %*.*s\n", 4208 par, par->nc_nlen, par->nc_nlen, par->nc_name); 4209 } 4210 _cache_put(par); 4211 /* loop */ 4212 } 4213 4214 /* 4215 * We have a referenced dvp 4216 */ 4217 *dvpp = dvp; 4218 return 0; 4219 } 4220 4221 /* 4222 * Clean out negative cache entries when too many have accumulated. 4223 */ 4224 static void 4225 _cache_cleanneg(long count) 4226 { 4227 struct pcpu_ncache *pn; 4228 struct namecache *ncp; 4229 static uint32_t neg_rover; 4230 uint32_t n; 4231 long vnegs; 4232 4233 n = neg_rover++; /* SMP heuristical, race ok */ 4234 cpu_ccfence(); 4235 n = n % (uint32_t)ncpus; 4236 4237 /* 4238 * Normalize vfscache_negs and count. count is sometimes based 4239 * on vfscache_negs. vfscache_negs is heuristical and can sometimes 4240 * have crazy values. 4241 */ 4242 vnegs = vfscache_negs; 4243 cpu_ccfence(); 4244 if (vnegs <= MINNEG) 4245 vnegs = MINNEG; 4246 if (count < 1) 4247 count = 1; 4248 4249 pn = &pcpu_ncache[n]; 4250 spin_lock(&pn->neg_spin); 4251 count = pn->neg_count * count / vnegs + 1; 4252 spin_unlock(&pn->neg_spin); 4253 4254 /* 4255 * Attempt to clean out the specified number of negative cache 4256 * entries. 4257 */ 4258 while (count > 0) { 4259 spin_lock(&pn->neg_spin); 4260 ncp = TAILQ_FIRST(&pn->neg_list); 4261 if (ncp == NULL) { 4262 spin_unlock(&pn->neg_spin); 4263 break; 4264 } 4265 TAILQ_REMOVE(&pn->neg_list, ncp, nc_vnode); 4266 TAILQ_INSERT_TAIL(&pn->neg_list, ncp, nc_vnode); 4267 _cache_hold(ncp); 4268 spin_unlock(&pn->neg_spin); 4269 4270 /* 4271 * This can race, so we must re-check that the ncp 4272 * is on the ncneg.list after successfully locking it. 4273 */ 4274 if (_cache_lock_special(ncp) == 0) { 4275 if (ncp->nc_vp == NULL && 4276 (ncp->nc_flag & NCF_UNRESOLVED) == 0) 4277 { 4278 cache_zap(ncp); 4279 } else { 4280 _cache_unlock(ncp); 4281 _cache_drop(ncp); 4282 } 4283 } else { 4284 _cache_drop(ncp); 4285 } 4286 --count; 4287 } 4288 } 4289 4290 /* 4291 * Clean out positive cache entries when too many have accumulated. 4292 */ 4293 static void 4294 _cache_cleanpos(long count) 4295 { 4296 static volatile int rover; 4297 struct nchash_head *nchpp; 4298 struct namecache *ncp; 4299 int rover_copy; 4300 4301 /* 4302 * Attempt to clean out the specified number of negative cache 4303 * entries. 4304 */ 4305 while (count > 0) { 4306 rover_copy = ++rover; /* MPSAFEENOUGH */ 4307 cpu_ccfence(); 4308 nchpp = NCHHASH(rover_copy); 4309 4310 if (TAILQ_FIRST(&nchpp->list) == NULL) { 4311 --count; 4312 continue; 4313 } 4314 4315 /* 4316 * Cycle ncp on list, ignore and do not move DUMMY 4317 * ncps. These are temporary list iterators. 
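 * (DUMMY ncps are placeholder entries that scanners such as
 * _cache_cleandefered() below insert into a hash chain to remember
 * their position; they never represent a real name and must be
 * skipped here.)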
4318 * 4319 * We must cycle the ncp to the end of the list to 4320 * ensure that all ncp's have an equal chance of 4321 * being removed. 4322 */ 4323 spin_lock(&nchpp->spin); 4324 ncp = TAILQ_FIRST(&nchpp->list); 4325 while (ncp && (ncp->nc_flag & NCF_DUMMY)) 4326 ncp = TAILQ_NEXT(ncp, nc_hash); 4327 if (ncp) { 4328 TAILQ_REMOVE(&nchpp->list, ncp, nc_hash); 4329 TAILQ_INSERT_TAIL(&nchpp->list, ncp, nc_hash); 4330 _cache_hold(ncp); 4331 } 4332 spin_unlock(&nchpp->spin); 4333 4334 if (ncp) { 4335 if (_cache_lock_special(ncp) == 0) { 4336 cache_zap(ncp); 4337 } else { 4338 _cache_drop(ncp); 4339 } 4340 } 4341 --count; 4342 } 4343 } 4344 4345 /* 4346 * This is a kitchen sink function to clean out ncps which we 4347 * tried to zap from cache_drop() but failed because we were 4348 * unable to acquire the parent lock. 4349 * 4350 * Such entries can also be removed via cache_inval_vp(), such 4351 * as when unmounting. 4352 */ 4353 static void 4354 _cache_cleandefered(void) 4355 { 4356 struct nchash_head *nchpp; 4357 struct namecache *ncp; 4358 struct namecache dummy; 4359 int i; 4360 4361 /* 4362 * Create a list iterator. DUMMY indicates that this is a list 4363 * iterator, DESTROYED prevents matches by lookup functions. 4364 */ 4365 numdefered = 0; 4366 pcpu_ncache[mycpu->gd_cpuid].numdefered = 0; 4367 bzero(&dummy, sizeof(dummy)); 4368 dummy.nc_flag = NCF_DESTROYED | NCF_DUMMY; 4369 dummy.nc_refs = 1; 4370 4371 for (i = 0; i <= nchash; ++i) { 4372 nchpp = &nchashtbl[i]; 4373 4374 spin_lock(&nchpp->spin); 4375 TAILQ_INSERT_HEAD(&nchpp->list, &dummy, nc_hash); 4376 ncp = &dummy; 4377 while ((ncp = TAILQ_NEXT(ncp, nc_hash)) != NULL) { 4378 if ((ncp->nc_flag & NCF_DEFEREDZAP) == 0) 4379 continue; 4380 TAILQ_REMOVE(&nchpp->list, &dummy, nc_hash); 4381 TAILQ_INSERT_AFTER(&nchpp->list, ncp, &dummy, nc_hash); 4382 _cache_hold(ncp); 4383 spin_unlock(&nchpp->spin); 4384 if (_cache_lock_nonblock(ncp) == 0) { 4385 ncp->nc_flag &= ~NCF_DEFEREDZAP; 4386 _cache_unlock(ncp); 4387 } 4388 _cache_drop(ncp); 4389 spin_lock(&nchpp->spin); 4390 ncp = &dummy; 4391 } 4392 TAILQ_REMOVE(&nchpp->list, &dummy, nc_hash); 4393 spin_unlock(&nchpp->spin); 4394 } 4395 } 4396 4397 /* 4398 * Name cache initialization, from vfsinit() when we are booting 4399 */ 4400 void 4401 nchinit(void) 4402 { 4403 struct pcpu_ncache *pn; 4404 globaldata_t gd; 4405 int i; 4406 4407 /* 4408 * Per-cpu accounting and negative hit list 4409 */ 4410 pcpu_ncache = kmalloc(sizeof(*pcpu_ncache) * ncpus, 4411 M_VFSCACHEAUX, M_WAITOK|M_ZERO); 4412 for (i = 0; i < ncpus; ++i) { 4413 pn = &pcpu_ncache[i]; 4414 TAILQ_INIT(&pn->neg_list); 4415 spin_init(&pn->neg_spin, "ncneg"); 4416 spin_init(&pn->umount_spin, "ncumm"); 4417 } 4418 4419 /* 4420 * Initialise per-cpu namecache effectiveness statistics. 4421 */ 4422 for (i = 0; i < ncpus; ++i) { 4423 gd = globaldata_find(i); 4424 gd->gd_nchstats = &nchstats[i]; 4425 } 4426 4427 /* 4428 * Create a generous namecache hash table 4429 */ 4430 nchashtbl = hashinit_ext(vfs_inodehashsize(), 4431 sizeof(struct nchash_head), 4432 M_VFSCACHEAUX, &nchash); 4433 for (i = 0; i <= (int)nchash; ++i) { 4434 TAILQ_INIT(&nchashtbl[i].list); 4435 spin_init(&nchashtbl[i].spin, "nchinit_hash"); 4436 } 4437 for (i = 0; i < NCMOUNT_NUMCACHE; ++i) 4438 spin_init(&ncmount_cache[i].spin, "nchinit_cache"); 4439 nclockwarn = 5 * hz; 4440 } 4441 4442 /* 4443 * Called from start_init() to bootstrap the root filesystem. Returns 4444 * a referenced, unlocked namecache record. 
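 *
 * Illustrative pairing with vfs_cache_setroot() below (a sketch of the
 * bootstrap sequence only, not a copy of the actual boot code):
 *
 *	struct nchandle nch;
 *
 *	cache_allocroot(&nch, mp, rootvp);	// nch is refd, unlocked
 *	vfs_cache_setroot(rootvp, &nch);	// publish as namecache root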
4445 */ 4446 void 4447 cache_allocroot(struct nchandle *nch, struct mount *mp, struct vnode *vp) 4448 { 4449 nch->ncp = cache_alloc(0); 4450 nch->mount = mp; 4451 _cache_mntref(mp); 4452 if (vp) 4453 _cache_setvp(nch->mount, nch->ncp, vp); 4454 } 4455 4456 /* 4457 * vfs_cache_setroot() 4458 * 4459 * Create an association between the root of our namecache and 4460 * the root vnode. This routine may be called several times during 4461 * booting. 4462 * 4463 * If the caller intends to save the returned namecache pointer somewhere 4464 * it must cache_hold() it. 4465 */ 4466 void 4467 vfs_cache_setroot(struct vnode *nvp, struct nchandle *nch) 4468 { 4469 struct vnode *ovp; 4470 struct nchandle onch; 4471 4472 ovp = rootvnode; 4473 onch = rootnch; 4474 rootvnode = nvp; 4475 if (nch) 4476 rootnch = *nch; 4477 else 4478 cache_zero(&rootnch); 4479 if (ovp) 4480 vrele(ovp); 4481 if (onch.ncp) 4482 cache_drop(&onch); 4483 } 4484 4485 /* 4486 * XXX OLD API COMPAT FUNCTION. This really messes up the new namecache 4487 * topology and is being removed as quickly as possible. The new VOP_N*() 4488 * API calls are required to make specific adjustments using the supplied 4489 * ncp pointers rather then just bogusly purging random vnodes. 4490 * 4491 * Invalidate all namecache entries to a particular vnode as well as 4492 * any direct children of that vnode in the namecache. This is a 4493 * 'catch all' purge used by filesystems that do not know any better. 4494 * 4495 * Note that the linkage between the vnode and its namecache entries will 4496 * be removed, but the namecache entries themselves might stay put due to 4497 * active references from elsewhere in the system or due to the existance of 4498 * the children. The namecache topology is left intact even if we do not 4499 * know what the vnode association is. Such entries will be marked 4500 * NCF_UNRESOLVED. 4501 */ 4502 void 4503 cache_purge(struct vnode *vp) 4504 { 4505 cache_inval_vp(vp, CINV_DESTROY | CINV_CHILDREN); 4506 } 4507 4508 __read_mostly static int disablecwd; 4509 SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0, 4510 "Disable getcwd"); 4511 4512 /* 4513 * MPALMOSTSAFE 4514 */ 4515 int 4516 sys___getcwd(struct sysmsg *sysmsg, const struct __getcwd_args *uap) 4517 { 4518 u_int buflen; 4519 int error; 4520 char *buf; 4521 char *bp; 4522 4523 if (disablecwd) 4524 return (ENODEV); 4525 4526 buflen = uap->buflen; 4527 if (buflen == 0) 4528 return (EINVAL); 4529 if (buflen > MAXPATHLEN) 4530 buflen = MAXPATHLEN; 4531 4532 buf = kmalloc(buflen, M_TEMP, M_WAITOK); 4533 bp = kern_getcwd(buf, buflen, &error); 4534 if (error == 0) 4535 error = copyout(bp, uap->buf, strlen(bp) + 1); 4536 kfree(buf, M_TEMP); 4537 return (error); 4538 } 4539 4540 char * 4541 kern_getcwd(char *buf, size_t buflen, int *error) 4542 { 4543 struct proc *p = curproc; 4544 char *bp; 4545 int i, slash_prefixed; 4546 struct filedesc *fdp; 4547 struct nchandle nch; 4548 struct namecache *ncp; 4549 4550 bp = buf; 4551 bp += buflen - 1; 4552 *bp = '\0'; 4553 fdp = p->p_fd; 4554 slash_prefixed = 0; 4555 4556 nch = fdp->fd_ncdir; 4557 ncp = nch.ncp; 4558 if (ncp) 4559 _cache_hold(ncp); 4560 4561 while (ncp && (ncp != fdp->fd_nrdir.ncp || 4562 nch.mount != fdp->fd_nrdir.mount) 4563 ) { 4564 if (ncp->nc_flag & NCF_DESTROYED) { 4565 _cache_drop(ncp); 4566 ncp = NULL; 4567 break; 4568 } 4569 /* 4570 * While traversing upwards if we encounter the root 4571 * of the current mount we have to skip to the mount point 4572 * in the underlying filesystem. 
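 *
 * For example, if /usr is a separate filesystem and we are generating
 * the cwd for a process sitting in /usr/bin, walking upward from "bin"
 * eventually reaches the ncp that is the root of the /usr filesystem
 * (mnt_ncmountpt); we then hop to the "usr" ncp of the underlying
 * filesystem via mnt_ncmounton and continue toward fd_nrdir.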
4573 */ 4574 if (ncp == nch.mount->mnt_ncmountpt.ncp) { 4575 nch = nch.mount->mnt_ncmounton; 4576 _cache_drop(ncp); 4577 ncp = nch.ncp; 4578 if (ncp) 4579 _cache_hold(ncp); 4580 continue; 4581 } 4582 4583 /* 4584 * Prepend the path segment 4585 */ 4586 for (i = ncp->nc_nlen - 1; i >= 0; i--) { 4587 if (bp == buf) { 4588 *error = ERANGE; 4589 bp = NULL; 4590 goto done; 4591 } 4592 *--bp = ncp->nc_name[i]; 4593 } 4594 if (bp == buf) { 4595 *error = ERANGE; 4596 bp = NULL; 4597 goto done; 4598 } 4599 *--bp = '/'; 4600 slash_prefixed = 1; 4601 4602 /* 4603 * Go up a directory. This isn't a mount point so we don't 4604 * have to check again. 4605 */ 4606 while ((nch.ncp = ncp->nc_parent) != NULL) { 4607 if (ncp_shared_lock_disable) 4608 _cache_lock(ncp); 4609 else 4610 _cache_lock_shared(ncp); 4611 if (nch.ncp != ncp->nc_parent) { 4612 _cache_unlock(ncp); 4613 continue; 4614 } 4615 _cache_hold(nch.ncp); 4616 _cache_unlock(ncp); 4617 break; 4618 } 4619 _cache_drop(ncp); 4620 ncp = nch.ncp; 4621 } 4622 if (ncp == NULL) { 4623 *error = ENOENT; 4624 bp = NULL; 4625 goto done; 4626 } 4627 if (!slash_prefixed) { 4628 if (bp == buf) { 4629 *error = ERANGE; 4630 bp = NULL; 4631 goto done; 4632 } 4633 *--bp = '/'; 4634 } 4635 *error = 0; 4636 done: 4637 if (ncp) 4638 _cache_drop(ncp); 4639 return (bp); 4640 } 4641 4642 /* 4643 * Thus begins the fullpath magic. 4644 * 4645 * The passed nchp is referenced but not locked. 4646 */ 4647 __read_mostly static int disablefullpath; 4648 SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW, 4649 &disablefullpath, 0, 4650 "Disable fullpath lookups"); 4651 4652 int 4653 cache_fullpath(struct proc *p, struct nchandle *nchp, struct nchandle *nchbase, 4654 char **retbuf, char **freebuf, int guess) 4655 { 4656 struct nchandle fd_nrdir; 4657 struct nchandle nch; 4658 struct namecache *ncp; 4659 struct mount *mp, *new_mp; 4660 char *bp, *buf; 4661 int slash_prefixed; 4662 int error = 0; 4663 int i; 4664 4665 *retbuf = NULL; 4666 *freebuf = NULL; 4667 4668 buf = kmalloc(MAXPATHLEN, M_TEMP, M_WAITOK); 4669 bp = buf + MAXPATHLEN - 1; 4670 *bp = '\0'; 4671 if (nchbase) 4672 fd_nrdir = *nchbase; 4673 else if (p != NULL) 4674 fd_nrdir = p->p_fd->fd_nrdir; 4675 else 4676 fd_nrdir = rootnch; 4677 slash_prefixed = 0; 4678 nch = *nchp; 4679 ncp = nch.ncp; 4680 if (ncp) 4681 _cache_hold(ncp); 4682 mp = nch.mount; 4683 4684 while (ncp && (ncp != fd_nrdir.ncp || mp != fd_nrdir.mount)) { 4685 new_mp = NULL; 4686 4687 /* 4688 * If we are asked to guess the upwards path, we do so whenever 4689 * we encounter an ncp marked as a mountpoint. We try to find 4690 * the actual mountpoint by finding the mountpoint with this 4691 * ncp. 4692 */ 4693 if (guess && (ncp->nc_flag & NCF_ISMOUNTPT)) { 4694 new_mp = mount_get_by_nc(ncp); 4695 } 4696 /* 4697 * While traversing upwards if we encounter the root 4698 * of the current mount we have to skip to the mount point. 
4699 */ 4700 if (ncp == mp->mnt_ncmountpt.ncp) { 4701 new_mp = mp; 4702 } 4703 if (new_mp) { 4704 nch = new_mp->mnt_ncmounton; 4705 _cache_drop(ncp); 4706 ncp = nch.ncp; 4707 if (ncp) 4708 _cache_hold(ncp); 4709 mp = nch.mount; 4710 continue; 4711 } 4712 4713 /* 4714 * Prepend the path segment 4715 */ 4716 for (i = ncp->nc_nlen - 1; i >= 0; i--) { 4717 if (bp == buf) { 4718 kfree(buf, M_TEMP); 4719 error = ENOMEM; 4720 goto done; 4721 } 4722 *--bp = ncp->nc_name[i]; 4723 } 4724 if (bp == buf) { 4725 kfree(buf, M_TEMP); 4726 error = ENOMEM; 4727 goto done; 4728 } 4729 *--bp = '/'; 4730 slash_prefixed = 1; 4731 4732 /* 4733 * Go up a directory. This isn't a mount point so we don't 4734 * have to check again. 4735 * 4736 * We can only safely access nc_parent with ncp held locked. 4737 */ 4738 while ((nch.ncp = ncp->nc_parent) != NULL) { 4739 _cache_lock_shared(ncp); 4740 if (nch.ncp != ncp->nc_parent) { 4741 _cache_unlock(ncp); 4742 continue; 4743 } 4744 _cache_hold(nch.ncp); 4745 _cache_unlock(ncp); 4746 break; 4747 } 4748 _cache_drop(ncp); 4749 ncp = nch.ncp; 4750 } 4751 if (ncp == NULL) { 4752 kfree(buf, M_TEMP); 4753 error = ENOENT; 4754 goto done; 4755 } 4756 4757 if (!slash_prefixed) { 4758 if (bp == buf) { 4759 kfree(buf, M_TEMP); 4760 error = ENOMEM; 4761 goto done; 4762 } 4763 *--bp = '/'; 4764 } 4765 *retbuf = bp; 4766 *freebuf = buf; 4767 error = 0; 4768 done: 4769 if (ncp) 4770 _cache_drop(ncp); 4771 return(error); 4772 } 4773 4774 int 4775 vn_fullpath(struct proc *p, struct vnode *vn, char **retbuf, 4776 char **freebuf, int guess) 4777 { 4778 struct namecache *ncp; 4779 struct nchandle nch; 4780 int error; 4781 4782 *freebuf = NULL; 4783 if (disablefullpath) 4784 return (ENODEV); 4785 4786 if (p == NULL) 4787 return (EINVAL); 4788 4789 /* vn is NULL, client wants us to use p->p_textvp */ 4790 if (vn == NULL) { 4791 if ((vn = p->p_textvp) == NULL) 4792 return (EINVAL); 4793 } 4794 spin_lock_shared(&vn->v_spin); 4795 TAILQ_FOREACH(ncp, &vn->v_namecache, nc_vnode) { 4796 if (ncp->nc_nlen) 4797 break; 4798 } 4799 if (ncp == NULL) { 4800 spin_unlock_shared(&vn->v_spin); 4801 return (EINVAL); 4802 } 4803 _cache_hold(ncp); 4804 spin_unlock_shared(&vn->v_spin); 4805 4806 nch.ncp = ncp; 4807 nch.mount = vn->v_mount; 4808 error = cache_fullpath(p, &nch, NULL, retbuf, freebuf, guess); 4809 _cache_drop(ncp); 4810 return (error); 4811 } 4812 4813 void 4814 vfscache_rollup_cpu(struct globaldata *gd) 4815 { 4816 struct pcpu_ncache *pn; 4817 long count; 4818 4819 if (pcpu_ncache == NULL) 4820 return; 4821 pn = &pcpu_ncache[gd->gd_cpuid]; 4822 4823 if (pn->vfscache_count) { 4824 count = atomic_swap_long(&pn->vfscache_count, 0); 4825 atomic_add_long(&vfscache_count, count); 4826 } 4827 if (pn->vfscache_leafs) { 4828 count = atomic_swap_long(&pn->vfscache_leafs, 0); 4829 atomic_add_long(&vfscache_leafs, count); 4830 } 4831 if (pn->vfscache_negs) { 4832 count = atomic_swap_long(&pn->vfscache_negs, 0); 4833 atomic_add_long(&vfscache_negs, count); 4834 } 4835 if (pn->numdefered) { 4836 count = atomic_swap_long(&pn->numdefered, 0); 4837 atomic_add_long(&numdefered, count); 4838 } 4839 } 4840
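
/*
 * Illustrative consumer of the fullpath API above.  This caller is
 * hypothetical (it is not part of this file); it shows the intended
 * contract: *retbuf points into *freebuf, and *freebuf is an M_TEMP
 * allocation the caller must kfree() when it is done with the path.
 *
 *	char *fullpath, *freepath;
 *	int error;
 *
 *	error = vn_fullpath(p, vp, &fullpath, &freepath, 0);
 *	if (error == 0) {
 *		kprintf("path: %s\n", fullpath);
 *		kfree(freepath, M_TEMP);
 *	}
 */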