1 /* 2 * Copyright (c) 2003-2020 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * Copyright (c) 1989, 1993, 1995 35 * The Regents of the University of California. All rights reserved. 36 * 37 * This code is derived from software contributed to Berkeley by 38 * Poul-Henning Kamp of the FreeBSD Project. 39 * 40 * Redistribution and use in source and binary forms, with or without 41 * modification, are permitted provided that the following conditions 42 * are met: 43 * 1. Redistributions of source code must retain the above copyright 44 * notice, this list of conditions and the following disclaimer. 45 * 2. Redistributions in binary form must reproduce the above copyright 46 * notice, this list of conditions and the following disclaimer in the 47 * documentation and/or other materials provided with the distribution. 48 * 3. Neither the name of the University nor the names of its contributors 49 * may be used to endorse or promote products derived from this software 50 * without specific prior written permission. 51 * 52 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 53 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 54 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 55 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 56 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 57 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 58 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 59 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 60 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 61 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 62 * SUCH DAMAGE. 
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/uio.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/sysmsg.h>
#include <sys/spinlock.h>
#include <sys/proc.h>
#include <sys/nlookup.h>
#include <sys/filedesc.h>
#include <sys/fnv_hash.h>
#include <sys/globaldata.h>
#include <sys/kern_syscall.h>
#include <sys/dirent.h>
#include <ddb/ddb.h>

#include <sys/spinlock2.h>

#define MAX_RECURSION_DEPTH	64

/*
 * Random lookups in the cache are accomplished with a hash table using
 * a hash key of (nc_src_vp, name).  Each hash chain has its own spin lock,
 * but we use the ncp->update counter trick to avoid acquiring any
 * contestable spin-locks during a lookup.
 *
 * Negative entries may exist and correspond to resolved namecache
 * structures where nc_vp is NULL.  In a negative entry, NCF_WHITEOUT
 * will be set if the entry corresponds to a whited-out directory entry
 * (versus simply not finding the entry at all).  pcpu_ncache[n].neg_list
 * is locked via pcpu_ncache[n].neg_spin;
 *
 * MPSAFE RULES:
 *
 * (1) ncp's typically have at least a nc_refs of 1, and usually 2.  One
 *     is applicable to direct lookups via the hash table nchpp or via
 *     nc_list (the two are added or removed together).  Removal of the ncp
 *     from the hash table drops this reference.  The second is applicable
 *     to vp->v_namecache linkages (or negative list linkages), and removal
 *     of the ncp from these lists drops this reference.
 *
 *     On the 1->0 transition of nc_refs the ncp can no longer be referenced
 *     and must be destroyed.  No other thread should have access to it at
 *     this point so it can be safely locked and freed without any deadlock
 *     fears.
 *
 *     The 1->0 transition can occur at almost any juncture and so
 *     cache_drop() deals with it directly.
 *
 * (2) Once the 1->0 transition occurs, the entity that caused the transition
 *     will be responsible for destroying the ncp.  The ncp cannot be on any
 *     list or hash at this time, or be held by anyone other than the caller
 *     responsible for the transition.
 *
 * (3) A ncp must be locked in order to modify it.
 *
 * (5) ncp locks are ordered, child-to-parent.  Child first, then parent.
 *     This may seem backwards but forward-scans use the hash table and thus
 *     can hold the parent unlocked while traversing downward.  Deletions,
 *     on the other hand, tend to propagate bottom-up since the ref on the
 *     parent is dropped as the children go away.
 *
 * (6) Both parent and child must be locked in order to enter the child onto
 *     the parent's nc_list.
 */

/*
 * Structures associated with name caching.
 */
#define NCHHASH(hash)		(&nchashtbl[(hash) & nchash])
#define MINNEG			1024
#define MINPOS			1024
#define NCMOUNT_NUMCACHE	(16384)	/* power of 2 */
#define NCMOUNT_SET		(8)	/* power of 2 */

MALLOC_DEFINE_OBJ(M_VFSCACHE, sizeof(struct namecache),
		  "namecache", "namecache entries");
MALLOC_DEFINE(M_VFSCACHEAUX, "namecachestr", "namecache strings");

TAILQ_HEAD(nchash_list, namecache);

/*
 * Don't cachealign, but at least pad to 32 bytes so entries
 * don't cross a cache line.
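 *
 * With the layout below that works out to 16 (list head) + 8 (spinlock) +
 * 8 (pad) = 32 bytes, so two heads share a 64-byte line but neither one
 * straddles it.  A compile-time check along the lines of
 * CTASSERT(sizeof(struct nchash_head) == 32) would document the
 * assumption (sketch only, not present in this file).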
152 */ 153 struct nchash_head { 154 struct nchash_list list; /* 16 bytes */ 155 struct spinlock spin; /* 8 bytes */ 156 long pad01; /* 8 bytes */ 157 }; 158 159 struct ncmount_cache { 160 struct spinlock spin; 161 struct namecache *ncp; 162 struct mount *mp; 163 struct mount *mp_target; 164 int isneg; 165 int ticks; 166 int updating; 167 int unused01; 168 }; 169 170 struct pcpu_ncache { 171 struct spinlock umount_spin; /* cache_findmount/interlock */ 172 struct spinlock neg_spin; /* for neg_list and neg_count */ 173 struct namecache_list neg_list; 174 long neg_count; 175 long vfscache_negs; 176 long vfscache_count; 177 long vfscache_leafs; 178 long vfscache_unres; 179 long numdefered; 180 } __cachealign; 181 182 __read_mostly static struct nchash_head *nchashtbl; 183 __read_mostly static struct pcpu_ncache *pcpu_ncache; 184 static struct ncmount_cache ncmount_cache[NCMOUNT_NUMCACHE]; 185 186 /* 187 * ncvp_debug - debug cache_fromvp(). This is used by the NFS server 188 * to create the namecache infrastructure leading to a dangling vnode. 189 * 190 * 0 Only errors are reported 191 * 1 Successes are reported 192 * 2 Successes + the whole directory scan is reported 193 * 3 Force the directory scan code run as if the parent vnode did not 194 * have a namecache record, even if it does have one. 195 */ 196 __read_mostly static int ncvp_debug; 197 SYSCTL_INT(_debug, OID_AUTO, ncvp_debug, CTLFLAG_RW, &ncvp_debug, 0, 198 "Namecache debug level (0-3)"); 199 200 __read_mostly static u_long nchash; /* size of hash table */ 201 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, 202 "Size of namecache hash table"); 203 204 __read_mostly static int ncnegflush = 10; /* burst for negative flush */ 205 SYSCTL_INT(_debug, OID_AUTO, ncnegflush, CTLFLAG_RW, &ncnegflush, 0, 206 "Batch flush negative entries"); 207 208 __read_mostly static int ncposflush = 10; /* burst for positive flush */ 209 SYSCTL_INT(_debug, OID_AUTO, ncposflush, CTLFLAG_RW, &ncposflush, 0, 210 "Batch flush positive entries"); 211 212 __read_mostly static int ncnegfactor = 16; /* ratio of negative entries */ 213 SYSCTL_INT(_debug, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0, 214 "Ratio of negative namecache entries"); 215 216 __read_mostly static int ncposfactor = 16; /* ratio of unres+leaf entries */ 217 SYSCTL_INT(_debug, OID_AUTO, ncposfactor, CTLFLAG_RW, &ncposfactor, 0, 218 "Ratio of unresolved leaf namecache entries"); 219 220 __read_mostly static int nclockwarn; /* warn on locked entries in ticks */ 221 SYSCTL_INT(_debug, OID_AUTO, nclockwarn, CTLFLAG_RW, &nclockwarn, 0, 222 "Warn on locked namecache entries in ticks"); 223 224 __read_mostly static int ncposlimit; /* number of cache entries allocated */ 225 SYSCTL_INT(_debug, OID_AUTO, ncposlimit, CTLFLAG_RW, &ncposlimit, 0, 226 "Number of cache entries allocated"); 227 228 __read_mostly static int ncp_shared_lock_disable = 0; 229 SYSCTL_INT(_debug, OID_AUTO, ncp_shared_lock_disable, CTLFLAG_RW, 230 &ncp_shared_lock_disable, 0, "Disable shared namecache locks"); 231 232 SYSCTL_INT(_debug, OID_AUTO, vnsize, CTLFLAG_RD, 0, sizeof(struct vnode), 233 "sizeof(struct vnode)"); 234 SYSCTL_INT(_debug, OID_AUTO, ncsize, CTLFLAG_RD, 0, sizeof(struct namecache), 235 "sizeof(struct namecache)"); 236 237 __read_mostly static int ncmount_cache_enable = 1; 238 SYSCTL_INT(_debug, OID_AUTO, ncmount_cache_enable, CTLFLAG_RW, 239 &ncmount_cache_enable, 0, "mount point cache"); 240 241 static __inline void _cache_drop(struct namecache *ncp); 242 static int cache_resolve_mp(struct 
mount *mp);
static int cache_findmount_callback(struct mount *mp, void *data);
static void _cache_setunresolved(struct namecache *ncp);
static void _cache_cleanneg(long count);
static void _cache_cleanpos(long count);
static void _cache_cleandefered(void);
static void _cache_unlink(struct namecache *ncp);

/*
 * The new name cache statistics (these are rolled up globals and not
 * modified in the critical path, see struct pcpu_ncache).
 */
SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, "Name cache statistics");
static long vfscache_negs;
SYSCTL_LONG(_vfs_cache, OID_AUTO, numneg, CTLFLAG_RD, &vfscache_negs, 0,
    "Number of negative namecache entries");
static long vfscache_count;
SYSCTL_LONG(_vfs_cache, OID_AUTO, numcache, CTLFLAG_RD, &vfscache_count, 0,
    "Number of namecache entries");
static long vfscache_leafs;
SYSCTL_LONG(_vfs_cache, OID_AUTO, numleafs, CTLFLAG_RD, &vfscache_leafs, 0,
    "Number of leaf namecache entries");
static long vfscache_unres;
SYSCTL_LONG(_vfs_cache, OID_AUTO, numunres, CTLFLAG_RD, &vfscache_unres, 0,
    "Number of unresolved leaf namecache entries");
static long numdefered;
SYSCTL_LONG(_debug, OID_AUTO, numdefered, CTLFLAG_RD, &numdefered, 0,
    "Number of deferred namecache zaps");


struct nchstats nchstats[SMP_MAXCPU];
/*
 * Export VFS cache effectiveness statistics to user-land.
 *
 * The statistics are left for aggregation to user-land so
 * neat things can be achieved, like observing per-CPU cache
 * distribution.
 */
static int
sysctl_nchstats(SYSCTL_HANDLER_ARGS)
{
	struct globaldata *gd;
	int i, error;

	error = 0;
	for (i = 0; i < ncpus; ++i) {
		gd = globaldata_find(i);
		if ((error = SYSCTL_OUT(req, (void *)&(*gd->gd_nchstats),
					sizeof(struct nchstats))))
			break;
	}

	return (error);
}
SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE|CTLFLAG_RD,
    0, 0, sysctl_nchstats, "S,nchstats",
    "VFS cache effectiveness statistics");

static int cache_zap(struct namecache *ncp);

/*
 * Cache mount points and namecache records in order to avoid unnecessary
 * atomic ops on mnt_refs and ncp->refs.  This improves concurrent SMP
 * performance and is particularly important on multi-socket systems to
 * reduce cache-line ping-ponging.
 *
 * Try to keep the pcpu structure within one cache line (~64 bytes).
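 *
 * Schematically, _cache_mntref() below first probes the cpu-local set for
 * a previously parked reference (elm->mp == mp) and consumes it with a
 * plain atomic_swap_ptr(), only falling back to
 * atomic_add_int(&mp->mnt_refs, 1) on a miss; _cache_mntrel() is the
 * mirror image, parking the released ref in a free or least-recently-used
 * slot instead of dropping it globally.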
308 */ 309 #define MNTCACHE_COUNT 32 /* power of 2, multiple of SET */ 310 #define MNTCACHE_SET 8 /* set associativity */ 311 312 struct mntcache_elm { 313 struct namecache *ncp; 314 struct mount *mp; 315 int ticks; 316 int unused01; 317 }; 318 319 struct mntcache { 320 struct mntcache_elm array[MNTCACHE_COUNT]; 321 } __cachealign; 322 323 static struct mntcache pcpu_mntcache[MAXCPU]; 324 325 static __inline 326 struct mntcache_elm * 327 _cache_mntcache_hash(void *ptr) 328 { 329 struct mntcache_elm *elm; 330 int hv; 331 332 hv = iscsi_crc32(&ptr, sizeof(ptr)) & (MNTCACHE_COUNT - 1); 333 elm = &pcpu_mntcache[mycpu->gd_cpuid].array[hv & ~(MNTCACHE_SET - 1)]; 334 335 return elm; 336 } 337 338 static 339 void 340 _cache_mntref(struct mount *mp) 341 { 342 struct mntcache_elm *elm; 343 struct mount *mpr; 344 int i; 345 346 elm = _cache_mntcache_hash(mp); 347 for (i = 0; i < MNTCACHE_SET; ++i) { 348 if (elm->mp == mp) { 349 mpr = atomic_swap_ptr((void *)&elm->mp, NULL); 350 if (__predict_true(mpr == mp)) 351 return; 352 if (mpr) 353 atomic_add_int(&mpr->mnt_refs, -1); 354 } 355 ++elm; 356 } 357 atomic_add_int(&mp->mnt_refs, 1); 358 } 359 360 static 361 void 362 _cache_mntrel(struct mount *mp) 363 { 364 struct mntcache_elm *elm; 365 struct mntcache_elm *best; 366 struct mount *mpr; 367 int delta1; 368 int delta2; 369 int i; 370 371 elm = _cache_mntcache_hash(mp); 372 best = elm; 373 for (i = 0; i < MNTCACHE_SET; ++i) { 374 if (elm->mp == NULL) { 375 mpr = atomic_swap_ptr((void *)&elm->mp, mp); 376 if (__predict_false(mpr != NULL)) { 377 atomic_add_int(&mpr->mnt_refs, -1); 378 } 379 elm->ticks = ticks; 380 return; 381 } 382 delta1 = ticks - best->ticks; 383 delta2 = ticks - elm->ticks; 384 if (delta2 > delta1 || delta1 < -1 || delta2 < -1) 385 best = elm; 386 ++elm; 387 } 388 mpr = atomic_swap_ptr((void *)&best->mp, mp); 389 best->ticks = ticks; 390 if (mpr) 391 atomic_add_int(&mpr->mnt_refs, -1); 392 } 393 394 /* 395 * Clears all cached mount points on all cpus. This routine should only 396 * be called when we are waiting for a mount to clear, e.g. so we can 397 * unmount. 398 */ 399 void 400 cache_clearmntcache(struct mount *target __unused) 401 { 402 int n; 403 404 for (n = 0; n < ncpus; ++n) { 405 struct mntcache *cache = &pcpu_mntcache[n]; 406 struct mntcache_elm *elm; 407 struct namecache *ncp; 408 struct mount *mp; 409 int i; 410 411 for (i = 0; i < MNTCACHE_COUNT; ++i) { 412 elm = &cache->array[i]; 413 if (elm->mp) { 414 mp = atomic_swap_ptr((void *)&elm->mp, NULL); 415 if (mp) 416 atomic_add_int(&mp->mnt_refs, -1); 417 } 418 if (elm->ncp) { 419 ncp = atomic_swap_ptr((void *)&elm->ncp, NULL); 420 if (ncp) 421 _cache_drop(ncp); 422 } 423 } 424 } 425 } 426 427 /* 428 * Namespace locking. The caller must already hold a reference to the 429 * namecache structure in order to lock/unlock it. The controlling entity 430 * in a 1->0 transition does not need to lock the ncp to dispose of it, 431 * as nobody else will have visibility to it at that point. 432 * 433 * Note that holding a locked namecache structure prevents other threads 434 * from making namespace changes (e.g. deleting or creating), prevents 435 * vnode association state changes by other threads, and prevents the 436 * namecache entry from being resolved or unresolved by other threads. 437 * 438 * An exclusive lock owner has full authority to associate/disassociate 439 * vnodes and resolve/unresolve the locked ncp. 440 * 441 * A shared lock owner only has authority to acquire the underlying vnode, 442 * if any. 
 *
 * The primary lock field is nc_lockstatus.  nc_locktd is set after the
 * fact (when locking) or cleared prior to unlocking.
 *
 * WARNING!  Holding a locked ncp will prevent a vnode from being destroyed
 *	     or recycled, but it does NOT help you if the vnode had already
 *	     initiated a recyclement.  If this is important, use cache_get()
 *	     rather than cache_lock() (and deal with the differences in the
 *	     way the refs counter is handled).  Or, alternatively, make an
 *	     unconditional call to cache_validate() or cache_resolve()
 *	     after cache_lock() returns.
 */
static __inline
void
_cache_lock(struct namecache *ncp)
{
	int didwarn = 0;
	int error;

	error = lockmgr(&ncp->nc_lock, LK_EXCLUSIVE);
	while (__predict_false(error == EWOULDBLOCK)) {
		if (didwarn == 0) {
			didwarn = ticks - nclockwarn;
			kprintf("[diagnostic] cache_lock: "
				"%s blocked on %p "
				"\"%*.*s\"\n",
				curthread->td_comm, ncp,
				ncp->nc_nlen, ncp->nc_nlen,
				ncp->nc_name);
		}
		error = lockmgr(&ncp->nc_lock, LK_EXCLUSIVE | LK_TIMELOCK);
	}
	if (__predict_false(didwarn)) {
		kprintf("[diagnostic] cache_lock: "
			"%s unblocked %*.*s after %d secs\n",
			curthread->td_comm,
			ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name,
			(int)(ticks - didwarn) / hz);
	}
}

/*
 * Release a previously acquired lock.
 *
 * A concurrent shared-lock acquisition or acquisition/release can
 * race bit 31 so only drop the ncp if bit 31 was set.
 */
static __inline
void
_cache_unlock(struct namecache *ncp)
{
	lockmgr(&ncp->nc_lock, LK_RELEASE);
}

/*
 * Lock ncp exclusively, non-blocking.  Return 0 on success.
 */
static __inline
int
_cache_lock_nonblock(struct namecache *ncp)
{
	int error;

	error = lockmgr(&ncp->nc_lock, LK_EXCLUSIVE | LK_NOWAIT);
	if (__predict_false(error != 0)) {
		return(EWOULDBLOCK);
	}
	return 0;
}

/*
 * This is a special form of _cache_lock() which only succeeds if
 * it can get a pristine, non-recursive lock.  The caller must have
 * already ref'd the ncp.
 *
 * On success the ncp will be locked, on failure it will not.  The
 * ref count does not change either way.
 *
 * We want _cache_lock_special() (on success) to return a definitively
 * usable vnode or a definitively unresolved ncp.
 */
static __inline
int
_cache_lock_special(struct namecache *ncp)
{
	if (_cache_lock_nonblock(ncp) == 0) {
		if (lockmgr_oneexcl(&ncp->nc_lock)) {
			if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
				_cache_setunresolved(ncp);
			return 0;
		}
		_cache_unlock(ncp);
	}
	return EWOULDBLOCK;
}

/*
 * Shared lock, guarantees vp held
 *
 * The shared lock holds vp on the 0->1 transition.  It is possible to race
 * another shared lock release, preventing the other release from dropping
 * the vnode and clearing bit 31.
 *
 * If it is not set then we are responsible for setting it, and this
 * responsibility does not race with anyone else.
548 */ 549 static __inline 550 void 551 _cache_lock_shared(struct namecache *ncp) 552 { 553 int didwarn = 0; 554 int error; 555 556 error = lockmgr(&ncp->nc_lock, LK_SHARED | LK_TIMELOCK); 557 while (__predict_false(error == EWOULDBLOCK)) { 558 if (didwarn == 0) { 559 didwarn = ticks - nclockwarn; 560 kprintf("[diagnostic] cache_lock_shared: " 561 "%s blocked on %p " 562 "\"%*.*s\"\n", 563 curthread->td_comm, ncp, 564 ncp->nc_nlen, ncp->nc_nlen, 565 ncp->nc_name); 566 } 567 error = lockmgr(&ncp->nc_lock, LK_SHARED | LK_TIMELOCK); 568 } 569 if (__predict_false(didwarn)) { 570 kprintf("[diagnostic] cache_lock_shared: " 571 "%s unblocked %*.*s after %d secs\n", 572 curthread->td_comm, 573 ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name, 574 (int)(ticks - didwarn) / hz); 575 } 576 } 577 578 /* 579 * Shared lock, guarantees vp held. Non-blocking. Returns 0 on success 580 */ 581 static __inline 582 int 583 _cache_lock_shared_nonblock(struct namecache *ncp) 584 { 585 int error; 586 587 error = lockmgr(&ncp->nc_lock, LK_SHARED | LK_NOWAIT); 588 if (__predict_false(error != 0)) { 589 return(EWOULDBLOCK); 590 } 591 return 0; 592 } 593 594 /* 595 * This function tries to get a shared lock but will back-off to an 596 * exclusive lock if: 597 * 598 * (1) Some other thread is trying to obtain an exclusive lock 599 * (to prevent the exclusive requester from getting livelocked out 600 * by many shared locks). 601 * 602 * (2) The current thread already owns an exclusive lock (to avoid 603 * deadlocking). 604 * 605 * WARNING! On machines with lots of cores we really want to try hard to 606 * get a shared lock or concurrent path lookups can chain-react 607 * into a very high-latency exclusive lock. 608 * 609 * This is very evident in dsynth's initial scans. 610 */ 611 static __inline 612 int 613 _cache_lock_shared_special(struct namecache *ncp) 614 { 615 /* 616 * Only honor a successful shared lock (returning 0) if there is 617 * no exclusive request pending and the vnode, if present, is not 618 * in a reclaimed state. 619 */ 620 if (_cache_lock_shared_nonblock(ncp) == 0) { 621 if (__predict_true(!lockmgr_exclpending(&ncp->nc_lock))) { 622 if (ncp->nc_vp == NULL || 623 (ncp->nc_vp->v_flag & VRECLAIMED) == 0) { 624 return(0); 625 } 626 } 627 _cache_unlock(ncp); 628 return(EWOULDBLOCK); 629 } 630 631 /* 632 * Non-blocking shared lock failed. If we already own the exclusive 633 * lock just acquire another exclusive lock (instead of deadlocking). 634 * Otherwise acquire a shared lock. 635 */ 636 if (lockstatus(&ncp->nc_lock, curthread) == LK_EXCLUSIVE) { 637 _cache_lock(ncp); 638 return(0); 639 } 640 _cache_lock_shared(ncp); 641 return(0); 642 } 643 644 static __inline 645 int 646 _cache_lockstatus(struct namecache *ncp) 647 { 648 int status; 649 650 status = lockstatus(&ncp->nc_lock, curthread); 651 if (status == 0 || status == LK_EXCLOTHER) 652 status = -1; 653 return status; 654 } 655 656 /* 657 * cache_hold() and cache_drop() prevent the premature deletion of a 658 * namecache entry but do not prevent operations (such as zapping) on 659 * that namecache entry. 660 * 661 * This routine may only be called from outside this source module if 662 * nc_refs is already deterministically at least 1, such as being 663 * associated with e.g. a process, file descriptor, or some other entity. 664 * 665 * Only the above situations, similar situations within this module where 666 * the ref count is deterministically at least 1, or when the ncp is found 667 * via the nchpp (hash table) lookup, can bump nc_refs. 
 *
 * Very specifically, a ncp found via nc_list CANNOT bump nc_refs.  It
 * can still be removed from the nc_list, however, as long as the caller
 * can acquire its lock (in the wrong order).
 *
 * This is a rare case where callers are allowed to hold a spinlock,
 * so we can't ourselves.
 */
static __inline
struct namecache *
_cache_hold(struct namecache *ncp)
{
	KKASSERT(ncp->nc_refs > 0);
	atomic_add_int(&ncp->nc_refs, 1);

	return(ncp);
}

/*
 * Drop a cache entry.
 *
 * The 1->0 transition is special and requires the caller to destroy the
 * entry.  It means that the ncp is no longer on a nchpp list (since that
 * would mean there was still a ref).  The ncp could still be on a nc_list
 * but will not have any child of its own, again because nc_refs is now 0
 * and children would have a ref to their parent.
 *
 * Once the 1->0 transition is made, nc_refs cannot be incremented again.
 */
static __inline
void
_cache_drop(struct namecache *ncp)
{
	if (atomic_fetchadd_int(&ncp->nc_refs, -1) == 1) {
		/*
		 * Executed unlocked (no need to lock on last drop)
		 */
		_cache_setunresolved(ncp);

		/*
		 * Scrap it.
		 */
		ncp->nc_refs = -1;	/* safety */
		if (ncp->nc_name)
			kfree(ncp->nc_name, M_VFSCACHEAUX);
		kfree_obj(ncp, M_VFSCACHE);
	}
}

/*
 * Link a new namecache entry to its parent and to the hash table.  Be
 * careful to avoid races if vhold() blocks in the future.
 *
 * Both ncp and par must be referenced and locked.  The reference is
 * transferred to the nchpp (and, most notably, NOT to the parent list).
 *
 * NOTE: The hash table spinlock is held across this call, we can't do
 *	 anything fancy.
 */
static void
_cache_link_parent(struct namecache *ncp, struct namecache *par,
		   struct nchash_head *nchpp)
{
	struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid];

	KKASSERT(ncp->nc_parent == NULL);
	ncp->nc_parent = par;
	ncp->nc_head = nchpp;

	/*
	 * Set inheritance flags.  Note that the parent flags may be
	 * stale due to getattr potentially not having been run yet
	 * (it gets run during nlookup()'s).
	 */
	ncp->nc_flag &= ~(NCF_SF_PNOCACHE | NCF_UF_PCACHE);
	if (par->nc_flag & (NCF_SF_NOCACHE | NCF_SF_PNOCACHE))
		ncp->nc_flag |= NCF_SF_PNOCACHE;
	if (par->nc_flag & (NCF_UF_CACHE | NCF_UF_PCACHE))
		ncp->nc_flag |= NCF_UF_PCACHE;

	/*
	 * Add to hash table and parent, adjust accounting
	 */
	TAILQ_INSERT_HEAD(&nchpp->list, ncp, nc_hash);
	atomic_add_long(&pn->vfscache_count, 1);

	/*
	 * ncp is a new leaf being added to the tree
	 */
	if (TAILQ_EMPTY(&ncp->nc_list)) {
		atomic_add_long(&pn->vfscache_leafs, 1);
		if (ncp->nc_flag & NCF_UNRESOLVED)
			atomic_add_long(&pn->vfscache_unres, 1);
	}

	if (TAILQ_EMPTY(&par->nc_list)) {
		/*
		 * Parent was, but now is no longer a leaf
		 */
		TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry);
		if (par->nc_flag & NCF_UNRESOLVED)
			atomic_add_long(&pn->vfscache_unres, -1);
		atomic_add_long(&pn->vfscache_leafs, -1);

		/*
		 * Any vp associated with an ncp which has children must
		 * be held to prevent it from being recycled.
		 */
		if (par->nc_vp)
			vhold(par->nc_vp);
	} else {
		TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry);
	}
	_cache_hold(par);	/* add nc_parent ref */
}
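/*
 * Illustrative sketch of the calling sequence implied by the notes above
 * (an assumption about the caller, not code taken from this section):
 *
 *	ncp = cache_alloc(nlen);		ref=1, returned locked
 *	bcopy(name, ncp->nc_name, nlen);
 *	spin_lock(&nchpp->spin);
 *	_cache_link_parent(ncp, par, nchpp);	ref moves to the nchpp linkage
 *	spin_unlock(&nchpp->spin);
 *
 * with both ncp and par referenced and locked across the call.
 */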
/*
 * Remove the parent and hash associations from a namecache structure.
 * Drop the ref-count on the parent.  The caller receives the ref
 * from the ncp's nchpp linkage that was removed and may forward that
 * ref to a new linkage.
 *
 * The caller usually holds an additional ref on the ncp so the unlink
 * cannot be the final drop.  XXX should not be necessary now since the
 * caller receives the ref from the nchpp linkage, assuming the ncp
 * was linked in the first place.
 *
 * ncp must be locked, which means that there won't be any nc_parent
 * removal races.  This routine will acquire a temporary lock on
 * the parent as well as the appropriate hash chain.
 *
 * par must be locked and will remain locked on return.
 *
 * nchpp must be spin-locked.  This routine eats the spin-lock.
 */
static __inline void
_cache_unlink_parent(struct namecache *par, struct namecache *ncp,
		     struct nchash_head *nchpp)
{
	struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid];
	struct vnode *dropvp;

	KKASSERT(ncp->nc_parent == par);
	cpu_ccfence();

	/* don't add a ref, we drop the nchpp ref later */

	/*
	 * Remove from hash table and parent, adjust accounting
	 */
	TAILQ_REMOVE(&ncp->nc_head->list, ncp, nc_hash);
	TAILQ_REMOVE(&par->nc_list, ncp, nc_entry);
	atomic_add_long(&pn->vfscache_count, -1);

	/*
	 * Removing leaf from tree
	 */
	if (TAILQ_EMPTY(&ncp->nc_list)) {
		if (ncp->nc_flag & NCF_UNRESOLVED)
			atomic_add_long(&pn->vfscache_unres, -1);
		atomic_add_long(&pn->vfscache_leafs, -1);
	}

	/*
	 * Parent is now a leaf?
	 */
	dropvp = NULL;
	if (TAILQ_EMPTY(&par->nc_list)) {
		if (par->nc_flag & NCF_UNRESOLVED)
			atomic_add_long(&pn->vfscache_unres, 1);
		atomic_add_long(&pn->vfscache_leafs, 1);
		if (par->nc_vp)
			dropvp = par->nc_vp;
	}
	ncp->nc_parent = NULL;
	ncp->nc_head = NULL;
	spin_unlock(&nchpp->spin);
	_cache_drop(par);	/* drop nc_parent ref from ncp */

	/*
	 * We can only safely vdrop with no spinlocks held.
	 */
	if (dropvp)
		vdrop(dropvp);
}

/*
 * Allocate a new namecache structure.  Most of the code does not require
 * zero-termination of the string but it makes vop_compat_ncreate() easier.
 *
 * The returned ncp will be locked and referenced.  The ref is generally meant
 * to be transferred to the nchpp linkage.
 */
static struct namecache *
cache_alloc(int nlen)
{
	struct namecache *ncp;

	ncp = kmalloc_obj(sizeof(*ncp), M_VFSCACHE, M_WAITOK|M_ZERO);
	if (nlen)
		ncp->nc_name = kmalloc(nlen + 1, M_VFSCACHEAUX, M_WAITOK);
	ncp->nc_nlen = nlen;
	ncp->nc_flag = NCF_UNRESOLVED;
	ncp->nc_error = ENOTCONN;	/* needs to be resolved */
	ncp->nc_refs = 1;
	TAILQ_INIT(&ncp->nc_list);
	lockinit(&ncp->nc_lock, "ncplk", hz, LK_CANRECURSE);
	lockmgr(&ncp->nc_lock, LK_EXCLUSIVE);

	return(ncp);
}

/*
 * Can only be called for the case where the ncp has never been
 * associated with anything (so no spinlocks are needed).
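 *
 * For illustration (a hedged sketch, not a path shown in this section):
 * an error path that did "ncp = cache_alloc(nlen);" but never linked the
 * entry into the hash table or a parent can dispose of it directly with
 * _cache_free(ncp) instead of the usual unlock-and-drop teardown.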
883 */ 884 static void 885 _cache_free(struct namecache *ncp) 886 { 887 KKASSERT(ncp->nc_refs == 1); 888 if (ncp->nc_name) 889 kfree(ncp->nc_name, M_VFSCACHEAUX); 890 kfree_obj(ncp, M_VFSCACHE); 891 } 892 893 /* 894 * [re]initialize a nchandle. 895 */ 896 void 897 cache_zero(struct nchandle *nch) 898 { 899 nch->ncp = NULL; 900 nch->mount = NULL; 901 } 902 903 /* 904 * Ref and deref a nchandle structure (ncp + mp) 905 * 906 * The caller must specify a stable ncp pointer, typically meaning the 907 * ncp is already referenced but this can also occur indirectly through 908 * e.g. holding a lock on a direct child. 909 * 910 * WARNING: Caller may hold an unrelated read spinlock, which means we can't 911 * use read spinlocks here. 912 */ 913 struct nchandle * 914 cache_hold(struct nchandle *nch) 915 { 916 _cache_hold(nch->ncp); 917 _cache_mntref(nch->mount); 918 return(nch); 919 } 920 921 /* 922 * Create a copy of a namecache handle for an already-referenced 923 * entry. 924 */ 925 void 926 cache_copy(struct nchandle *nch, struct nchandle *target) 927 { 928 struct namecache *ncp; 929 struct mount *mp; 930 struct mntcache_elm *elm; 931 struct namecache *ncpr; 932 int i; 933 934 ncp = nch->ncp; 935 mp = nch->mount; 936 target->ncp = ncp; 937 target->mount = mp; 938 939 elm = _cache_mntcache_hash(ncp); 940 for (i = 0; i < MNTCACHE_SET; ++i) { 941 if (elm->ncp == ncp) { 942 ncpr = atomic_swap_ptr((void *)&elm->ncp, NULL); 943 if (ncpr == ncp) { 944 _cache_mntref(mp); 945 return; 946 } 947 if (ncpr) 948 _cache_drop(ncpr); 949 } 950 ++elm; 951 } 952 if (ncp) 953 _cache_hold(ncp); 954 _cache_mntref(mp); 955 } 956 957 /* 958 * Drop the nchandle, but try to cache the ref to avoid global atomic 959 * ops. This is typically done on the system root and jail root nchandles. 
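 *
 * Illustrative sketch (an assumed caller, not code from this file): a loop
 * that repeatedly starts lookups from the same root can do
 *
 *	cache_copy(&rootnch, &nch);
 *	... use nch ...
 *	cache_drop_and_cache(&nch, 0);
 *
 * so the ref is parked in the per-cpu mntcache instead of bouncing
 * nc_refs/mnt_refs with global atomic ops on every iteration.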
960 */ 961 void 962 cache_drop_and_cache(struct nchandle *nch, int elmno) 963 { 964 struct mntcache_elm *elm; 965 struct mntcache_elm *best; 966 struct namecache *ncpr; 967 int delta1; 968 int delta2; 969 int i; 970 971 if (elmno > 4) { 972 if (nch->ncp) { 973 _cache_drop(nch->ncp); 974 nch->ncp = NULL; 975 } 976 if (nch->mount) { 977 _cache_mntrel(nch->mount); 978 nch->mount = NULL; 979 } 980 return; 981 } 982 983 elm = _cache_mntcache_hash(nch->ncp); 984 best = elm; 985 for (i = 0; i < MNTCACHE_SET; ++i) { 986 if (elm->ncp == NULL) { 987 ncpr = atomic_swap_ptr((void *)&elm->ncp, nch->ncp); 988 _cache_mntrel(nch->mount); 989 elm->ticks = ticks; 990 nch->mount = NULL; 991 nch->ncp = NULL; 992 if (ncpr) 993 _cache_drop(ncpr); 994 return; 995 } 996 delta1 = ticks - best->ticks; 997 delta2 = ticks - elm->ticks; 998 if (delta2 > delta1 || delta1 < -1 || delta2 < -1) 999 best = elm; 1000 ++elm; 1001 } 1002 ncpr = atomic_swap_ptr((void *)&best->ncp, nch->ncp); 1003 _cache_mntrel(nch->mount); 1004 best->ticks = ticks; 1005 nch->mount = NULL; 1006 nch->ncp = NULL; 1007 if (ncpr) 1008 _cache_drop(ncpr); 1009 } 1010 1011 void 1012 cache_changemount(struct nchandle *nch, struct mount *mp) 1013 { 1014 _cache_mntref(mp); 1015 _cache_mntrel(nch->mount); 1016 nch->mount = mp; 1017 } 1018 1019 void 1020 cache_drop(struct nchandle *nch) 1021 { 1022 _cache_mntrel(nch->mount); 1023 _cache_drop(nch->ncp); 1024 nch->ncp = NULL; 1025 nch->mount = NULL; 1026 } 1027 1028 int 1029 cache_lockstatus(struct nchandle *nch) 1030 { 1031 return(_cache_lockstatus(nch->ncp)); 1032 } 1033 1034 void 1035 cache_lock(struct nchandle *nch) 1036 { 1037 _cache_lock(nch->ncp); 1038 } 1039 1040 void 1041 cache_lock_maybe_shared(struct nchandle *nch, int excl) 1042 { 1043 struct namecache *ncp = nch->ncp; 1044 1045 if (ncp_shared_lock_disable || excl || 1046 (ncp->nc_flag & NCF_UNRESOLVED)) { 1047 _cache_lock(ncp); 1048 } else { 1049 _cache_lock_shared(ncp); 1050 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 1051 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) { 1052 _cache_unlock(ncp); 1053 _cache_lock(ncp); 1054 } 1055 } else { 1056 _cache_unlock(ncp); 1057 _cache_lock(ncp); 1058 } 1059 } 1060 } 1061 1062 /* 1063 * Lock fncpd, fncp, tncpd, and tncp. tncp is already locked but may 1064 * have to be cycled to avoid deadlocks. Make sure all four are resolved. 1065 * 1066 * The caller is responsible for checking the validity upon return as 1067 * the records may have been flagged DESTROYED in the interim. 1068 * 1069 * Namecache lock ordering is leaf first, then parent. However, complex 1070 * interactions may occur between the source and target because there is 1071 * no ordering guarantee between (fncpd, fncp) and (tncpd and tncp). 1072 */ 1073 void 1074 cache_lock4_tondlocked(struct nchandle *fncpd, struct nchandle *fncp, 1075 struct nchandle *tncpd, struct nchandle *tncp, 1076 struct ucred *fcred, struct ucred *tcred) 1077 { 1078 int tlocked = 1; 1079 1080 /* 1081 * Lock tncp and tncpd 1082 * 1083 * NOTE: Because these ncps are not locked to begin with, it is 1084 * possible for other rename races to cause the normal lock 1085 * order assumptions to fail. 1086 * 1087 * NOTE: Lock ordering assumptions are valid if a leaf's parent 1088 * matches after the leaf has been locked. However, ordering 1089 * between the 'from' and the 'to' is not and an overlapping 1090 * lock order reversal is still possible. 
1091 */ 1092 again: 1093 if (__predict_false(tlocked == 0)) { 1094 cache_lock(tncp); 1095 } 1096 if (__predict_false(cache_lock_nonblock(tncpd) != 0)) { 1097 cache_unlock(tncp); 1098 cache_lock(tncpd); cache_unlock(tncpd); /* cycle */ 1099 tlocked = 0; 1100 goto again; 1101 } 1102 1103 /* 1104 * Lock fncp and fncpd 1105 * 1106 * NOTE: Because these ncps are not locked to begin with, it is 1107 * possible for other rename races to cause the normal lock 1108 * order assumptions to fail. 1109 * 1110 * NOTE: Lock ordering assumptions are valid if a leaf's parent 1111 * matches after the leaf has been locked. However, ordering 1112 * between the 'from' and the 'to' is not and an overlapping 1113 * lock order reversal is still possible. 1114 */ 1115 if (__predict_false(cache_lock_nonblock(fncp) != 0)) { 1116 cache_unlock(tncpd); 1117 cache_unlock(tncp); 1118 cache_lock(fncp); cache_unlock(fncp); /* cycle */ 1119 tlocked = 0; 1120 goto again; 1121 } 1122 if (__predict_false(cache_lock_nonblock(fncpd) != 0)) { 1123 cache_unlock(fncp); 1124 cache_unlock(tncpd); 1125 cache_unlock(tncp); 1126 cache_lock(fncpd); cache_unlock(fncpd); /* cycle */ 1127 tlocked = 0; 1128 goto again; 1129 } 1130 if (__predict_true((fncpd->ncp->nc_flag & NCF_DESTROYED) == 0)) 1131 cache_resolve(fncpd, fcred); 1132 if (__predict_true((tncpd->ncp->nc_flag & NCF_DESTROYED) == 0)) 1133 cache_resolve(tncpd, tcred); 1134 if (__predict_true((fncp->ncp->nc_flag & NCF_DESTROYED) == 0)) 1135 cache_resolve(fncp, fcred); 1136 if (__predict_true((tncp->ncp->nc_flag & NCF_DESTROYED) == 0)) 1137 cache_resolve(tncp, tcred); 1138 } 1139 1140 int 1141 cache_lock_nonblock(struct nchandle *nch) 1142 { 1143 return(_cache_lock_nonblock(nch->ncp)); 1144 } 1145 1146 void 1147 cache_unlock(struct nchandle *nch) 1148 { 1149 _cache_unlock(nch->ncp); 1150 } 1151 1152 /* 1153 * ref-and-lock, unlock-and-deref functions. 1154 * 1155 * This function is primarily used by nlookup. Even though cache_lock 1156 * holds the vnode, it is possible that the vnode may have already 1157 * initiated a recyclement. 1158 * 1159 * We want cache_get() to return a definitively usable vnode or a 1160 * definitively unresolved ncp. 1161 */ 1162 static 1163 struct namecache * 1164 _cache_get(struct namecache *ncp) 1165 { 1166 _cache_hold(ncp); 1167 _cache_lock(ncp); 1168 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) 1169 _cache_setunresolved(ncp); 1170 return(ncp); 1171 } 1172 1173 /* 1174 * Attempt to obtain a shared lock on the ncp. A shared lock will only 1175 * be obtained if the ncp is resolved and the vnode (if not ENOENT) is 1176 * valid. Otherwise an exclusive lock will be acquired instead. 1177 */ 1178 static 1179 struct namecache * 1180 _cache_get_maybe_shared(struct namecache *ncp, int excl) 1181 { 1182 if (ncp_shared_lock_disable || excl || 1183 (ncp->nc_flag & NCF_UNRESOLVED)) { 1184 return(_cache_get(ncp)); 1185 } 1186 _cache_hold(ncp); 1187 _cache_lock_shared(ncp); 1188 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 1189 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) { 1190 _cache_unlock(ncp); 1191 ncp = _cache_get(ncp); 1192 _cache_drop(ncp); 1193 } 1194 } else { 1195 _cache_unlock(ncp); 1196 ncp = _cache_get(ncp); 1197 _cache_drop(ncp); 1198 } 1199 return(ncp); 1200 } 1201 1202 /* 1203 * NOTE: The same nchandle can be passed for both arguments. 
1204 */ 1205 void 1206 cache_get(struct nchandle *nch, struct nchandle *target) 1207 { 1208 KKASSERT(nch->ncp->nc_refs > 0); 1209 target->mount = nch->mount; 1210 target->ncp = _cache_get(nch->ncp); 1211 _cache_mntref(target->mount); 1212 } 1213 1214 void 1215 cache_get_maybe_shared(struct nchandle *nch, struct nchandle *target, int excl) 1216 { 1217 KKASSERT(nch->ncp->nc_refs > 0); 1218 target->mount = nch->mount; 1219 target->ncp = _cache_get_maybe_shared(nch->ncp, excl); 1220 _cache_mntref(target->mount); 1221 } 1222 1223 /* 1224 * Release a held and locked ncp 1225 */ 1226 static __inline 1227 void 1228 _cache_put(struct namecache *ncp) 1229 { 1230 _cache_unlock(ncp); 1231 _cache_drop(ncp); 1232 } 1233 1234 void 1235 cache_put(struct nchandle *nch) 1236 { 1237 _cache_mntrel(nch->mount); 1238 _cache_put(nch->ncp); 1239 nch->ncp = NULL; 1240 nch->mount = NULL; 1241 } 1242 1243 /* 1244 * Resolve an unresolved ncp by associating a vnode with it. If the 1245 * vnode is NULL, a negative cache entry is created. 1246 * 1247 * The ncp should be locked on entry and will remain locked on return. 1248 */ 1249 static 1250 void 1251 _cache_setvp(struct mount *mp, struct namecache *ncp, struct vnode *vp) 1252 { 1253 struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid]; 1254 1255 KKASSERT((ncp->nc_flag & NCF_UNRESOLVED) && 1256 (_cache_lockstatus(ncp) == LK_EXCLUSIVE) && 1257 ncp->nc_vp == NULL); 1258 1259 if (vp) { 1260 /* 1261 * Any vp associated with an ncp which has children must 1262 * be held. Any vp associated with a locked ncp must be held. 1263 */ 1264 if (!TAILQ_EMPTY(&ncp->nc_list)) 1265 vhold(vp); 1266 spin_lock(&vp->v_spin); 1267 ncp->nc_vp = vp; 1268 TAILQ_INSERT_HEAD(&vp->v_namecache, ncp, nc_vnode); 1269 ++vp->v_namecache_count; 1270 _cache_hold(ncp); /* v_namecache assoc */ 1271 spin_unlock(&vp->v_spin); 1272 vhold(vp); /* nc_vp */ 1273 1274 /* 1275 * Set auxiliary flags 1276 */ 1277 switch(vp->v_type) { 1278 case VDIR: 1279 ncp->nc_flag |= NCF_ISDIR; 1280 break; 1281 case VLNK: 1282 ncp->nc_flag |= NCF_ISSYMLINK; 1283 /* XXX cache the contents of the symlink */ 1284 break; 1285 default: 1286 break; 1287 } 1288 1289 ncp->nc_error = 0; 1290 1291 /* 1292 * XXX: this is a hack to work-around the lack of a real pfs vfs 1293 * implementation 1294 */ 1295 if (mp) { 1296 if (strncmp(mp->mnt_stat.f_fstypename, "null", 5) == 0) 1297 vp->v_pfsmp = mp; 1298 } 1299 } else { 1300 /* 1301 * When creating a negative cache hit we set the 1302 * namecache_gen. A later resolve will clean out the 1303 * negative cache hit if the mount point's namecache_gen 1304 * has changed. Used by devfs, could also be used by 1305 * other remote FSs. 1306 */ 1307 ncp->nc_vp = NULL; 1308 ncp->nc_negcpu = mycpu->gd_cpuid; 1309 spin_lock(&pn->neg_spin); 1310 TAILQ_INSERT_TAIL(&pn->neg_list, ncp, nc_vnode); 1311 _cache_hold(ncp); /* neg_list assoc */ 1312 ++pn->neg_count; 1313 spin_unlock(&pn->neg_spin); 1314 atomic_add_long(&pn->vfscache_negs, 1); 1315 1316 ncp->nc_error = ENOENT; 1317 if (mp) 1318 VFS_NCPGEN_SET(mp, ncp); 1319 } 1320 1321 /* 1322 * Previously unresolved leaf is now resolved. 
 */
	if (TAILQ_EMPTY(&ncp->nc_list))
		atomic_add_long(&pn->vfscache_unres, -1);

	ncp->nc_flag &= ~(NCF_UNRESOLVED | NCF_DEFEREDZAP);
}

void
cache_setvp(struct nchandle *nch, struct vnode *vp)
{
	_cache_setvp(nch->mount, nch->ncp, vp);
}

/*
 * Used for NFS
 */
void
cache_settimeout(struct nchandle *nch, int nticks)
{
	struct namecache *ncp = nch->ncp;

	if ((ncp->nc_timeout = ticks + nticks) == 0)
		ncp->nc_timeout = 1;
}

/*
 * Disassociate the vnode or negative-cache association and mark a
 * namecache entry as unresolved again.  Note that the ncp is still
 * left in the hash table and still linked to its parent.
 *
 * The ncp should be locked and refd on entry and will remain locked and refd
 * on return.
 *
 * This routine is normally never called on a directory containing children.
 * However, NFS often does just that in its rename() code as a cop-out to
 * avoid complex namespace operations.  This disconnects a directory vnode
 * from its namecache and can cause the OLDAPI and NEWAPI to get out of
 * sync.
 */
static
void
_cache_setunresolved(struct namecache *ncp)
{
	struct vnode *vp;

	if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
		struct pcpu_ncache *pn;

		/*
		 * Is a resolved leaf now becoming unresolved?
		 */
		if (TAILQ_EMPTY(&ncp->nc_list)) {
			pn = &pcpu_ncache[mycpu->gd_cpuid];
			atomic_add_long(&pn->vfscache_unres, 1);
		}

		ncp->nc_flag |= NCF_UNRESOLVED;
		ncp->nc_timeout = 0;
		ncp->nc_error = ENOTCONN;
		if ((vp = ncp->nc_vp) != NULL) {
			spin_lock(&vp->v_spin);
			ncp->nc_vp = NULL;
			TAILQ_REMOVE(&vp->v_namecache, ncp, nc_vnode);
			--vp->v_namecache_count;
			spin_unlock(&vp->v_spin);

			/*
			 * Any vp associated with an ncp with children is
			 * held by that ncp.  Any vp associated with ncp
			 * is held by that ncp.  These conditions must be
			 * undone when the vp is cleared out from the ncp.
			 */
			if (!TAILQ_EMPTY(&ncp->nc_list))
				vdrop(vp);
			vdrop(vp);
		} else {
			pn = &pcpu_ncache[ncp->nc_negcpu];

			atomic_add_long(&pn->vfscache_negs, -1);
			spin_lock(&pn->neg_spin);
			TAILQ_REMOVE(&pn->neg_list, ncp, nc_vnode);
			--pn->neg_count;
			spin_unlock(&pn->neg_spin);
		}
		ncp->nc_flag &= ~(NCF_WHITEOUT|NCF_ISDIR|NCF_ISSYMLINK);
		_cache_drop(ncp);	/* from v_namecache or neg_list */
	}
}

/*
 * The cache_nresolve() code calls this function to automatically
 * set a resolved cache element to unresolved if it has timed out
 * or if it is a negative cache hit and the mount point's namecache_gen
 * has changed.
 */
static __inline int
_cache_auto_unresolve_test(struct mount *mp, struct namecache *ncp)
{
	/*
	 * Try to zap entries that have timed out.  We have
	 * to be careful here because locked leafs may depend
	 * on the vnode remaining intact in a parent, so only
	 * do this under very specific conditions.
	 */
	if (ncp->nc_timeout && (int)(ncp->nc_timeout - ticks) < 0 &&
	    TAILQ_EMPTY(&ncp->nc_list)) {
		return 1;
	}

	/*
	 * If a resolved negative cache hit is invalid due to
	 * the mount's namecache generation being bumped, zap it.
	 */
	if (ncp->nc_vp == NULL && VFS_NCPGEN_TEST(mp, ncp)) {
		return 1;
	}

	/*
	 * Otherwise we are good
	 */
	return 0;
}

static __inline void
_cache_auto_unresolve(struct mount *mp, struct namecache *ncp)
{
	/*
	 * Already in an unresolved state, nothing to do.
	 */
	if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
		if (_cache_auto_unresolve_test(mp, ncp))
			_cache_setunresolved(ncp);
	}
}

void
cache_setunresolved(struct nchandle *nch)
{
	_cache_setunresolved(nch->ncp);
}
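/*
 * Illustrative usage sketch (an assumption about callers, not code taken
 * from this file): a filesystem's nresolve method normally finishes with
 * either cache_setvp(ap->a_nch, vp) for a positive hit or
 * cache_setvp(ap->a_nch, NULL) to record a negative hit, and a caller
 * that needs to force a fresh lookup first calls cache_setunresolved()
 * on its locked nchandle.
 */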
/*
 * Determine if we can clear NCF_ISMOUNTPT by scanning the mountlist
 * looking for matches.  This flag tells the lookup code when it must
 * check for a mount linkage and also prevents the directories in question
 * from being deleted or renamed.
 */
static
int
cache_clrmountpt_callback(struct mount *mp, void *data)
{
	struct nchandle *nch = data;

	if (mp->mnt_ncmounton.ncp == nch->ncp)
		return(1);
	if (mp->mnt_ncmountpt.ncp == nch->ncp)
		return(1);
	return(0);
}

/*
 * Clear NCF_ISMOUNTPT on nch->ncp if it is no longer associated
 * with a mount point.
 */
void
cache_clrmountpt(struct nchandle *nch)
{
	int count;

	count = mountlist_scan(cache_clrmountpt_callback, nch,
			       MNTSCAN_FORWARD | MNTSCAN_NOBUSY |
			       MNTSCAN_NOUNLOCK);
	if (count == 0)
		nch->ncp->nc_flag &= ~NCF_ISMOUNTPT;
}

/*
 * Invalidate portions of the namecache topology given a starting entry.
 * The passed ncp is set to an unresolved state and:
 *
 * The passed ncp must be referenced and locked.  The routine may unlock
 * and relock ncp several times, and will recheck the children and loop
 * to catch races.  When done the passed ncp will be returned with the
 * reference and lock intact.
 *
 * CINV_DESTROY		- Set a flag in the passed ncp entry indicating
 *			  that the physical underlying nodes have been
 *			  destroyed... as in deleted.  For example, when
 *			  a directory is removed.  This will cause record
 *			  lookups on the name to no longer be able to find
 *			  the record and tells the resolver to return failure
 *			  rather than trying to resolve through the parent.
 *
 *			  The topology itself, including ncp->nc_name,
 *			  remains intact.
 *
 *			  This only applies to the passed ncp, if CINV_CHILDREN
 *			  is specified the children are not flagged.
 *
 * CINV_CHILDREN	- Set all children (recursively) to an unresolved
 *			  state as well.
 *
 *			  Note that this will also have the side effect of
 *			  cleaning out any unreferenced nodes in the topology
 *			  from the leaves up as the recursion backs out.
 *
 * Note that the topology for any referenced nodes remains intact, but
 * the nodes will be marked as having been destroyed and will be set
 * to an unresolved state.
 *
 * It is possible for cache_inval() to race a cache_resolve(), meaning that
 * the namecache entry may not actually be invalidated on return if it was
 * revalidated while recursing down into its children.  This code guarantees
 * that the node(s) will go through an invalidation cycle, but does not
 * guarantee that they will remain in an invalidated state.
 *
 * Returns non-zero if a revalidation was detected during the invalidation
 * recursion, zero otherwise.  Note that since only the original ncp is
 * locked the revalidation ultimately can only indicate that the original ncp
 * *MIGHT* not have been re-resolved.
 *
 * DEEP RECURSION HANDLING - If a recursive invalidation recurses deeply we
 * have to avoid blowing out the kernel stack.  We do this by saving the
 * deep namecache node and aborting the recursion, then re-recursing at that
 * node using a depth-first algorithm in order to allow multiple deep
 * recursions to chain through each other, then we restart the invalidation
 * from scratch.
 */

struct cinvtrack {
	struct namecache *resume_ncp;
	int depth;
};

static int _cache_inval_internal(struct namecache *, int, struct cinvtrack *);

static
int
_cache_inval(struct namecache *ncp, int flags)
{
	struct cinvtrack track;
	struct namecache *ncp2;
	int r;

	track.depth = 0;
	track.resume_ncp = NULL;

	for (;;) {
		r = _cache_inval_internal(ncp, flags, &track);
		if (track.resume_ncp == NULL)
			break;
		_cache_unlock(ncp);
		while ((ncp2 = track.resume_ncp) != NULL) {
			track.resume_ncp = NULL;
			_cache_lock(ncp2);
			_cache_inval_internal(ncp2, flags & ~CINV_DESTROY,
					      &track);
			/*_cache_put(ncp2);*/
			cache_zap(ncp2);
		}
		_cache_lock(ncp);
	}
	return(r);
}

int
cache_inval(struct nchandle *nch, int flags)
{
	return(_cache_inval(nch->ncp, flags));
}
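/*
 * Illustrative sketch (an assumed caller, not code from this file): after
 * a directory has been removed, the removing code typically holds the
 * directory's nchandle locked and calls
 *
 *	cache_inval(&nch, CINV_DESTROY | CINV_CHILDREN);
 *
 * which flags the node DESTROYED so lookups fail and recursively
 * unresolves anything cached underneath it, per the description above.
 */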
/*
 * Helper for _cache_inval().  The passed ncp is refd and locked and
 * remains that way on return, but may be unlocked/relocked multiple
 * times by the routine.
 */
static int
_cache_inval_internal(struct namecache *ncp, int flags, struct cinvtrack *track)
{
	struct namecache *nextkid;
	int rcnt = 0;

	KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE);

	_cache_setunresolved(ncp);
	if (flags & CINV_DESTROY) {
		ncp->nc_flag |= NCF_DESTROYED;
		++ncp->nc_generation;
	}

	while ((flags & CINV_CHILDREN) &&
	       (nextkid = TAILQ_FIRST(&ncp->nc_list)) != NULL
	) {
		struct namecache *kid;
		int restart;

		restart = 0;
		_cache_hold(nextkid);
		if (++track->depth > MAX_RECURSION_DEPTH) {
			track->resume_ncp = ncp;
			_cache_hold(ncp);
			++rcnt;
		}
		while ((kid = nextkid) != NULL) {
			/*
			 * Parent (ncp) must be locked for the iteration.
			 */
			nextkid = NULL;
			if (kid->nc_parent != ncp) {
				_cache_drop(kid);
				kprintf("cache_inval_internal restartA %s\n",
					ncp->nc_name);
				restart = 1;
				break;
			}
			if ((nextkid = TAILQ_NEXT(kid, nc_entry)) != NULL)
				_cache_hold(nextkid);

			/*
			 * Parent unlocked for this section to avoid
			 * deadlocks.  Then lock the kid and check for
			 * races.
			 */
			_cache_unlock(ncp);
			if (track->resume_ncp) {
				_cache_drop(kid);
				_cache_lock(ncp);
				break;
			}
			_cache_lock(kid);
			if (kid->nc_parent != ncp) {
				kprintf("cache_inval_internal "
					"restartB %s\n",
					ncp->nc_name);
				restart = 1;
				_cache_unlock(kid);
				_cache_drop(kid);
				_cache_lock(ncp);
				break;
			}
			if ((kid->nc_flag & NCF_UNRESOLVED) == 0 ||
			    TAILQ_FIRST(&kid->nc_list)
			) {

				rcnt += _cache_inval_internal(kid,
						flags & ~CINV_DESTROY, track);
				/*_cache_unlock(kid);*/
				/*_cache_drop(kid);*/
				cache_zap(kid);
			} else {
				cache_zap(kid);
			}

			/*
			 * Relock parent to continue scan
			 */
			_cache_lock(ncp);
		}
		if (nextkid)
			_cache_drop(nextkid);
		--track->depth;
		if (restart == 0)
			break;
	}

	/*
	 * Someone could have gotten in there while ncp was unlocked,
	 * retry if so.
	 */
	if ((ncp->nc_flag & NCF_UNRESOLVED) == 0)
		++rcnt;
	return (rcnt);
}
/*
 * Invalidate a vnode's namecache associations.  To avoid races against
 * the resolver we do not invalidate a node which we previously invalidated
 * but which was then re-resolved while we were in the invalidation loop.
 *
 * Returns non-zero if any namecache entries remain after the invalidation
 * loop completed.
 *
 * NOTE: Unlike the namecache topology which guarantees that ncp's will not
 *	 be ripped out of the topology while held, the vnode's v_namecache
 *	 list has no such restriction.  NCP's can be ripped out of the list
 *	 at virtually any time if not locked, even if held.
 *
 *	 In addition, the v_namecache list itself must be locked via
 *	 the vnode's spinlock.
 */
int
cache_inval_vp(struct vnode *vp, int flags)
{
	struct namecache *ncp;
	struct namecache *next;

restart:
	spin_lock(&vp->v_spin);
	ncp = TAILQ_FIRST(&vp->v_namecache);
	if (ncp)
		_cache_hold(ncp);
	while (ncp) {
		/* loop entered with ncp held and vp spin-locked */
		if ((next = TAILQ_NEXT(ncp, nc_vnode)) != NULL)
			_cache_hold(next);
		spin_unlock(&vp->v_spin);
		_cache_lock(ncp);
		if (ncp->nc_vp != vp) {
			kprintf("Warning: cache_inval_vp: race-A detected on "
				"%s\n", ncp->nc_name);
			_cache_put(ncp);
			if (next)
				_cache_drop(next);
			goto restart;
		}
		_cache_inval(ncp, flags);
		_cache_put(ncp);		/* also releases reference */
		ncp = next;
		spin_lock(&vp->v_spin);
		if (ncp && ncp->nc_vp != vp) {
			spin_unlock(&vp->v_spin);
			kprintf("Warning: cache_inval_vp: race-B detected on "
				"%s\n", ncp->nc_name);
			_cache_drop(ncp);
			goto restart;
		}
	}
	spin_unlock(&vp->v_spin);
	return(TAILQ_FIRST(&vp->v_namecache) != NULL);
}
/*
 * This routine is used instead of the normal cache_inval_vp() when we
 * are trying to recycle otherwise good vnodes.
 *
 * Return 0 on success, non-zero if not all namecache records could be
 * disassociated from the vnode (for various reasons).
 */
int
cache_inval_vp_nonblock(struct vnode *vp)
{
	struct namecache *ncp;
	struct namecache *next;

	spin_lock(&vp->v_spin);
	ncp = TAILQ_FIRST(&vp->v_namecache);
	if (ncp)
		_cache_hold(ncp);
	while (ncp) {
		/* loop entered with ncp held */
		if ((next = TAILQ_NEXT(ncp, nc_vnode)) != NULL)
			_cache_hold(next);
		spin_unlock(&vp->v_spin);
		if (_cache_lock_nonblock(ncp)) {
			_cache_drop(ncp);
			if (next)
				_cache_drop(next);
			goto done;
		}
		if (ncp->nc_vp != vp) {
			kprintf("Warning: cache_inval_vp: race-A detected on "
				"%s\n", ncp->nc_name);
			_cache_put(ncp);
			if (next)
				_cache_drop(next);
			goto done;
		}
		_cache_inval(ncp, 0);
		_cache_put(ncp);		/* also releases reference */
		ncp = next;
		spin_lock(&vp->v_spin);
		if (ncp && ncp->nc_vp != vp) {
			spin_unlock(&vp->v_spin);
			kprintf("Warning: cache_inval_vp: race-B detected on "
				"%s\n", ncp->nc_name);
			_cache_drop(ncp);
			goto done;
		}
	}
	spin_unlock(&vp->v_spin);
done:
	return(TAILQ_FIRST(&vp->v_namecache) != NULL);
}

/*
 * Attempt to quickly invalidate the vnode's namecache entry.  This function
 * will also dive the ncp and free its children but only if they are trivial.
 * All locks are non-blocking and the function will fail if required locks
 * cannot be obtained.
 *
 * We want this sort of function to be able to guarantee progress when vnlru
 * wants to recycle a vnode.  Directories could otherwise get stuck and not
 * be able to recycle due to destroyed or unresolved children in the
 * namecache.
 */
void
cache_inval_vp_quick(struct vnode *vp)
{
	struct namecache *ncp;
	struct namecache *kid;

	spin_lock(&vp->v_spin);
	while ((ncp = TAILQ_FIRST(&vp->v_namecache)) != NULL) {
		_cache_hold(ncp);
		spin_unlock(&vp->v_spin);
		if (_cache_lock_nonblock(ncp)) {
			_cache_drop(ncp);
			return;
		}

		/*
		 * Try to trivially destroy any children.
		 */
		while ((kid = TAILQ_FIRST(&ncp->nc_list)) != NULL) {
			struct nchash_head *nchpp;

			/*
			 * Early test without the lock
			 */
			if (TAILQ_FIRST(&kid->nc_list) ||
			    kid->nc_vp ||
			    kid->nc_refs != 1)
			{
				_cache_put(ncp);
				return;
			}

			_cache_hold(kid);
			if (_cache_lock_nonblock(kid)) {
				_cache_drop(kid);
				_cache_put(ncp);
				return;
			}

			/*
			 * A destruction/free test requires the parent,
			 * the child, and the hash table to be locked.
			 */
			nchpp = kid->nc_head;
			spin_lock(&nchpp->spin);

			/*
			 * Give up if the child isn't trivial.
			 */
			if (kid->nc_parent != ncp ||
			    kid->nc_vp ||
			    kid->nc_refs != 2 ||
			    TAILQ_FIRST(&kid->nc_list))
			{
				spin_unlock(&nchpp->spin);
				_cache_put(kid);
				_cache_put(ncp);
				return;
			}

			/*
			 * Kaboom (eats nchpp)
			 *
			 * Call eats nchpp spin-lock
			 */
			_cache_unlink_parent(ncp, kid, nchpp);

			/* _cache_unlock(kid) not required */
			kid->nc_refs = -1;	/* safety */
			if (kid->nc_name)
				kfree(kid->nc_name, M_VFSCACHEAUX);
			kfree_obj(kid, M_VFSCACHE);
		}

		/*
		 * Success, disassociate and release the ncp.  Do not
		 * try to zap it here.
1896 * 1897 * NOTE: Releasing the ncp here leaves it in the tree, 1898 * but since we have disassociated the vnode this 1899 * ncp entry becomes 'trivial' and successive calls 1900 * to cache_inval_vp_quick() will be able to continue 1901 * to make progress. 1902 */ 1903 _cache_setunresolved(ncp); 1904 _cache_put(ncp); 1905 spin_lock(&vp->v_spin); 1906 } 1907 spin_unlock(&vp->v_spin); 1908 } 1909 1910 /* 1911 * Clears the universal directory search 'ok' flag. This flag allows 1912 * nlookup() to bypass normal vnode checks. This flag is a cached flag 1913 * so clearing it simply forces revalidation. 1914 */ 1915 void 1916 cache_inval_wxok(struct vnode *vp) 1917 { 1918 struct namecache *ncp; 1919 1920 spin_lock(&vp->v_spin); 1921 TAILQ_FOREACH(ncp, &vp->v_namecache, nc_vnode) { 1922 if (ncp->nc_flag & (NCF_WXOK | NCF_NOTX)) 1923 atomic_clear_short(&ncp->nc_flag, NCF_WXOK | NCF_NOTX); 1924 } 1925 spin_unlock(&vp->v_spin); 1926 } 1927 1928 /* 1929 * The source ncp has been renamed to the target ncp. All elements have been 1930 * locked, including the parent ncp's. 1931 * 1932 * The target ncp is destroyed (as a normal rename-over would destroy the 1933 * target file or directory). 1934 * 1935 * Because there may be references to the source ncp we cannot copy its 1936 * contents to the target. Instead the source ncp is relinked as the target 1937 * and the target ncp is removed from the namecache topology. 1938 */ 1939 void 1940 cache_rename(struct nchandle *fnch, struct nchandle *tnch) 1941 { 1942 struct namecache *fncp = fnch->ncp; 1943 struct namecache *tncp = tnch->ncp; 1944 struct namecache *par; 1945 struct nchash_head *nchpp; 1946 u_int32_t hash; 1947 char *oname; 1948 char *nname; 1949 1950 ++fncp->nc_generation; 1951 ++tncp->nc_generation; 1952 if (tncp->nc_nlen) { 1953 nname = kmalloc(tncp->nc_nlen + 1, M_VFSCACHEAUX, M_WAITOK); 1954 bcopy(tncp->nc_name, nname, tncp->nc_nlen); 1955 nname[tncp->nc_nlen] = 0; 1956 } else { 1957 nname = NULL; 1958 } 1959 1960 /* 1961 * Rename fncp (unlink) 1962 */ 1963 if (fncp->nc_parent) { 1964 par = fncp->nc_parent; 1965 _cache_hold(par); 1966 _cache_lock(par); 1967 nchpp = fncp->nc_head; 1968 spin_lock(&nchpp->spin); 1969 _cache_unlink_parent(par, fncp, nchpp); /* eats nchpp */ 1970 _cache_put(par); 1971 } else { 1972 par = NULL; 1973 nchpp = NULL; 1974 } 1975 oname = fncp->nc_name; 1976 fncp->nc_name = nname; 1977 fncp->nc_nlen = tncp->nc_nlen; 1978 if (oname) 1979 kfree(oname, M_VFSCACHEAUX); 1980 1981 par = tncp->nc_parent; 1982 KKASSERT(par->nc_lock.lk_lockholder == curthread); 1983 1984 /* 1985 * Rename fncp (relink) 1986 */ 1987 hash = fnv_32_buf(fncp->nc_name, fncp->nc_nlen, FNV1_32_INIT); 1988 hash = fnv_32_buf(&par, sizeof(par), hash); 1989 nchpp = NCHHASH(hash); 1990 1991 spin_lock(&nchpp->spin); 1992 _cache_link_parent(fncp, par, nchpp); 1993 spin_unlock(&nchpp->spin); 1994 1995 /* 1996 * Get rid of the overwritten tncp (unlink) 1997 */ 1998 _cache_unlink(tncp); 1999 } 2000 2001 /* 2002 * Perform actions consistent with unlinking a file. The passed-in ncp 2003 * must be locked. 2004 * 2005 * The ncp is marked DESTROYED so it no longer shows up in searches, 2006 * and will be physically deleted when the vnode goes away. 2007 * 2008 * If the related vnode has no refs then we cycle it through vget()/vput() 2009 * to (possibly if we don't have a ref race) trigger a deactivation, 2010 * allowing the VFS to trivially detect and recycle the deleted vnode 2011 * via VOP_INACTIVE(). 
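 *
 * Illustrative usage (hedged): a filesystem's remove/rmdir VOP method
 * will typically call this on the passed nchandle once the backing
 * object has actually been removed, e.g.
 *
 *      cache_unlink(ap->a_nch);
 *
 * after which a lookup of the same name creates a fresh ncp under the
 * same parent.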
2012 * 2013 * NOTE: _cache_rename() will automatically call _cache_unlink() on the 2014 * target ncp. 2015 */ 2016 void 2017 cache_unlink(struct nchandle *nch) 2018 { 2019 _cache_unlink(nch->ncp); 2020 } 2021 2022 static void 2023 _cache_unlink(struct namecache *ncp) 2024 { 2025 struct vnode *vp; 2026 2027 /* 2028 * Causes lookups to fail and allows another ncp with the same 2029 * name to be created under ncp->nc_parent. 2030 */ 2031 ncp->nc_flag |= NCF_DESTROYED; 2032 ++ncp->nc_generation; 2033 2034 /* 2035 * Attempt to trigger a deactivation. Set VREF_FINALIZE to 2036 * force action on the 1->0 transition. Do not destroy the 2037 * vp association if a vp is present (leave the destroyed ncp 2038 * resolved through the vp finalization). 2039 * 2040 * Cleanup the refs in the resolved-not-found case by setting 2041 * the ncp to an unresolved state. This improves our ability 2042 * to get rid of dead ncp elements in other cache_*() routines. 2043 */ 2044 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 2045 vp = ncp->nc_vp; 2046 if (vp) { 2047 atomic_set_int(&vp->v_refcnt, VREF_FINALIZE); 2048 if (VREFCNT(vp) <= 0) { 2049 if (vget(vp, LK_SHARED) == 0) 2050 vput(vp); 2051 } 2052 } else { 2053 _cache_setunresolved(ncp); 2054 } 2055 } 2056 } 2057 2058 /* 2059 * Return non-zero if the nch might be associated with an open and/or mmap()'d 2060 * file. The easy solution is to just return non-zero if the vnode has refs. 2061 * Used to interlock hammer2 reclaims (VREF_FINALIZE should already be set to 2062 * force the reclaim). 2063 */ 2064 int 2065 cache_isopen(struct nchandle *nch) 2066 { 2067 struct vnode *vp; 2068 struct namecache *ncp = nch->ncp; 2069 2070 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0 && 2071 (vp = ncp->nc_vp) != NULL && 2072 VREFCNT(vp)) { 2073 return 1; 2074 } 2075 return 0; 2076 } 2077 2078 2079 /* 2080 * vget the vnode associated with the namecache entry. Resolve the namecache 2081 * entry if necessary. The passed ncp must be referenced and locked. If 2082 * the ncp is resolved it might be locked shared. 2083 * 2084 * lk_type may be LK_SHARED, LK_EXCLUSIVE. A ref'd, possibly locked 2085 * (depending on the passed lk_type) will be returned in *vpp with an error 2086 * of 0, or NULL will be returned in *vpp with a non-0 error code. The 2087 * most typical error is ENOENT, meaning that the ncp represents a negative 2088 * cache hit and there is no vnode to retrieve, but other errors can occur 2089 * too. 2090 * 2091 * The vget() can race a reclaim. If this occurs we re-resolve the 2092 * namecache entry. 2093 * 2094 * There are numerous places in the kernel where vget() is called on a 2095 * vnode while one or more of its namecache entries is locked. Releasing 2096 * a vnode never deadlocks against locked namecache entries (the vnode 2097 * will not get recycled while referenced ncp's exist). This means we 2098 * can safely acquire the vnode. In fact, we MUST NOT release the ncp 2099 * lock when acquiring the vp lock or we might cause a deadlock. 2100 * 2101 * NOTE: The passed-in ncp must be locked exclusively if it is initially 2102 * unresolved. If a reclaim race occurs the passed-in ncp will be 2103 * relocked exclusively before being re-resolved. 
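 *
 * Illustrative call pattern (a sketch only; par_nch, nlc and cred are
 * assumed to have been set up by the caller):
 *
 *      struct nchandle nch;
 *      struct vnode *vp;
 *      int error;
 *
 *      nch = cache_nlookup(&par_nch, &nlc);
 *      error = cache_vget(&nch, cred, LK_SHARED, &vp);
 *      if (error == 0)
 *              vput(vp);
 *      cache_put(&nch);
 *
 * ENOENT from cache_vget() usually just indicates a negative cache hit.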
2104 */ 2105 int 2106 cache_vget(struct nchandle *nch, struct ucred *cred, 2107 int lk_type, struct vnode **vpp) 2108 { 2109 struct namecache *ncp; 2110 struct vnode *vp; 2111 int error; 2112 2113 ncp = nch->ncp; 2114 again: 2115 vp = NULL; 2116 if (ncp->nc_flag & NCF_UNRESOLVED) 2117 error = cache_resolve(nch, cred); 2118 else 2119 error = 0; 2120 2121 if (error == 0 && (vp = ncp->nc_vp) != NULL) { 2122 error = vget(vp, lk_type); 2123 if (error) { 2124 /* 2125 * VRECLAIM race 2126 * 2127 * The ncp may have been locked shared, we must relock 2128 * it exclusively before we can set it to unresolved. 2129 */ 2130 if (error == ENOENT) { 2131 kprintf("Warning: vnode reclaim race detected " 2132 "in cache_vget on %p (%s)\n", 2133 vp, ncp->nc_name); 2134 _cache_unlock(ncp); 2135 _cache_lock(ncp); 2136 _cache_setunresolved(ncp); 2137 goto again; 2138 } 2139 2140 /* 2141 * Not a reclaim race, some other error. 2142 */ 2143 KKASSERT(ncp->nc_vp == vp); 2144 vp = NULL; 2145 } else { 2146 KKASSERT(ncp->nc_vp == vp); 2147 KKASSERT((vp->v_flag & VRECLAIMED) == 0); 2148 } 2149 } 2150 if (error == 0 && vp == NULL) 2151 error = ENOENT; 2152 *vpp = vp; 2153 return(error); 2154 } 2155 2156 /* 2157 * Similar to cache_vget() but only acquires a ref on the vnode. The vnode 2158 * is already held by virtuue of the ncp being locked, but it might not be 2159 * referenced and while it is not referenced it can transition into the 2160 * VRECLAIMED state. 2161 * 2162 * NOTE: The passed-in ncp must be locked exclusively if it is initially 2163 * unresolved. If a reclaim race occurs the passed-in ncp will be 2164 * relocked exclusively before being re-resolved. 2165 * 2166 * NOTE: At the moment we have to issue a vget() on the vnode, even though 2167 * we are going to immediately release the lock, in order to resolve 2168 * potential reclamation races. Once we have a solid vnode ref that 2169 * was (at some point) interlocked via a vget(), the vnode will not 2170 * be reclaimed. 2171 * 2172 * NOTE: vhold counts (v_auxrefs) do not prevent reclamation. 2173 */ 2174 int 2175 cache_vref(struct nchandle *nch, struct ucred *cred, struct vnode **vpp) 2176 { 2177 struct namecache *ncp; 2178 struct vnode *vp; 2179 int error; 2180 int v; 2181 2182 ncp = nch->ncp; 2183 again: 2184 vp = NULL; 2185 if (ncp->nc_flag & NCF_UNRESOLVED) 2186 error = cache_resolve(nch, cred); 2187 else 2188 error = 0; 2189 2190 while (error == 0 && (vp = ncp->nc_vp) != NULL) { 2191 /* 2192 * Try a lockless ref of the vnode. VRECLAIMED transitions 2193 * use the vx_lock state and update-counter mechanism so we 2194 * can detect if one is in-progress or occurred. 2195 * 2196 * If we can successfully ref the vnode and interlock against 2197 * the update-counter mechanism, and VRECLAIMED is found to 2198 * not be set after that, we should be good. 
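 *
 * That is, the code below (1) snapshots the vnode's update counter,
 * (2) takes a ref with vref_special(), then (3) re-checks the counter
 * and VRECLAIMED.  A counter change drops the ref and retries, while
 * an in-progress update or a reclaimed vnode falls back to the locked
 * vget() path below.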
2199 */ 2200 v = spin_access_start_only(&vp->v_spin); 2201 if (__predict_true(spin_access_check_inprog(v) == 0)) { 2202 vref_special(vp); 2203 if (__predict_false( 2204 spin_access_end_only(&vp->v_spin, v))) { 2205 vrele(vp); 2206 continue; 2207 } 2208 if (__predict_true((vp->v_flag & VRECLAIMED) == 0)) { 2209 break; 2210 } 2211 vrele(vp); 2212 kprintf("CACHE_VREF: IN-RECLAIM\n"); 2213 } 2214 2215 /* 2216 * Do it the slow way 2217 */ 2218 error = vget(vp, LK_SHARED); 2219 if (error) { 2220 /* 2221 * VRECLAIM race 2222 */ 2223 if (error == ENOENT) { 2224 kprintf("Warning: vnode reclaim race detected " 2225 "in cache_vget on %p (%s)\n", 2226 vp, ncp->nc_name); 2227 _cache_unlock(ncp); 2228 _cache_lock(ncp); 2229 _cache_setunresolved(ncp); 2230 goto again; 2231 } 2232 2233 /* 2234 * Not a reclaim race, some other error. 2235 */ 2236 KKASSERT(ncp->nc_vp == vp); 2237 vp = NULL; 2238 } else { 2239 KKASSERT(ncp->nc_vp == vp); 2240 KKASSERT((vp->v_flag & VRECLAIMED) == 0); 2241 /* caller does not want a lock */ 2242 vn_unlock(vp); 2243 } 2244 break; 2245 } 2246 if (error == 0 && vp == NULL) 2247 error = ENOENT; 2248 *vpp = vp; 2249 2250 return(error); 2251 } 2252 2253 /* 2254 * Return a referenced vnode representing the parent directory of 2255 * ncp. 2256 * 2257 * Because the caller has locked the ncp it should not be possible for 2258 * the parent ncp to go away. However, the parent can unresolve its 2259 * dvp at any time so we must be able to acquire a lock on the parent 2260 * to safely access nc_vp. 2261 * 2262 * We have to leave par unlocked when vget()ing dvp to avoid a deadlock, 2263 * so use vhold()/vdrop() while holding the lock to prevent dvp from 2264 * getting destroyed. 2265 * 2266 * NOTE: vhold() is allowed when dvp has 0 refs if we hold a 2267 * lock on the ncp in question.. 2268 */ 2269 struct vnode * 2270 cache_dvpref(struct namecache *ncp) 2271 { 2272 struct namecache *par; 2273 struct vnode *dvp; 2274 2275 dvp = NULL; 2276 if ((par = ncp->nc_parent) != NULL) { 2277 _cache_hold(par); 2278 _cache_lock(par); 2279 if ((par->nc_flag & NCF_UNRESOLVED) == 0) { 2280 if ((dvp = par->nc_vp) != NULL) 2281 vhold(dvp); 2282 } 2283 _cache_unlock(par); 2284 if (dvp) { 2285 if (vget(dvp, LK_SHARED) == 0) { 2286 vn_unlock(dvp); 2287 vdrop(dvp); 2288 /* return refd, unlocked dvp */ 2289 } else { 2290 vdrop(dvp); 2291 dvp = NULL; 2292 } 2293 } 2294 _cache_drop(par); 2295 } 2296 return(dvp); 2297 } 2298 2299 /* 2300 * Convert a directory vnode to a namecache record without any other 2301 * knowledge of the topology. This ONLY works with directory vnodes and 2302 * is ONLY used by the NFS server. dvp must be refd but unlocked, and the 2303 * returned ncp (if not NULL) will be held and unlocked. 2304 * 2305 * If 'makeit' is 0 and dvp has no existing namecache record, NULL is returned. 2306 * If 'makeit' is 1 we attempt to track-down and create the namecache topology 2307 * for dvp. This will fail only if the directory has been deleted out from 2308 * under the caller. 2309 * 2310 * Callers must always check for a NULL return no matter the value of 'makeit'. 2311 * 2312 * To avoid underflowing the kernel stack each recursive call increments 2313 * the makeit variable. 
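 *
 * Illustrative sketch of the intended NFS-server style usage (error
 * handling elided):
 *
 *      struct nchandle nch;
 *
 *      if (cache_fromdvp(dvp, cred, 1, &nch) == 0) {
 *              ... use nch.ncp, which is held but not locked ...
 *              cache_drop(&nch);
 *      }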
2314 */ 2315 2316 static int cache_inefficient_scan(struct nchandle *nch, struct ucred *cred, 2317 struct vnode *dvp, char *fakename); 2318 static int cache_fromdvp_try(struct vnode *dvp, struct ucred *cred, 2319 struct vnode **saved_dvp); 2320 2321 int 2322 cache_fromdvp(struct vnode *dvp, struct ucred *cred, int makeit, 2323 struct nchandle *nch) 2324 { 2325 struct vnode *saved_dvp; 2326 struct vnode *pvp; 2327 char *fakename; 2328 int error; 2329 2330 nch->ncp = NULL; 2331 nch->mount = dvp->v_mount; 2332 saved_dvp = NULL; 2333 fakename = NULL; 2334 2335 /* 2336 * Handle the makeit == 0 degenerate case 2337 */ 2338 if (makeit == 0) { 2339 spin_lock_shared(&dvp->v_spin); 2340 nch->ncp = TAILQ_FIRST(&dvp->v_namecache); 2341 if (nch->ncp) 2342 cache_hold(nch); 2343 spin_unlock_shared(&dvp->v_spin); 2344 } 2345 2346 /* 2347 * Loop until resolution, inside code will break out on error. 2348 */ 2349 while (makeit) { 2350 /* 2351 * Break out if we successfully acquire a working ncp. 2352 */ 2353 spin_lock_shared(&dvp->v_spin); 2354 nch->ncp = TAILQ_FIRST(&dvp->v_namecache); 2355 if (nch->ncp) { 2356 cache_hold(nch); 2357 spin_unlock_shared(&dvp->v_spin); 2358 break; 2359 } 2360 spin_unlock_shared(&dvp->v_spin); 2361 2362 /* 2363 * If dvp is the root of its filesystem it should already 2364 * have a namecache pointer associated with it as a side 2365 * effect of the mount, but it may have been disassociated. 2366 */ 2367 if (dvp->v_flag & VROOT) { 2368 nch->ncp = _cache_get(nch->mount->mnt_ncmountpt.ncp); 2369 error = cache_resolve_mp(nch->mount); 2370 _cache_put(nch->ncp); 2371 if (ncvp_debug) { 2372 kprintf("cache_fromdvp: resolve root of mount %p error %d", 2373 dvp->v_mount, error); 2374 } 2375 if (error) { 2376 if (ncvp_debug) 2377 kprintf(" failed\n"); 2378 nch->ncp = NULL; 2379 break; 2380 } 2381 if (ncvp_debug) 2382 kprintf(" succeeded\n"); 2383 continue; 2384 } 2385 2386 /* 2387 * If we are recursed too deeply resort to an O(n^2) 2388 * algorithm to resolve the namecache topology. The 2389 * resolved pvp is left referenced in saved_dvp to 2390 * prevent the tree from being destroyed while we loop. 2391 */ 2392 if (makeit > 20) { 2393 error = cache_fromdvp_try(dvp, cred, &saved_dvp); 2394 if (error) { 2395 kprintf("lookupdotdot(longpath) failed %d " 2396 "dvp %p\n", error, dvp); 2397 nch->ncp = NULL; 2398 break; 2399 } 2400 continue; 2401 } 2402 2403 /* 2404 * Get the parent directory and resolve its ncp. 2405 */ 2406 if (fakename) { 2407 kfree(fakename, M_TEMP); 2408 fakename = NULL; 2409 } 2410 error = vop_nlookupdotdot(*dvp->v_ops, dvp, &pvp, cred, 2411 &fakename); 2412 if (error) { 2413 kprintf("lookupdotdot failed %d dvp %p\n", error, dvp); 2414 break; 2415 } 2416 vn_unlock(pvp); 2417 2418 /* 2419 * Reuse makeit as a recursion depth counter. On success 2420 * nch will be fully referenced. 2421 */ 2422 cache_fromdvp(pvp, cred, makeit + 1, nch); 2423 vrele(pvp); 2424 if (nch->ncp == NULL) 2425 break; 2426 2427 /* 2428 * Do an inefficient scan of pvp (embodied by ncp) to look 2429 * for dvp. This will create a namecache record for dvp on 2430 * success. We loop up to recheck on success. 2431 * 2432 * ncp and dvp are both held but not locked. 
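 *
 * If vop_nlookupdotdot() returned a fakename it is passed along so
 * the scan can synthesize the entry via cache_nlookup() instead of
 * reading the directory.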
2433 */ 2434 error = cache_inefficient_scan(nch, cred, dvp, fakename); 2435 if (error) { 2436 kprintf("cache_fromdvp: scan %p (%s) failed on dvp=%p\n", 2437 pvp, nch->ncp->nc_name, dvp); 2438 cache_drop(nch); 2439 /* nch was NULLed out, reload mount */ 2440 nch->mount = dvp->v_mount; 2441 break; 2442 } 2443 if (ncvp_debug) { 2444 kprintf("cache_fromdvp: scan %p (%s) succeeded\n", 2445 pvp, nch->ncp->nc_name); 2446 } 2447 cache_drop(nch); 2448 /* nch was NULLed out, reload mount */ 2449 nch->mount = dvp->v_mount; 2450 } 2451 2452 /* 2453 * If nch->ncp is non-NULL it will have been held already. 2454 */ 2455 if (fakename) 2456 kfree(fakename, M_TEMP); 2457 if (saved_dvp) 2458 vrele(saved_dvp); 2459 if (nch->ncp) 2460 return (0); 2461 return (EINVAL); 2462 } 2463 2464 /* 2465 * Go up the chain of parent directories until we find something 2466 * we can resolve into the namecache. This is very inefficient. 2467 */ 2468 static 2469 int 2470 cache_fromdvp_try(struct vnode *dvp, struct ucred *cred, 2471 struct vnode **saved_dvp) 2472 { 2473 struct nchandle nch; 2474 struct vnode *pvp; 2475 int error; 2476 static time_t last_fromdvp_report; 2477 char *fakename; 2478 2479 /* 2480 * Loop getting the parent directory vnode until we get something we 2481 * can resolve in the namecache. 2482 */ 2483 vref(dvp); 2484 nch.mount = dvp->v_mount; 2485 nch.ncp = NULL; 2486 fakename = NULL; 2487 2488 for (;;) { 2489 if (fakename) { 2490 kfree(fakename, M_TEMP); 2491 fakename = NULL; 2492 } 2493 error = vop_nlookupdotdot(*dvp->v_ops, dvp, &pvp, cred, 2494 &fakename); 2495 if (error) { 2496 vrele(dvp); 2497 break; 2498 } 2499 vn_unlock(pvp); 2500 spin_lock_shared(&pvp->v_spin); 2501 if ((nch.ncp = TAILQ_FIRST(&pvp->v_namecache)) != NULL) { 2502 _cache_hold(nch.ncp); 2503 spin_unlock_shared(&pvp->v_spin); 2504 vrele(pvp); 2505 break; 2506 } 2507 spin_unlock_shared(&pvp->v_spin); 2508 if (pvp->v_flag & VROOT) { 2509 nch.ncp = _cache_get(pvp->v_mount->mnt_ncmountpt.ncp); 2510 error = cache_resolve_mp(nch.mount); 2511 _cache_unlock(nch.ncp); 2512 vrele(pvp); 2513 if (error) { 2514 _cache_drop(nch.ncp); 2515 nch.ncp = NULL; 2516 vrele(dvp); 2517 } 2518 break; 2519 } 2520 vrele(dvp); 2521 dvp = pvp; 2522 } 2523 if (error == 0) { 2524 if (last_fromdvp_report != time_uptime) { 2525 last_fromdvp_report = time_uptime; 2526 kprintf("Warning: extremely inefficient path " 2527 "resolution on %s\n", 2528 nch.ncp->nc_name); 2529 } 2530 error = cache_inefficient_scan(&nch, cred, dvp, fakename); 2531 2532 /* 2533 * Hopefully dvp now has a namecache record associated with 2534 * it. Leave it referenced to prevent the kernel from 2535 * recycling the vnode. Otherwise extremely long directory 2536 * paths could result in endless recycling. 2537 */ 2538 if (*saved_dvp) 2539 vrele(*saved_dvp); 2540 *saved_dvp = dvp; 2541 _cache_drop(nch.ncp); 2542 } 2543 if (fakename) 2544 kfree(fakename, M_TEMP); 2545 return (error); 2546 } 2547 2548 /* 2549 * Do an inefficient scan of the directory represented by ncp looking for 2550 * the directory vnode dvp. ncp must be held but not locked on entry and 2551 * will be held on return. dvp must be refd but not locked on entry and 2552 * will remain refd on return. 2553 * 2554 * Why do this at all? Well, due to its stateless nature the NFS server 2555 * converts file handles directly to vnodes without necessarily going through 2556 * the namecache ops that would otherwise create the namecache topology 2557 * leading to the vnode. 
We could either (1) Change the namecache algorithms 2558 * to allow disconnect namecache records that are re-merged opportunistically, 2559 * or (2) Make the NFS server backtrack and scan to recover a connected 2560 * namecache topology in order to then be able to issue new API lookups. 2561 * 2562 * It turns out that (1) is a huge mess. It takes a nice clean set of 2563 * namecache algorithms and introduces a lot of complication in every subsystem 2564 * that calls into the namecache to deal with the re-merge case, especially 2565 * since we are using the namecache to placehold negative lookups and the 2566 * vnode might not be immediately assigned. (2) is certainly far less 2567 * efficient then (1), but since we are only talking about directories here 2568 * (which are likely to remain cached), the case does not actually run all 2569 * that often and has the supreme advantage of not polluting the namecache 2570 * algorithms. 2571 * 2572 * If a fakename is supplied just construct a namecache entry using the 2573 * fake name. 2574 */ 2575 static int 2576 cache_inefficient_scan(struct nchandle *nch, struct ucred *cred, 2577 struct vnode *dvp, char *fakename) 2578 { 2579 struct nlcomponent nlc; 2580 struct nchandle rncp; 2581 struct dirent *den; 2582 struct vnode *pvp; 2583 struct vattr vat; 2584 struct iovec iov; 2585 struct uio uio; 2586 int blksize; 2587 int eofflag; 2588 int bytes; 2589 char *rbuf; 2590 int error; 2591 2592 vat.va_blocksize = 0; 2593 if ((error = VOP_GETATTR(dvp, &vat)) != 0) 2594 return (error); 2595 cache_lock(nch); 2596 error = cache_vref(nch, cred, &pvp); 2597 cache_unlock(nch); 2598 if (error) 2599 return (error); 2600 if (ncvp_debug) { 2601 kprintf("inefficient_scan of (%p,%s): directory iosize %ld " 2602 "vattr fileid = %lld\n", 2603 nch->ncp, nch->ncp->nc_name, 2604 vat.va_blocksize, 2605 (long long)vat.va_fileid); 2606 } 2607 2608 /* 2609 * Use the supplied fakename if not NULL. Fake names are typically 2610 * not in the actual filesystem hierarchy. This is used by HAMMER 2611 * to glue @@timestamp recursions together. 
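 * In the fakename case we skip the VOP_READDIR() scan entirely and
 * simply do a cache_nlookup() of the fake component under nch.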
2612 */ 2613 if (fakename) { 2614 nlc.nlc_nameptr = fakename; 2615 nlc.nlc_namelen = strlen(fakename); 2616 rncp = cache_nlookup(nch, &nlc); 2617 goto done; 2618 } 2619 2620 if ((blksize = vat.va_blocksize) == 0) 2621 blksize = DEV_BSIZE; 2622 rbuf = kmalloc(blksize, M_TEMP, M_WAITOK); 2623 rncp.ncp = NULL; 2624 2625 eofflag = 0; 2626 uio.uio_offset = 0; 2627 again: 2628 iov.iov_base = rbuf; 2629 iov.iov_len = blksize; 2630 uio.uio_iov = &iov; 2631 uio.uio_iovcnt = 1; 2632 uio.uio_resid = blksize; 2633 uio.uio_segflg = UIO_SYSSPACE; 2634 uio.uio_rw = UIO_READ; 2635 uio.uio_td = curthread; 2636 2637 if (ncvp_debug >= 2) 2638 kprintf("cache_inefficient_scan: readdir @ %08x\n", (int)uio.uio_offset); 2639 error = VOP_READDIR(pvp, &uio, cred, &eofflag, NULL, NULL); 2640 if (error == 0) { 2641 den = (struct dirent *)rbuf; 2642 bytes = blksize - uio.uio_resid; 2643 2644 while (bytes > 0) { 2645 if (ncvp_debug >= 2) { 2646 kprintf("cache_inefficient_scan: %*.*s\n", 2647 den->d_namlen, den->d_namlen, 2648 den->d_name); 2649 } 2650 if (den->d_type != DT_WHT && 2651 den->d_ino == vat.va_fileid) { 2652 if (ncvp_debug) { 2653 kprintf("cache_inefficient_scan: " 2654 "MATCHED inode %lld path %s/%*.*s\n", 2655 (long long)vat.va_fileid, 2656 nch->ncp->nc_name, 2657 den->d_namlen, den->d_namlen, 2658 den->d_name); 2659 } 2660 nlc.nlc_nameptr = den->d_name; 2661 nlc.nlc_namelen = den->d_namlen; 2662 rncp = cache_nlookup(nch, &nlc); 2663 KKASSERT(rncp.ncp != NULL); 2664 break; 2665 } 2666 bytes -= _DIRENT_DIRSIZ(den); 2667 den = _DIRENT_NEXT(den); 2668 } 2669 if (rncp.ncp == NULL && eofflag == 0 && uio.uio_resid != blksize) 2670 goto again; 2671 } 2672 kfree(rbuf, M_TEMP); 2673 done: 2674 vrele(pvp); 2675 if (rncp.ncp) { 2676 if (rncp.ncp->nc_flag & NCF_UNRESOLVED) { 2677 _cache_setvp(rncp.mount, rncp.ncp, dvp); 2678 if (ncvp_debug >= 2) { 2679 kprintf("cache_inefficient_scan: setvp %s/%s = %p\n", 2680 nch->ncp->nc_name, rncp.ncp->nc_name, dvp); 2681 } 2682 } else { 2683 if (ncvp_debug >= 2) { 2684 kprintf("cache_inefficient_scan: setvp %s/%s already set %p/%p\n", 2685 nch->ncp->nc_name, rncp.ncp->nc_name, dvp, 2686 rncp.ncp->nc_vp); 2687 } 2688 } 2689 if (rncp.ncp->nc_vp == NULL) 2690 error = rncp.ncp->nc_error; 2691 /* 2692 * Release rncp after a successful nlookup. rncp was fully 2693 * referenced. 2694 */ 2695 cache_put(&rncp); 2696 } else { 2697 kprintf("cache_inefficient_scan: dvp %p NOT FOUND in %s\n", 2698 dvp, nch->ncp->nc_name); 2699 error = ENOENT; 2700 } 2701 return (error); 2702 } 2703 2704 /* 2705 * This function must be called with the ncp held and locked and will unlock 2706 * and drop it during zapping. 2707 * 2708 * Zap a namecache entry. The ncp is unconditionally set to an unresolved 2709 * state, which disassociates it from its vnode or pcpu_ncache[n].neg_list 2710 * and removes the related reference. If the ncp can be removed, and the 2711 * parent can be zapped non-blocking, this function loops up. 2712 * 2713 * There will be one ref from the caller (which we now own). The only 2714 * remaining autonomous refs to the ncp will then be due to nc_parent->nc_list, 2715 * so possibly 2 refs left. Taking this into account, if there are no 2716 * additional refs and no children, the ncp will be removed from the topology 2717 * and destroyed. 2718 * 2719 * References and/or children may exist if the ncp is in the middle of the 2720 * topology, preventing the ncp from being destroyed. 2721 * 2722 * If nonblock is non-zero and the parent ncp cannot be locked we give up. 
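 * The entry is then flagged NCF_DEFEREDZAP and cleaned up later via
 * cache_hysteresis() -> _cache_cleandefered().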
2723 * 2724 * This function may return a held (but NOT locked) parent node which the 2725 * caller must drop in a loop. Looping is one way to avoid unbounded recursion 2726 * due to deep namecache trees. 2727 * 2728 * WARNING! For MPSAFE operation this routine must acquire up to three 2729 * spin locks to be able to safely test nc_refs. Lock order is 2730 * very important. 2731 * 2732 * hash spinlock if on hash list 2733 * parent spinlock if child of parent 2734 * (the ncp is unresolved so there is no vnode association) 2735 */ 2736 static int 2737 cache_zap(struct namecache *ncp) 2738 { 2739 struct namecache *par; 2740 struct nchash_head *nchpp; 2741 int refcmp; 2742 int nonblock = 1; /* XXX cleanup */ 2743 int res = 0; 2744 2745 again: 2746 /* 2747 * Disassociate the vnode or negative cache ref and set NCF_UNRESOLVED. 2748 * This gets rid of any vp->v_namecache list or negative list and 2749 * the related ref. 2750 */ 2751 _cache_setunresolved(ncp); 2752 2753 /* 2754 * Try to scrap the entry and possibly tail-recurse on its parent. 2755 * We only scrap unref'd (other then our ref) unresolved entries, 2756 * we do not scrap 'live' entries. 2757 * 2758 * If nc_parent is non NULL we expect 2 references, else just 1. 2759 * If there are more, someone else also holds the ncp and we cannot 2760 * destroy it. 2761 */ 2762 KKASSERT(ncp->nc_flag & NCF_UNRESOLVED); 2763 KKASSERT(ncp->nc_refs > 0); 2764 2765 /* 2766 * If the ncp is linked to its parent it will also be in the hash 2767 * table. We have to be able to lock the parent and the hash table. 2768 * 2769 * Acquire locks. Note that the parent can't go away while we hold 2770 * a child locked. If nc_parent is present, expect 2 refs instead 2771 * of 1. 2772 */ 2773 nchpp = NULL; 2774 if ((par = ncp->nc_parent) != NULL) { 2775 if (nonblock) { 2776 if (_cache_lock_nonblock(par)) { 2777 /* lock failed */ 2778 ncp->nc_flag |= NCF_DEFEREDZAP; 2779 atomic_add_long( 2780 &pcpu_ncache[mycpu->gd_cpuid].numdefered, 2781 1); 2782 _cache_unlock(ncp); 2783 _cache_drop(ncp); /* caller's ref */ 2784 return res; 2785 } 2786 _cache_hold(par); 2787 } else { 2788 _cache_hold(par); 2789 _cache_lock(par); 2790 } 2791 nchpp = ncp->nc_head; 2792 spin_lock(&nchpp->spin); 2793 } 2794 2795 /* 2796 * With the parent and nchpp locked, and the vnode removed 2797 * (no vp->v_namecache), we expect 1 or 2 refs. If there are 2798 * more someone else has a ref and we cannot zap the entry. 2799 * 2800 * one for our hold 2801 * one for our parent link (parent also has one from the linkage) 2802 */ 2803 if (par) 2804 refcmp = 2; 2805 else 2806 refcmp = 1; 2807 2808 /* 2809 * On failure undo the work we've done so far and drop the 2810 * caller's ref and ncp. 2811 */ 2812 if (ncp->nc_refs != refcmp || TAILQ_FIRST(&ncp->nc_list)) { 2813 if (par) { 2814 spin_unlock(&nchpp->spin); 2815 _cache_put(par); 2816 } 2817 _cache_unlock(ncp); 2818 _cache_drop(ncp); 2819 return res; 2820 } 2821 2822 /* 2823 * We own all the refs and with the spinlocks held no further 2824 * refs can be acquired by others. 2825 * 2826 * Remove us from the hash list and parent list. We have to 2827 * drop a ref on the parent's vp if the parent's list becomes 2828 * empty. 2829 */ 2830 if (par) { 2831 KKASSERT(nchpp == ncp->nc_head); 2832 _cache_unlink_parent(par, ncp, nchpp); /* eats nhcpp */ 2833 /*_cache_unlock(par);*/ 2834 /* &nchpp->spin is unlocked by call */ 2835 } else { 2836 KKASSERT(ncp->nc_head == NULL); 2837 } 2838 2839 /* 2840 * ncp should not have picked up any refs. 
Physically 2841 * destroy the ncp. 2842 */ 2843 if (ncp->nc_refs != refcmp) { 2844 panic("cache_zap: %p bad refs %d (expected %d)\n", 2845 ncp, ncp->nc_refs, refcmp); 2846 } 2847 /* _cache_unlock(ncp) not required */ 2848 ncp->nc_refs = -1; /* safety */ 2849 if (ncp->nc_name) 2850 kfree(ncp->nc_name, M_VFSCACHEAUX); 2851 kfree_obj(ncp, M_VFSCACHE); 2852 res = 1; 2853 2854 /* 2855 * Loop up if we can recursively clean out the parent. 2856 */ 2857 if (par) { 2858 refcmp = 1; /* ref on parent */ 2859 if (par->nc_parent) /* par->par */ 2860 ++refcmp; 2861 par->nc_flag &= ~NCF_DEFEREDZAP; 2862 if ((par->nc_flag & NCF_UNRESOLVED) && 2863 par->nc_refs == refcmp && 2864 TAILQ_EMPTY(&par->nc_list)) 2865 { 2866 ncp = par; 2867 goto again; 2868 } 2869 _cache_unlock(par); 2870 _cache_drop(par); 2871 } 2872 return 1; 2873 } 2874 2875 /* 2876 * Clean up dangling negative cache and defered-drop entries in the 2877 * namecache. 2878 * 2879 * This routine is called in the critical path and also called from 2880 * vnlru(). When called from vnlru we use a lower limit to try to 2881 * deal with the negative cache before the critical path has to start 2882 * dealing with it. 2883 */ 2884 typedef enum { CHI_LOW, CHI_HIGH } cache_hs_t; 2885 2886 static cache_hs_t neg_cache_hysteresis_state[2] = { CHI_LOW, CHI_LOW }; 2887 static cache_hs_t pos_cache_hysteresis_state[2] = { CHI_LOW, CHI_LOW }; 2888 static cache_hs_t exc_cache_hysteresis_state[2] = { CHI_LOW, CHI_LOW }; 2889 2890 void 2891 cache_hysteresis(int critpath) 2892 { 2893 long poslimit; 2894 long exclimit; 2895 long neglimit = maxvnodes / ncnegfactor; 2896 long xnumunres; 2897 long xnumleafs; 2898 2899 if (critpath == 0) 2900 neglimit = neglimit * 8 / 10; 2901 2902 /* 2903 * Don't cache too many negative hits. We use hysteresis to reduce 2904 * the impact on the critical path. 2905 */ 2906 switch(neg_cache_hysteresis_state[critpath]) { 2907 case CHI_LOW: 2908 if (vfscache_negs > MINNEG && vfscache_negs > neglimit) { 2909 if (critpath) 2910 _cache_cleanneg(ncnegflush); 2911 else 2912 _cache_cleanneg(ncnegflush + 2913 vfscache_negs - neglimit); 2914 neg_cache_hysteresis_state[critpath] = CHI_HIGH; 2915 } 2916 break; 2917 case CHI_HIGH: 2918 if (vfscache_negs > MINNEG * 9 / 10 && 2919 vfscache_negs * 9 / 10 > neglimit 2920 ) { 2921 if (critpath) 2922 _cache_cleanneg(ncnegflush); 2923 else 2924 _cache_cleanneg(ncnegflush + 2925 vfscache_negs * 9 / 10 - 2926 neglimit); 2927 } else { 2928 neg_cache_hysteresis_state[critpath] = CHI_LOW; 2929 } 2930 break; 2931 } 2932 2933 /* 2934 * Don't cache too many unresolved elements. We use hysteresis to 2935 * reduce the impact on the critical path. 2936 */ 2937 if ((poslimit = ncposlimit) == 0) 2938 poslimit = maxvnodes / ncposfactor; 2939 if (critpath == 0) 2940 poslimit = poslimit * 8 / 10; 2941 2942 /* 2943 * Number of unresolved leaf elements in the namecache. 
These 2944 * can build-up for various reasons and may have to be disposed 2945 * of to allow the inactive list to be cleaned out by vnlru_proc() 2946 */ 2947 xnumunres = vfscache_unres; 2948 2949 switch(pos_cache_hysteresis_state[critpath]) { 2950 case CHI_LOW: 2951 if (xnumunres > poslimit && xnumunres > MINPOS) { 2952 if (critpath) 2953 _cache_cleanpos(ncposflush); 2954 else 2955 _cache_cleanpos(ncposflush + 2956 xnumunres - poslimit); 2957 pos_cache_hysteresis_state[critpath] = CHI_HIGH; 2958 } 2959 break; 2960 case CHI_HIGH: 2961 if (xnumunres > poslimit * 5 / 6 && xnumunres > MINPOS) { 2962 if (critpath) 2963 _cache_cleanpos(ncposflush); 2964 else 2965 _cache_cleanpos(ncposflush + 2966 xnumunres - poslimit * 5 / 6); 2967 } else { 2968 pos_cache_hysteresis_state[critpath] = CHI_LOW; 2969 } 2970 break; 2971 } 2972 2973 /* 2974 * Excessive positive hits can accumulate due to large numbers of 2975 * hardlinks (the vnode cache will not prevent ncps representing 2976 * hardlinks from growing into infinity). 2977 */ 2978 exclimit = maxvnodes * 2; 2979 if (critpath == 0) 2980 exclimit = exclimit * 8 / 10; 2981 xnumleafs = vfscache_leafs; 2982 2983 switch(exc_cache_hysteresis_state[critpath]) { 2984 case CHI_LOW: 2985 if (xnumleafs > exclimit && xnumleafs > MINPOS) { 2986 if (critpath) 2987 _cache_cleanpos(ncposflush); 2988 else 2989 _cache_cleanpos(ncposflush + 2990 xnumleafs - exclimit); 2991 exc_cache_hysteresis_state[critpath] = CHI_HIGH; 2992 } 2993 break; 2994 case CHI_HIGH: 2995 if (xnumleafs > exclimit * 5 / 6 && xnumleafs > MINPOS) { 2996 if (critpath) 2997 _cache_cleanpos(ncposflush); 2998 else 2999 _cache_cleanpos(ncposflush + 3000 xnumleafs - exclimit * 5 / 6); 3001 } else { 3002 exc_cache_hysteresis_state[critpath] = CHI_LOW; 3003 } 3004 break; 3005 } 3006 3007 /* 3008 * Clean out dangling defered-zap ncps which could not be cleanly 3009 * dropped if too many build up. Note that numdefered is 3010 * heuristical. Make sure we are real-time for the current cpu, 3011 * plus the global rollup. 3012 */ 3013 if (pcpu_ncache[mycpu->gd_cpuid].numdefered + numdefered > neglimit) { 3014 _cache_cleandefered(); 3015 } 3016 } 3017 3018 /* 3019 * NEW NAMECACHE LOOKUP API 3020 * 3021 * Lookup an entry in the namecache. The passed par_nch must be referenced 3022 * and unlocked. A referenced and locked nchandle with a non-NULL nch.ncp 3023 * is ALWAYS returned, eve if the supplied component is illegal. 3024 * 3025 * The resulting namecache entry should be returned to the system with 3026 * cache_put() or cache_unlock() + cache_drop(). 3027 * 3028 * namecache locks are recursive but care must be taken to avoid lock order 3029 * reversals (hence why the passed par_nch must be unlocked). Locking 3030 * rules are to order for parent traversals, not for child traversals. 3031 * 3032 * Nobody else will be able to manipulate the associated namespace (e.g. 3033 * create, delete, rename, rename-target) until the caller unlocks the 3034 * entry. 3035 * 3036 * The returned entry will be in one of three states: positive hit (non-null 3037 * vnode), negative hit (null vnode), or unresolved (NCF_UNRESOLVED is set). 3038 * Unresolved entries must be resolved through the filesystem to associate the 3039 * vnode and/or determine whether a positive or negative hit has occured. 3040 * 3041 * It is not necessary to lock a directory in order to lock namespace under 3042 * that directory. In fact, it is explicitly not allowed to do that. 
A 3043 * directory is typically only locked when being created, renamed, or 3044 * destroyed. 3045 * 3046 * The directory (par) may be unresolved, in which case any returned child 3047 * will likely also be marked unresolved. Likely but not guarenteed. Since 3048 * the filesystem lookup requires a resolved directory vnode the caller is 3049 * responsible for resolving the namecache chain top-down. This API 3050 * specifically allows whole chains to be created in an unresolved state. 3051 */ 3052 struct nchandle 3053 cache_nlookup(struct nchandle *par_nch, struct nlcomponent *nlc) 3054 { 3055 struct nchandle nch; 3056 struct namecache *ncp; 3057 struct namecache *new_ncp; 3058 struct namecache *rep_ncp; /* reuse a destroyed ncp */ 3059 struct nchash_head *nchpp; 3060 struct mount *mp; 3061 u_int32_t hash; 3062 globaldata_t gd; 3063 int par_locked; 3064 int use_excl; 3065 3066 gd = mycpu; 3067 mp = par_nch->mount; 3068 par_locked = 0; 3069 3070 /* 3071 * This is a good time to call it, no ncp's are locked by 3072 * the caller or us. 3073 */ 3074 cache_hysteresis(1); 3075 3076 /* 3077 * Try to locate an existing entry 3078 */ 3079 hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT); 3080 hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash); 3081 new_ncp = NULL; 3082 use_excl = 0; 3083 nchpp = NCHHASH(hash); 3084 restart: 3085 rep_ncp = NULL; 3086 if (use_excl) 3087 spin_lock(&nchpp->spin); 3088 else 3089 spin_lock_shared(&nchpp->spin); 3090 3091 /* 3092 * Do a reverse scan to collect any DESTROYED ncps prior to matching 3093 * an existing entry. 3094 */ 3095 TAILQ_FOREACH_REVERSE(ncp, &nchpp->list, nchash_list, nc_hash) { 3096 /* 3097 * Break out if we find a matching entry. Note that 3098 * UNRESOLVED entries may match, but DESTROYED entries 3099 * do not. 3100 * 3101 * We may be able to reuse DESTROYED entries that we come 3102 * across, even if the name does not match, as long as 3103 * nc_nlen is correct and the only hold ref is from the nchpp 3104 * list itself. 3105 */ 3106 if (ncp->nc_parent == par_nch->ncp && 3107 ncp->nc_nlen == nlc->nlc_namelen) { 3108 if (ncp->nc_flag & NCF_DESTROYED) { 3109 if (ncp->nc_refs == 1 && rep_ncp == NULL) 3110 rep_ncp = ncp; 3111 continue; 3112 } 3113 if (bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen)) 3114 continue; 3115 3116 /* 3117 * Matched ncp 3118 */ 3119 _cache_hold(ncp); 3120 if (rep_ncp) 3121 _cache_hold(rep_ncp); 3122 3123 if (use_excl) 3124 spin_unlock(&nchpp->spin); 3125 else 3126 spin_unlock_shared(&nchpp->spin); 3127 3128 if (par_locked) { 3129 _cache_unlock(par_nch->ncp); 3130 par_locked = 0; 3131 } 3132 3133 /* 3134 * Really try to destroy rep_ncp if encountered. 3135 * Various edge cases can build up more than one, 3136 * so loop if we succeed. This isn't perfect, but 3137 * we can't afford to have tons of entries build 3138 * up on a single nhcpp list due to rename-over 3139 * operations. If that were to happen, the system 3140 * would bog down quickly. 
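 * cache_zap() returns non-zero only when it actually destroyed the
 * entry, in which case we restart the scan of this hash chain.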
3141 */ 3142 if (rep_ncp) { 3143 if (_cache_lock_nonblock(rep_ncp) == 0) { 3144 if (rep_ncp->nc_flag & NCF_DESTROYED) { 3145 if (cache_zap(rep_ncp)) { 3146 _cache_drop(ncp); 3147 goto restart; 3148 } 3149 } else { 3150 _cache_unlock(rep_ncp); 3151 _cache_drop(rep_ncp); 3152 } 3153 } else { 3154 _cache_drop(rep_ncp); 3155 } 3156 } 3157 3158 /* 3159 * Continue processing the matched entry 3160 */ 3161 if (_cache_lock_special(ncp) == 0) { 3162 /* 3163 * Successfully locked but we must re-test 3164 * conditions that might have changed since 3165 * we did not have the lock before. 3166 */ 3167 if (ncp->nc_parent != par_nch->ncp || 3168 ncp->nc_nlen != nlc->nlc_namelen || 3169 bcmp(ncp->nc_name, nlc->nlc_nameptr, 3170 ncp->nc_nlen) || 3171 (ncp->nc_flag & NCF_DESTROYED)) { 3172 _cache_put(ncp); 3173 goto restart; 3174 } 3175 _cache_auto_unresolve(mp, ncp); 3176 if (new_ncp) { 3177 _cache_free(new_ncp); 3178 new_ncp = NULL; /* safety */ 3179 } 3180 goto found; 3181 } 3182 _cache_get(ncp); /* cycle the lock to block */ 3183 _cache_put(ncp); 3184 _cache_drop(ncp); 3185 goto restart; 3186 } 3187 } 3188 3189 /* 3190 * We failed to locate the entry, try to resurrect a destroyed 3191 * entry that we did find that is already correctly linked into 3192 * nchpp and the parent. We must re-test conditions after 3193 * successfully locking rep_ncp. 3194 * 3195 * This case can occur under heavy loads due to not being able 3196 * to safely lock the parent in cache_zap(). Nominally a repeated 3197 * create/unlink load, but only the namelen needs to match. 3198 * 3199 * An exclusive lock on the nchpp is required to process this case, 3200 * otherwise a race can cause duplicate entries to be created with 3201 * one cpu reusing a DESTROYED ncp while another creates a new_ncp. 3202 */ 3203 if (rep_ncp && use_excl) { 3204 if (_cache_lock_nonblock(rep_ncp) == 0) { 3205 _cache_hold(rep_ncp); 3206 if (rep_ncp->nc_parent == par_nch->ncp && 3207 rep_ncp->nc_nlen == nlc->nlc_namelen && 3208 (rep_ncp->nc_flag & NCF_DESTROYED) && 3209 rep_ncp->nc_refs == 2) 3210 { 3211 /* 3212 * Update nc_name. 3213 */ 3214 ncp = rep_ncp; 3215 bcopy(nlc->nlc_nameptr, ncp->nc_name, 3216 nlc->nlc_namelen); 3217 3218 /* 3219 * This takes some care. We must clear the 3220 * NCF_DESTROYED flag before unlocking the 3221 * hash chain so other concurrent searches 3222 * do not skip this element. 3223 * 3224 * We must also unlock the hash chain before 3225 * unresolving the ncp to avoid deadlocks. 3226 * We hold the lock on the ncp so we can safely 3227 * reinitialize nc_flag after that. 3228 */ 3229 ncp->nc_flag &= ~NCF_DESTROYED; 3230 spin_unlock(&nchpp->spin); /* use_excl */ 3231 3232 _cache_setunresolved(ncp); 3233 ncp->nc_flag = NCF_UNRESOLVED; 3234 ncp->nc_error = ENOTCONN; 3235 if (par_locked) { 3236 _cache_unlock(par_nch->ncp); 3237 par_locked = 0; 3238 } 3239 if (new_ncp) { 3240 _cache_free(new_ncp); 3241 new_ncp = NULL; /* safety */ 3242 } 3243 goto found; 3244 } 3245 _cache_put(rep_ncp); 3246 } 3247 } 3248 3249 /* 3250 * Otherwise create a new entry and add it to the cache. The parent 3251 * ncp must also be locked so we can link into it. 3252 * 3253 * We have to relookup after possibly blocking in kmalloc or 3254 * when locking par_nch. 3255 * 3256 * NOTE: nlc_namelen can be 0 and nlc_nameptr NULL as a special 3257 * mount case, in which case nc_name will be NULL. 3258 * 3259 * NOTE: In the rep_ncp != NULL case we are trying to reuse 3260 * a DESTROYED entry, but didn't have an exclusive lock. 
3261 * In this situation we do not create a new_ncp. 3262 */ 3263 if (new_ncp == NULL) { 3264 if (use_excl) 3265 spin_unlock(&nchpp->spin); 3266 else 3267 spin_unlock_shared(&nchpp->spin); 3268 if (rep_ncp == NULL) { 3269 new_ncp = cache_alloc(nlc->nlc_namelen); 3270 if (nlc->nlc_namelen) { 3271 bcopy(nlc->nlc_nameptr, new_ncp->nc_name, 3272 nlc->nlc_namelen); 3273 new_ncp->nc_name[nlc->nlc_namelen] = 0; 3274 } 3275 } 3276 use_excl = 1; 3277 goto restart; 3278 } 3279 3280 /* 3281 * NOTE! The spinlock is held exclusively here because new_ncp 3282 * is non-NULL. 3283 */ 3284 if (par_locked == 0) { 3285 spin_unlock(&nchpp->spin); 3286 _cache_lock(par_nch->ncp); 3287 par_locked = 1; 3288 goto restart; 3289 } 3290 3291 /* 3292 * Link to parent (requires another ref, the one already in new_ncp 3293 * is what we wil lreturn). 3294 * 3295 * WARNING! We still hold the spinlock. We have to set the hash 3296 * table entry atomically. 3297 */ 3298 ncp = new_ncp; 3299 ++ncp->nc_refs; 3300 _cache_link_parent(ncp, par_nch->ncp, nchpp); 3301 spin_unlock(&nchpp->spin); 3302 _cache_unlock(par_nch->ncp); 3303 /* par_locked = 0 - not used */ 3304 found: 3305 /* 3306 * stats and namecache size management 3307 */ 3308 if (ncp->nc_flag & NCF_UNRESOLVED) 3309 ++gd->gd_nchstats->ncs_miss; 3310 else if (ncp->nc_vp) 3311 ++gd->gd_nchstats->ncs_goodhits; 3312 else 3313 ++gd->gd_nchstats->ncs_neghits; 3314 nch.mount = mp; 3315 nch.ncp = ncp; 3316 _cache_mntref(nch.mount); 3317 3318 return(nch); 3319 } 3320 3321 /* 3322 * Attempt to lookup a namecache entry and return with a shared namecache 3323 * lock. This operates non-blocking. EWOULDBLOCK is returned if excl is 3324 * set or we are unable to lock. 3325 */ 3326 int 3327 cache_nlookup_maybe_shared(struct nchandle *par_nch, 3328 struct nlcomponent *nlc, 3329 int excl, struct nchandle *res_nch) 3330 { 3331 struct namecache *ncp; 3332 struct nchash_head *nchpp; 3333 struct mount *mp; 3334 u_int32_t hash; 3335 globaldata_t gd; 3336 3337 /* 3338 * If exclusive requested or shared namecache locks are disabled, 3339 * return failure. 3340 */ 3341 if (ncp_shared_lock_disable || excl) 3342 return(EWOULDBLOCK); 3343 3344 gd = mycpu; 3345 mp = par_nch->mount; 3346 3347 /* 3348 * This is a good time to call it, no ncp's are locked by 3349 * the caller or us. 3350 */ 3351 cache_hysteresis(1); 3352 3353 /* 3354 * Try to locate an existing entry 3355 */ 3356 hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT); 3357 hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash); 3358 nchpp = NCHHASH(hash); 3359 3360 spin_lock_shared(&nchpp->spin); 3361 3362 TAILQ_FOREACH(ncp, &nchpp->list, nc_hash) { 3363 /* 3364 * Break out if we find a matching entry. Note that 3365 * UNRESOLVED entries may match, but DESTROYED entries 3366 * do not. 
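 *
 * After obtaining the shared lock we re-verify all of the match
 * criteria, since the hash chain spinlock was released; any mismatch
 * (or an unresolved entry) fails with EWOULDBLOCK rather than block.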
3367 */ 3368 if (ncp->nc_parent == par_nch->ncp && 3369 ncp->nc_nlen == nlc->nlc_namelen && 3370 bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 && 3371 (ncp->nc_flag & NCF_DESTROYED) == 0 3372 ) { 3373 _cache_hold(ncp); 3374 spin_unlock_shared(&nchpp->spin); 3375 3376 if (_cache_lock_shared_special(ncp) == 0) { 3377 if (ncp->nc_parent == par_nch->ncp && 3378 ncp->nc_nlen == nlc->nlc_namelen && 3379 bcmp(ncp->nc_name, nlc->nlc_nameptr, 3380 ncp->nc_nlen) == 0 && 3381 (ncp->nc_flag & NCF_DESTROYED) == 0 && 3382 (ncp->nc_flag & NCF_UNRESOLVED) == 0 && 3383 _cache_auto_unresolve_test(mp, ncp) == 0) 3384 { 3385 goto found; 3386 } 3387 _cache_unlock(ncp); 3388 } 3389 _cache_drop(ncp); 3390 return(EWOULDBLOCK); 3391 } 3392 } 3393 3394 /* 3395 * Failure 3396 */ 3397 spin_unlock_shared(&nchpp->spin); 3398 return(EWOULDBLOCK); 3399 3400 /* 3401 * Success 3402 * 3403 * Note that nc_error might be non-zero (e.g ENOENT). 3404 */ 3405 found: 3406 res_nch->mount = mp; 3407 res_nch->ncp = ncp; 3408 ++gd->gd_nchstats->ncs_goodhits; 3409 _cache_mntref(res_nch->mount); 3410 3411 KKASSERT(ncp->nc_error != EWOULDBLOCK); 3412 return(ncp->nc_error); 3413 } 3414 3415 /* 3416 * This is a non-blocking verison of cache_nlookup() used by 3417 * nfs_readdirplusrpc_uio(). It can fail for any reason and 3418 * will return nch.ncp == NULL in that case. 3419 */ 3420 struct nchandle 3421 cache_nlookup_nonblock(struct nchandle *par_nch, struct nlcomponent *nlc) 3422 { 3423 struct nchandle nch; 3424 struct namecache *ncp; 3425 struct namecache *new_ncp; 3426 struct nchash_head *nchpp; 3427 struct mount *mp; 3428 u_int32_t hash; 3429 globaldata_t gd; 3430 int par_locked; 3431 3432 gd = mycpu; 3433 mp = par_nch->mount; 3434 par_locked = 0; 3435 3436 /* 3437 * Try to locate an existing entry 3438 */ 3439 hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT); 3440 hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash); 3441 new_ncp = NULL; 3442 nchpp = NCHHASH(hash); 3443 restart: 3444 spin_lock(&nchpp->spin); 3445 TAILQ_FOREACH(ncp, &nchpp->list, nc_hash) { 3446 /* 3447 * Break out if we find a matching entry. Note that 3448 * UNRESOLVED entries may match, but DESTROYED entries 3449 * do not. 3450 */ 3451 if (ncp->nc_parent == par_nch->ncp && 3452 ncp->nc_nlen == nlc->nlc_namelen && 3453 bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 && 3454 (ncp->nc_flag & NCF_DESTROYED) == 0 3455 ) { 3456 _cache_hold(ncp); 3457 spin_unlock(&nchpp->spin); 3458 if (par_locked) { 3459 _cache_unlock(par_nch->ncp); 3460 par_locked = 0; 3461 } 3462 if (_cache_lock_special(ncp) == 0) { 3463 if (ncp->nc_parent != par_nch->ncp || 3464 ncp->nc_nlen != nlc->nlc_namelen || 3465 bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) || 3466 (ncp->nc_flag & NCF_DESTROYED)) { 3467 kprintf("cache_lookup_nonblock: " 3468 "ncp-race %p %*.*s\n", 3469 ncp, 3470 nlc->nlc_namelen, 3471 nlc->nlc_namelen, 3472 nlc->nlc_nameptr); 3473 _cache_unlock(ncp); 3474 _cache_drop(ncp); 3475 goto failed; 3476 } 3477 _cache_auto_unresolve(mp, ncp); 3478 if (new_ncp) { 3479 _cache_free(new_ncp); 3480 new_ncp = NULL; 3481 } 3482 goto found; 3483 } 3484 _cache_drop(ncp); 3485 goto failed; 3486 } 3487 } 3488 3489 /* 3490 * We failed to locate an entry, create a new entry and add it to 3491 * the cache. The parent ncp must also be locked so we 3492 * can link into it. 3493 * 3494 * We have to relookup after possibly blocking in kmalloc or 3495 * when locking par_nch. 
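 * Unlike cache_nlookup(), if the parent cannot be locked non-blocking
 * we give up and return a NULL nch.ncp instead of waiting.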
3496 * 3497 * NOTE: nlc_namelen can be 0 and nlc_nameptr NULL as a special 3498 * mount case, in which case nc_name will be NULL. 3499 */ 3500 if (new_ncp == NULL) { 3501 spin_unlock(&nchpp->spin); 3502 new_ncp = cache_alloc(nlc->nlc_namelen); 3503 if (nlc->nlc_namelen) { 3504 bcopy(nlc->nlc_nameptr, new_ncp->nc_name, 3505 nlc->nlc_namelen); 3506 new_ncp->nc_name[nlc->nlc_namelen] = 0; 3507 } 3508 goto restart; 3509 } 3510 if (par_locked == 0) { 3511 spin_unlock(&nchpp->spin); 3512 if (_cache_lock_nonblock(par_nch->ncp) == 0) { 3513 par_locked = 1; 3514 goto restart; 3515 } 3516 goto failed; 3517 } 3518 3519 /* 3520 * Link to parent (requires another ref, the one already in new_ncp 3521 * is what we wil lreturn). 3522 * 3523 * WARNING! We still hold the spinlock. We have to set the hash 3524 * table entry atomically. 3525 */ 3526 ncp = new_ncp; 3527 ++ncp->nc_refs; 3528 _cache_link_parent(ncp, par_nch->ncp, nchpp); 3529 spin_unlock(&nchpp->spin); 3530 _cache_unlock(par_nch->ncp); 3531 /* par_locked = 0 - not used */ 3532 found: 3533 /* 3534 * stats and namecache size management 3535 */ 3536 if (ncp->nc_flag & NCF_UNRESOLVED) 3537 ++gd->gd_nchstats->ncs_miss; 3538 else if (ncp->nc_vp) 3539 ++gd->gd_nchstats->ncs_goodhits; 3540 else 3541 ++gd->gd_nchstats->ncs_neghits; 3542 nch.mount = mp; 3543 nch.ncp = ncp; 3544 _cache_mntref(nch.mount); 3545 3546 return(nch); 3547 failed: 3548 if (new_ncp) { 3549 _cache_free(new_ncp); 3550 new_ncp = NULL; 3551 } 3552 nch.mount = NULL; 3553 nch.ncp = NULL; 3554 return(nch); 3555 } 3556 3557 /* 3558 * This version is non-locking. The caller must validate the result 3559 * for parent-to-child continuity. 3560 * 3561 * It can fail for any reason and will return nch.ncp == NULL in that case. 3562 */ 3563 struct nchandle 3564 cache_nlookup_nonlocked(struct nchandle *par_nch, struct nlcomponent *nlc) 3565 { 3566 struct nchandle nch; 3567 struct namecache *ncp; 3568 struct nchash_head *nchpp; 3569 struct mount *mp; 3570 u_int32_t hash; 3571 globaldata_t gd; 3572 3573 gd = mycpu; 3574 mp = par_nch->mount; 3575 3576 /* 3577 * Try to locate an existing entry 3578 */ 3579 hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT); 3580 hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash); 3581 nchpp = NCHHASH(hash); 3582 3583 spin_lock_shared(&nchpp->spin); 3584 TAILQ_FOREACH(ncp, &nchpp->list, nc_hash) { 3585 /* 3586 * Break out if we find a matching entry. Note that 3587 * UNRESOLVED entries may match, but DESTROYED entries 3588 * do not. 3589 * 3590 * Resolved NFS entries which have timed out fail so the 3591 * caller can rerun with normal locking. 3592 */ 3593 if (ncp->nc_parent == par_nch->ncp && 3594 ncp->nc_nlen == nlc->nlc_namelen && 3595 bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 && 3596 (ncp->nc_flag & NCF_DESTROYED) == 0 3597 ) { 3598 if (_cache_auto_unresolve_test(par_nch->mount, ncp)) 3599 break; 3600 _cache_hold(ncp); 3601 spin_unlock_shared(&nchpp->spin); 3602 goto found; 3603 } 3604 } 3605 spin_unlock_shared(&nchpp->spin); 3606 nch.mount = NULL; 3607 nch.ncp = NULL; 3608 return nch; 3609 found: 3610 /* 3611 * stats and namecache size management 3612 */ 3613 if (ncp->nc_flag & NCF_UNRESOLVED) 3614 ++gd->gd_nchstats->ncs_miss; 3615 else if (ncp->nc_vp) 3616 ++gd->gd_nchstats->ncs_goodhits; 3617 else 3618 ++gd->gd_nchstats->ncs_neghits; 3619 nch.mount = mp; 3620 nch.ncp = ncp; 3621 _cache_mntref(nch.mount); 3622 3623 return(nch); 3624 } 3625 3626 /* 3627 * The namecache entry is marked as being used as a mount point. 
3628 * Locate the mount if it is visible to the caller. The DragonFly 3629 * mount system allows arbitrary loops in the topology and disentangles 3630 * those loops by matching against (mp, ncp) rather than just (ncp). 3631 * This means any given ncp can dive any number of mounts, depending 3632 * on the relative mount (e.g. nullfs) the caller is at in the topology. 3633 * 3634 * We use a very simple frontend cache to reduce SMP conflicts, 3635 * which we have to do because the mountlist scan needs an exclusive 3636 * lock around its ripout info list. Not to mention that there might 3637 * be a lot of mounts. 3638 * 3639 * Because all mounts can potentially be accessed by all cpus, break the cpu's 3640 * down a bit to allow some contention rather than making the cache 3641 * excessively huge. 3642 * 3643 * The hash table is split into per-cpu areas, is 4-way set-associative. 3644 */ 3645 struct findmount_info { 3646 struct mount *result; 3647 struct mount *nch_mount; 3648 struct namecache *nch_ncp; 3649 }; 3650 3651 static __inline 3652 struct ncmount_cache * 3653 ncmount_cache_lookup4(struct mount *mp, struct namecache *ncp) 3654 { 3655 uint32_t hash; 3656 3657 hash = iscsi_crc32(&mp, sizeof(mp)); 3658 hash = iscsi_crc32_ext(&ncp, sizeof(ncp), hash); 3659 hash ^= hash >> 16; 3660 hash = hash & ((NCMOUNT_NUMCACHE - 1) & ~(NCMOUNT_SET - 1)); 3661 3662 return (&ncmount_cache[hash]); 3663 } 3664 3665 static 3666 struct ncmount_cache * 3667 ncmount_cache_lookup(struct mount *mp, struct namecache *ncp) 3668 { 3669 struct ncmount_cache *ncc; 3670 struct ncmount_cache *best; 3671 int delta; 3672 int best_delta; 3673 int i; 3674 3675 ncc = ncmount_cache_lookup4(mp, ncp); 3676 3677 /* 3678 * NOTE: When checking for a ticks overflow implement a slop of 3679 * 2 ticks just to be safe, because ticks is accessed 3680 * non-atomically one CPU can increment it while another 3681 * is still using the old value. 3682 */ 3683 if (ncc->ncp == ncp && ncc->mp == mp) /* 0 */ 3684 return ncc; 3685 delta = (int)(ticks - ncc->ticks); /* beware GCC opts */ 3686 if (delta < -2) /* overflow reset */ 3687 ncc->ticks = ticks; 3688 best = ncc; 3689 best_delta = delta; 3690 3691 for (i = 1; i < NCMOUNT_SET; ++i) { /* 1, 2, 3 */ 3692 ++ncc; 3693 if (ncc->ncp == ncp && ncc->mp == mp) 3694 return ncc; 3695 delta = (int)(ticks - ncc->ticks); 3696 if (delta < -2) 3697 ncc->ticks = ticks; 3698 if (delta > best_delta) { 3699 best_delta = delta; 3700 best = ncc; 3701 } 3702 } 3703 return best; 3704 } 3705 3706 /* 3707 * pcpu-optimized mount search. Locate the recursive mountpoint, avoid 3708 * doing an expensive mountlist_scan*() if possible. 3709 * 3710 * (mp, ncp) -> mountonpt.k 3711 * 3712 * Returns a referenced mount pointer or NULL 3713 * 3714 * General SMP operation uses a per-cpu umount_spin to interlock unmount 3715 * operations (that is, where the mp_target can be freed out from under us). 3716 * 3717 * Lookups use the ncc->updating counter to validate the contents in order 3718 * to avoid having to obtain the per cache-element spin-lock. In addition, 3719 * the ticks field is only updated when it changes. However, if our per-cpu 3720 * lock fails due to an unmount-in-progress, we fall-back to the 3721 * cache-element's spin-lock. 
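 *
 * Illustrative caller-side sketch (hedged; the path lookup code is the
 * real consumer):
 *
 *      struct mount *mp;
 *
 *      if ((mp = cache_findmount(&nch)) != NULL) {
 *              ... descend into the filesystem mounted here ...
 *              cache_dropmount(mp);
 *      }
 *
 * The returned mount is referenced and must be released with
 * cache_dropmount().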
3722 */ 3723 struct mount * 3724 cache_findmount(struct nchandle *nch) 3725 { 3726 struct findmount_info info; 3727 struct ncmount_cache *ncc; 3728 struct ncmount_cache ncc_copy; 3729 struct mount *target; 3730 struct pcpu_ncache *pcpu; 3731 struct spinlock *spinlk; 3732 int update; 3733 3734 pcpu = pcpu_ncache; 3735 if (ncmount_cache_enable == 0 || pcpu == NULL) { 3736 ncc = NULL; 3737 goto skip; 3738 } 3739 pcpu += mycpu->gd_cpuid; 3740 3741 again: 3742 ncc = ncmount_cache_lookup(nch->mount, nch->ncp); 3743 if (ncc->ncp == nch->ncp && ncc->mp == nch->mount) { 3744 found: 3745 /* 3746 * This is a bit messy for now because we do not yet have 3747 * safe disposal of mount structures. We have to ref 3748 * ncc->mp_target but the 'update' counter only tell us 3749 * whether the cache has changed after the fact. 3750 * 3751 * For now get a per-cpu spinlock that will only contend 3752 * against umount's. This is the best path. If it fails, 3753 * instead of waiting on the umount we fall-back to a 3754 * shared ncc->spin lock, which will generally only cost a 3755 * cache ping-pong. 3756 */ 3757 update = ncc->updating; 3758 if (__predict_true(spin_trylock(&pcpu->umount_spin))) { 3759 spinlk = &pcpu->umount_spin; 3760 } else { 3761 spinlk = &ncc->spin; 3762 spin_lock_shared(spinlk); 3763 } 3764 if (update & 1) { /* update in progress */ 3765 spin_unlock_any(spinlk); 3766 goto skip; 3767 } 3768 ncc_copy = *ncc; 3769 cpu_lfence(); 3770 if (ncc->updating != update) { /* content changed */ 3771 spin_unlock_any(spinlk); 3772 goto again; 3773 } 3774 if (ncc_copy.ncp != nch->ncp || ncc_copy.mp != nch->mount) { 3775 spin_unlock_any(spinlk); 3776 goto again; 3777 } 3778 if (ncc_copy.isneg == 0) { 3779 target = ncc_copy.mp_target; 3780 if (target->mnt_ncmounton.mount == nch->mount && 3781 target->mnt_ncmounton.ncp == nch->ncp) { 3782 /* 3783 * Cache hit (positive) (avoid dirtying 3784 * the cache line if possible) 3785 */ 3786 if (ncc->ticks != (int)ticks) 3787 ncc->ticks = (int)ticks; 3788 _cache_mntref(target); 3789 } 3790 } else { 3791 /* 3792 * Cache hit (negative) (avoid dirtying 3793 * the cache line if possible) 3794 */ 3795 if (ncc->ticks != (int)ticks) 3796 ncc->ticks = (int)ticks; 3797 target = NULL; 3798 } 3799 spin_unlock_any(spinlk); 3800 3801 return target; 3802 } 3803 skip: 3804 3805 /* 3806 * Slow 3807 */ 3808 info.result = NULL; 3809 info.nch_mount = nch->mount; 3810 info.nch_ncp = nch->ncp; 3811 mountlist_scan(cache_findmount_callback, &info, 3812 MNTSCAN_FORWARD | MNTSCAN_NOBUSY | MNTSCAN_NOUNLOCK); 3813 3814 /* 3815 * To reduce multi-re-entry on the cache, relookup in the cache. 3816 * This can still race, obviously, but that's ok. 3817 */ 3818 ncc = ncmount_cache_lookup(nch->mount, nch->ncp); 3819 if (ncc->ncp == nch->ncp && ncc->mp == nch->mount) { 3820 if (info.result) 3821 atomic_add_int(&info.result->mnt_refs, -1); 3822 goto found; 3823 } 3824 3825 /* 3826 * Cache the result. 
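 * Do not cache a result whose mount is in the middle of an unmount.
 * The updating counter is bumped to an odd value before the element is
 * modified and back to even afterwards, so lockless readers can detect
 * an in-progress or completed change.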
3827 */ 3828 if ((info.result == NULL || 3829 (info.result->mnt_kern_flag & MNTK_UNMOUNT) == 0)) { 3830 spin_lock(&ncc->spin); 3831 atomic_add_int_nonlocked(&ncc->updating, 1); 3832 cpu_sfence(); 3833 KKASSERT(ncc->updating & 1); 3834 if (ncc->mp != nch->mount) { 3835 if (ncc->mp) 3836 atomic_add_int(&ncc->mp->mnt_refs, -1); 3837 atomic_add_int(&nch->mount->mnt_refs, 1); 3838 ncc->mp = nch->mount; 3839 } 3840 ncc->ncp = nch->ncp; /* ptr compares only, not refd*/ 3841 ncc->ticks = (int)ticks; 3842 3843 if (info.result) { 3844 ncc->isneg = 0; 3845 if (ncc->mp_target != info.result) { 3846 if (ncc->mp_target) 3847 atomic_add_int(&ncc->mp_target->mnt_refs, -1); 3848 ncc->mp_target = info.result; 3849 atomic_add_int(&info.result->mnt_refs, 1); 3850 } 3851 } else { 3852 ncc->isneg = 1; 3853 if (ncc->mp_target) { 3854 atomic_add_int(&ncc->mp_target->mnt_refs, -1); 3855 ncc->mp_target = NULL; 3856 } 3857 } 3858 cpu_sfence(); 3859 atomic_add_int_nonlocked(&ncc->updating, 1); 3860 spin_unlock(&ncc->spin); 3861 } 3862 return(info.result); 3863 } 3864 3865 static 3866 int 3867 cache_findmount_callback(struct mount *mp, void *data) 3868 { 3869 struct findmount_info *info = data; 3870 3871 /* 3872 * Check the mount's mounted-on point against the passed nch. 3873 */ 3874 if (mp->mnt_ncmounton.mount == info->nch_mount && 3875 mp->mnt_ncmounton.ncp == info->nch_ncp 3876 ) { 3877 info->result = mp; 3878 _cache_mntref(mp); 3879 return(-1); 3880 } 3881 return(0); 3882 } 3883 3884 void 3885 cache_dropmount(struct mount *mp) 3886 { 3887 _cache_mntrel(mp); 3888 } 3889 3890 /* 3891 * mp is being mounted, scrap entries matching mp->mnt_ncmounton (positive 3892 * or negative). 3893 * 3894 * A full scan is not required, but for now just do it anyway. 3895 */ 3896 void 3897 cache_ismounting(struct mount *mp) 3898 { 3899 struct ncmount_cache *ncc; 3900 struct mount *ncc_mp; 3901 int i; 3902 3903 if (pcpu_ncache == NULL) 3904 return; 3905 3906 for (i = 0; i < NCMOUNT_NUMCACHE; ++i) { 3907 ncc = &ncmount_cache[i]; 3908 if (ncc->mp != mp->mnt_ncmounton.mount || 3909 ncc->ncp != mp->mnt_ncmounton.ncp) { 3910 continue; 3911 } 3912 spin_lock(&ncc->spin); 3913 atomic_add_int_nonlocked(&ncc->updating, 1); 3914 cpu_sfence(); 3915 KKASSERT(ncc->updating & 1); 3916 if (ncc->mp != mp->mnt_ncmounton.mount || 3917 ncc->ncp != mp->mnt_ncmounton.ncp) { 3918 cpu_sfence(); 3919 ++ncc->updating; 3920 spin_unlock(&ncc->spin); 3921 continue; 3922 } 3923 ncc_mp = ncc->mp; 3924 ncc->ncp = NULL; 3925 ncc->mp = NULL; 3926 if (ncc_mp) 3927 atomic_add_int(&ncc_mp->mnt_refs, -1); 3928 ncc_mp = ncc->mp_target; 3929 ncc->mp_target = NULL; 3930 if (ncc_mp) 3931 atomic_add_int(&ncc_mp->mnt_refs, -1); 3932 ncc->ticks = (int)ticks - hz * 120; 3933 3934 cpu_sfence(); 3935 atomic_add_int_nonlocked(&ncc->updating, 1); 3936 spin_unlock(&ncc->spin); 3937 } 3938 3939 /* 3940 * Pre-cache the mount point 3941 */ 3942 ncc = ncmount_cache_lookup(mp->mnt_ncmounton.mount, 3943 mp->mnt_ncmounton.ncp); 3944 3945 spin_lock(&ncc->spin); 3946 atomic_add_int_nonlocked(&ncc->updating, 1); 3947 cpu_sfence(); 3948 KKASSERT(ncc->updating & 1); 3949 3950 if (ncc->mp) 3951 atomic_add_int(&ncc->mp->mnt_refs, -1); 3952 atomic_add_int(&mp->mnt_ncmounton.mount->mnt_refs, 1); 3953 ncc->mp = mp->mnt_ncmounton.mount; 3954 ncc->ncp = mp->mnt_ncmounton.ncp; /* ptr compares only */ 3955 ncc->ticks = (int)ticks; 3956 3957 ncc->isneg = 0; 3958 if (ncc->mp_target != mp) { 3959 if (ncc->mp_target) 3960 atomic_add_int(&ncc->mp_target->mnt_refs, -1); 3961 ncc->mp_target = mp; 3962 
atomic_add_int(&mp->mnt_refs, 1); 3963 } 3964 cpu_sfence(); 3965 atomic_add_int_nonlocked(&ncc->updating, 1); 3966 spin_unlock(&ncc->spin); 3967 } 3968 3969 /* 3970 * Scrap any ncmount_cache entries related to mp. Not only do we need to 3971 * scrap entries matching mp->mnt_ncmounton, but we also need to scrap any 3972 * negative hits involving (mp, <any>). 3973 * 3974 * A full scan is required. 3975 */ 3976 void 3977 cache_unmounting(struct mount *mp) 3978 { 3979 struct ncmount_cache *ncc; 3980 struct pcpu_ncache *pcpu; 3981 struct mount *ncc_mp; 3982 int i; 3983 3984 pcpu = pcpu_ncache; 3985 if (pcpu == NULL) 3986 return; 3987 3988 for (i = 0; i < ncpus; ++i) 3989 spin_lock(&pcpu[i].umount_spin); 3990 3991 for (i = 0; i < NCMOUNT_NUMCACHE; ++i) { 3992 ncc = &ncmount_cache[i]; 3993 if (ncc->mp != mp && ncc->mp_target != mp) 3994 continue; 3995 spin_lock(&ncc->spin); 3996 atomic_add_int_nonlocked(&ncc->updating, 1); 3997 cpu_sfence(); 3998 3999 if (ncc->mp != mp && ncc->mp_target != mp) { 4000 atomic_add_int_nonlocked(&ncc->updating, 1); 4001 cpu_sfence(); 4002 spin_unlock(&ncc->spin); 4003 continue; 4004 } 4005 ncc_mp = ncc->mp; 4006 ncc->ncp = NULL; 4007 ncc->mp = NULL; 4008 if (ncc_mp) 4009 atomic_add_int(&ncc_mp->mnt_refs, -1); 4010 ncc_mp = ncc->mp_target; 4011 ncc->mp_target = NULL; 4012 if (ncc_mp) 4013 atomic_add_int(&ncc_mp->mnt_refs, -1); 4014 ncc->ticks = (int)ticks - hz * 120; 4015 4016 cpu_sfence(); 4017 atomic_add_int_nonlocked(&ncc->updating, 1); 4018 spin_unlock(&ncc->spin); 4019 } 4020 4021 for (i = 0; i < ncpus; ++i) 4022 spin_unlock(&pcpu[i].umount_spin); 4023 } 4024 4025 /* 4026 * Resolve an unresolved namecache entry, generally by looking it up. 4027 * The passed ncp must be locked and refd. 4028 * 4029 * Theoretically since a vnode cannot be recycled while held, and since 4030 * the nc_parent chain holds its vnode as long as children exist, the 4031 * direct parent of the cache entry we are trying to resolve should 4032 * have a valid vnode. If not then generate an error that we can 4033 * determine is related to a resolver bug. 4034 * 4035 * However, if a vnode was in the middle of a recyclement when the NCP 4036 * got locked, ncp->nc_vp might point to a vnode that is about to become 4037 * invalid. cache_resolve() handles this case by unresolving the entry 4038 * and then re-resolving it. 4039 * 4040 * Note that successful resolution does not necessarily return an error 4041 * code of 0. If the ncp resolves to a negative cache hit then ENOENT 4042 * will be returned. 4043 */ 4044 int 4045 cache_resolve(struct nchandle *nch, struct ucred *cred) 4046 { 4047 struct namecache *par_tmp; 4048 struct namecache *par; 4049 struct namecache *ncp; 4050 struct nchandle nctmp; 4051 struct mount *mp; 4052 struct vnode *dvp; 4053 int error; 4054 4055 ncp = nch->ncp; 4056 mp = nch->mount; 4057 KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE); 4058 restart: 4059 /* 4060 * If the ncp is already resolved we have nothing to do. However, 4061 * we do want to guarentee that a usable vnode is returned when 4062 * a vnode is present, so make sure it hasn't been reclaimed. 4063 */ 4064 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 4065 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) 4066 _cache_setunresolved(ncp); 4067 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) 4068 return (ncp->nc_error); 4069 } 4070 4071 /* 4072 * If the ncp was destroyed it will never resolve again. This 4073 * can basically only happen when someone is chdir'd into an 4074 * empty directory which is then rmdir'd. 
We want to catch this 4075 * here and not dive the VFS because the VFS might actually 4076 * have a way to re-resolve the disconnected ncp, which will 4077 * result in inconsistencies in the cdir/nch for proc->p_fd. 4078 */ 4079 if (ncp->nc_flag & NCF_DESTROYED) 4080 return(EINVAL); 4081 4082 /* 4083 * Mount points need special handling because the parent does not 4084 * belong to the same filesystem as the ncp. 4085 */ 4086 if (ncp == mp->mnt_ncmountpt.ncp) 4087 return (cache_resolve_mp(mp)); 4088 4089 /* 4090 * We expect an unbroken chain of ncps to at least the mount point, 4091 * and even all the way to root (but this code doesn't have to go 4092 * past the mount point). 4093 */ 4094 if (ncp->nc_parent == NULL) { 4095 kprintf("EXDEV case 1 %p %*.*s\n", ncp, 4096 ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name); 4097 ncp->nc_error = EXDEV; 4098 return(ncp->nc_error); 4099 } 4100 4101 /* 4102 * The vp's of the parent directories in the chain are held via vhold() 4103 * due to the existance of the child, and should not disappear. 4104 * However, there are cases where they can disappear: 4105 * 4106 * - due to filesystem I/O errors. 4107 * - due to NFS being stupid about tracking the namespace and 4108 * destroys the namespace for entire directories quite often. 4109 * - due to forced unmounts. 4110 * - due to an rmdir (parent will be marked DESTROYED) 4111 * 4112 * When this occurs we have to track the chain backwards and resolve 4113 * it, looping until the resolver catches up to the current node. We 4114 * could recurse here but we might run ourselves out of kernel stack 4115 * so we do it in a more painful manner. This situation really should 4116 * not occur all that often, or if it does not have to go back too 4117 * many nodes to resolve the ncp. 4118 */ 4119 while ((dvp = cache_dvpref(ncp)) == NULL) { 4120 /* 4121 * This case can occur if a process is CD'd into a 4122 * directory which is then rmdir'd. If the parent is marked 4123 * destroyed there is no point trying to resolve it. 4124 */ 4125 if (ncp->nc_parent->nc_flag & NCF_DESTROYED) 4126 return(ENOENT); 4127 par = ncp->nc_parent; 4128 _cache_hold(par); 4129 _cache_lock(par); 4130 while ((par_tmp = par->nc_parent) != NULL && 4131 par_tmp->nc_vp == NULL) { 4132 _cache_hold(par_tmp); 4133 _cache_lock(par_tmp); 4134 _cache_put(par); 4135 par = par_tmp; 4136 } 4137 if (par->nc_parent == NULL) { 4138 kprintf("EXDEV case 2 %*.*s\n", 4139 par->nc_nlen, par->nc_nlen, par->nc_name); 4140 _cache_put(par); 4141 return (EXDEV); 4142 } 4143 /* 4144 * The parent is not set in stone, ref and lock it to prevent 4145 * it from disappearing. Also note that due to renames it 4146 * is possible for our ncp to move and for par to no longer 4147 * be one of its parents. We resolve it anyway, the loop 4148 * will handle any moves. 
4149 */
4150 _cache_get(par);	/* additional hold/lock */
4151 _cache_put(par);	/* from earlier hold/lock */
4152 if (par == nch->mount->mnt_ncmountpt.ncp) {
4153 cache_resolve_mp(nch->mount);
4154 } else if ((dvp = cache_dvpref(par)) == NULL) {
4155 kprintf("[diagnostic] cache_resolve: raced on %*.*s\n",
4156 par->nc_nlen, par->nc_nlen, par->nc_name);
4157 _cache_put(par);
4158 continue;
4159 } else {
4160 if (par->nc_flag & NCF_UNRESOLVED) {
4161 nctmp.mount = mp;
4162 nctmp.ncp = par;
4163 par->nc_error = VOP_NRESOLVE(&nctmp, dvp, cred);
4164 }
4165 vrele(dvp);
4166 }
4167 if ((error = par->nc_error) != 0) {
4168 if (par->nc_error != EAGAIN) {
4169 kprintf("EXDEV case 3 %*.*s error %d\n",
4170 par->nc_nlen, par->nc_nlen, par->nc_name,
4171 par->nc_error);
4172 _cache_put(par);
4173 return(error);
4174 }
4175 kprintf("[diagnostic] cache_resolve: EAGAIN par %p %*.*s\n",
4176 par, par->nc_nlen, par->nc_nlen, par->nc_name);
4177 }
4178 _cache_put(par);
4179 /* loop */
4180 }
4181 
4182 /*
4183 * Call VOP_NRESOLVE() to get the vp, then scan for any disconnected
4184 * ncp's and reattach them. If this occurs the original ncp is marked
4185 * EAGAIN to force a relookup.
4186 *
4187 * NOTE: in order to call VOP_NRESOLVE(), the parent of the passed
4188 * ncp must already be resolved.
4189 */
4190 if (dvp) {
4191 nctmp.mount = mp;
4192 nctmp.ncp = ncp;
4193 ncp->nc_error = VOP_NRESOLVE(&nctmp, dvp, cred);
4194 vrele(dvp);
4195 } else {
4196 ncp->nc_error = EPERM;
4197 }
4198 if (ncp->nc_error == EAGAIN) {
4199 kprintf("[diagnostic] cache_resolve: EAGAIN ncp %p %*.*s\n",
4200 ncp, ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name);
4201 goto restart;
4202 }
4203 return(ncp->nc_error);
4204 }
4205 
4206 /*
4207 * Resolve the ncp associated with a mount point. Such ncp's almost always
4208 * remain resolved and this routine is rarely called. NFS MPs tend to force
4209 * re-resolution more often due to NFS's mac-truck-smash-the-namecache
4210 * method of tracking namespace changes.
4211 *
4212 * The semantics for this call are that the passed ncp must be locked on
4213 * entry and will be locked on return. However, if we actually have to
4214 * resolve the mount point we temporarily unlock the entry in order to
4215 * avoid race-to-root deadlocks due to e.g. dead NFS mounts. Because of
4216 * the unlock we have to recheck the flags after we relock.
4217 */
4218 static int
4219 cache_resolve_mp(struct mount *mp)
4220 {
4221 struct namecache *ncp = mp->mnt_ncmountpt.ncp;
4222 struct vnode *vp;
4223 int error;
4224 
4225 KKASSERT(mp != NULL);
4226 
4227 /*
4228 * If the ncp is already resolved we have nothing to do. However,
4229 * we do want to guarantee that a usable vnode is returned when
4230 * a vnode is present, so make sure it hasn't been reclaimed.
4231 */
4232 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
4233 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
4234 _cache_setunresolved(ncp);
4235 }
4236 
4237 if (ncp->nc_flag & NCF_UNRESOLVED) {
4238 /*
4239 * ncp must be unlocked across the vfs_busy(), but
4240 * once busied lock ordering is ncp(s), then vnodes,
4241 * so we must relock the ncp before issuing the VFS_ROOT().
4242 */
4243 _cache_unlock(ncp);
4244 while (vfs_busy(mp, 0))
4245 ;
4246 _cache_lock(ncp);
4247 error = VFS_ROOT(mp, &vp);
4248 
4249 /*
4250 * recheck the ncp state after relocking.
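 * Another thread may have resolved the ncp while it was unlocked
 * across the vfs_busy(), in which case we simply release the vnode
 * we obtained from VFS_ROOT().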
4251 */ 4252 if (ncp->nc_flag & NCF_UNRESOLVED) { 4253 ncp->nc_error = error; 4254 if (error == 0) { 4255 _cache_setvp(mp, ncp, vp); 4256 vput(vp); 4257 } else { 4258 kprintf("[diagnostic] cache_resolve_mp: failed" 4259 " to resolve mount %p err=%d ncp=%p\n", 4260 mp, error, ncp); 4261 _cache_setvp(mp, ncp, NULL); 4262 } 4263 } else if (error == 0) { 4264 vput(vp); 4265 } 4266 vfs_unbusy(mp); 4267 } 4268 return(ncp->nc_error); 4269 } 4270 4271 /* 4272 * Resolve the parent vnode 4273 */ 4274 int 4275 cache_resolve_dvp(struct nchandle *nch, struct ucred *cred, struct vnode **dvpp) 4276 { 4277 struct namecache *par_tmp; 4278 struct namecache *par; 4279 struct namecache *ncp; 4280 struct nchandle nctmp; 4281 struct mount *mp; 4282 struct vnode *dvp; 4283 int error; 4284 4285 *dvpp = NULL; 4286 ncp = nch->ncp; 4287 mp = nch->mount; 4288 KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE); 4289 4290 /* 4291 * Treat this as a mount point even if it has a parent (e.g. 4292 * null-mount). Return a NULL dvp and no error. 4293 */ 4294 if (ncp == mp->mnt_ncmountpt.ncp) 4295 return 0; 4296 4297 /* 4298 * If the ncp was destroyed there is no parent directory, return 4299 * EINVAL. 4300 */ 4301 if (ncp->nc_flag & NCF_DESTROYED) 4302 return(EINVAL); 4303 4304 /* 4305 * No parent if at the root of a filesystem, no error. Typically 4306 * not applicable to null-mounts. This case should have been caught 4307 * in the above ncmountpt check. 4308 */ 4309 if (ncp->nc_parent == NULL) 4310 return 0; 4311 4312 /* 4313 * Resolve the parent dvp. 4314 * 4315 * The vp's of the parent directories in the chain are held via vhold() 4316 * due to the existance of the child, and should not disappear. 4317 * However, there are cases where they can disappear: 4318 * 4319 * - due to filesystem I/O errors. 4320 * - due to NFS being stupid about tracking the namespace and 4321 * destroys the namespace for entire directories quite often. 4322 * - due to forced unmounts. 4323 * - due to an rmdir (parent will be marked DESTROYED) 4324 * 4325 * When this occurs we have to track the chain backwards and resolve 4326 * it, looping until the resolver catches up to the current node. We 4327 * could recurse here but we might run ourselves out of kernel stack 4328 * so we do it in a more painful manner. This situation really should 4329 * not occur all that often, or if it does not have to go back too 4330 * many nodes to resolve the ncp. 4331 */ 4332 while ((dvp = cache_dvpref(ncp)) == NULL) { 4333 /* 4334 * This case can occur if a process is CD'd into a 4335 * directory which is then rmdir'd. If the parent is marked 4336 * destroyed there is no point trying to resolve it. 4337 */ 4338 if (ncp->nc_parent->nc_flag & NCF_DESTROYED) 4339 return(ENOENT); 4340 par = ncp->nc_parent; 4341 _cache_hold(par); 4342 _cache_lock(par); 4343 while ((par_tmp = par->nc_parent) != NULL && 4344 par_tmp->nc_vp == NULL) { 4345 _cache_hold(par_tmp); 4346 _cache_lock(par_tmp); 4347 _cache_put(par); 4348 par = par_tmp; 4349 } 4350 if (par->nc_parent == NULL) { 4351 kprintf("EXDEV case 2 %*.*s\n", 4352 par->nc_nlen, par->nc_nlen, par->nc_name); 4353 _cache_put(par); 4354 return (EXDEV); 4355 } 4356 4357 /* 4358 * The parent is not set in stone, ref and lock it to prevent 4359 * it from disappearing. Also note that due to renames it 4360 * is possible for our ncp to move and for par to no longer 4361 * be one of its parents. We resolve it anyway, the loop 4362 * will handle any moves. 
4363 */
4364 _cache_get(par);	/* additional hold/lock */
4365 _cache_put(par);	/* from earlier hold/lock */
4366 if (par == nch->mount->mnt_ncmountpt.ncp) {
4367 cache_resolve_mp(nch->mount);
4368 } else if ((dvp = cache_dvpref(par)) == NULL) {
4369 kprintf("[diagnostic] cache_resolve: raced on %*.*s\n",
4370 par->nc_nlen, par->nc_nlen, par->nc_name);
4371 _cache_put(par);
4372 continue;
4373 } else {
4374 if (par->nc_flag & NCF_UNRESOLVED) {
4375 nctmp.mount = mp;
4376 nctmp.ncp = par;
4377 par->nc_error = VOP_NRESOLVE(&nctmp, dvp, cred);
4378 }
4379 vrele(dvp);
4380 }
4381 if ((error = par->nc_error) != 0) {
4382 if (par->nc_error != EAGAIN) {
4383 kprintf("EXDEV case 3 %*.*s error %d\n",
4384 par->nc_nlen, par->nc_nlen, par->nc_name,
4385 par->nc_error);
4386 _cache_put(par);
4387 return(error);
4388 }
4389 kprintf("[diagnostic] cache_resolve: EAGAIN par %p %*.*s\n",
4390 par, par->nc_nlen, par->nc_nlen, par->nc_name);
4391 }
4392 _cache_put(par);
4393 /* loop */
4394 }
4395 
4396 /*
4397 * We have a referenced dvp
4398 */
4399 *dvpp = dvp;
4400 return 0;
4401 }
4402 
4403 /*
4404 * Clean out negative cache entries when too many have accumulated.
4405 */
4406 static void
4407 _cache_cleanneg(long count)
4408 {
4409 struct pcpu_ncache *pn;
4410 struct namecache *ncp;
4411 static uint32_t neg_rover;
4412 uint32_t n;
4413 long vnegs;
4414 
4415 n = neg_rover++;	/* SMP heuristical, race ok */
4416 cpu_ccfence();
4417 n = n % (uint32_t)ncpus;
4418 
4419 /*
4420 * Normalize vfscache_negs and count. count is sometimes based
4421 * on vfscache_negs. vfscache_negs is heuristical and can sometimes
4422 * have crazy values.
4423 */
4424 vnegs = vfscache_negs;
4425 cpu_ccfence();
4426 if (vnegs <= MINNEG)
4427 vnegs = MINNEG;
4428 if (count < 1)
4429 count = 1;
4430 
4431 pn = &pcpu_ncache[n];
4432 spin_lock(&pn->neg_spin);
4433 count = pn->neg_count * count / vnegs + 1;
4434 spin_unlock(&pn->neg_spin);
4435 
4436 /*
4437 * Attempt to clean out the specified number of negative cache
4438 * entries.
4439 */
4440 while (count > 0) {
4441 spin_lock(&pn->neg_spin);
4442 ncp = TAILQ_FIRST(&pn->neg_list);
4443 if (ncp == NULL) {
4444 spin_unlock(&pn->neg_spin);
4445 break;
4446 }
4447 TAILQ_REMOVE(&pn->neg_list, ncp, nc_vnode);
4448 TAILQ_INSERT_TAIL(&pn->neg_list, ncp, nc_vnode);
4449 _cache_hold(ncp);
4450 spin_unlock(&pn->neg_spin);
4451 
4452 /*
4453 * This can race, so we must re-check that the ncp
4454 * is on the ncneg.list after successfully locking it.
4455 */
4456 if (_cache_lock_special(ncp) == 0) {
4457 if (ncp->nc_vp == NULL &&
4458 (ncp->nc_flag & NCF_UNRESOLVED) == 0)
4459 {
4460 cache_zap(ncp);
4461 } else {
4462 _cache_unlock(ncp);
4463 _cache_drop(ncp);
4464 }
4465 } else {
4466 _cache_drop(ncp);
4467 }
4468 --count;
4469 }
4470 }
4471 
4472 /*
4473 * Clean out positive cache entries when too many have accumulated.
4474 * Only leafs are cleaned out. LRU order is maintained and we rove
4475 * available cpus.
4476 */
4477 static void
4478 _cache_cleanpos(long count)
4479 {
4480 static volatile int rover;
4481 struct nchash_head *nchpp;
4482 struct namecache *ncp;
4483 int rover_copy;
4484 
4485 /*
4486 * Attempt to clean out the specified number of positive cache
4487 * entries.
4488 */
4489 while (count > 0) {
4490 rover_copy = ++rover;	/* MPSAFEENOUGH */
4491 cpu_ccfence();
4492 nchpp = NCHHASH(rover_copy);
4493 
4494 if (TAILQ_FIRST(&nchpp->list) == NULL) {
4495 --count;
4496 continue;
4497 }
4498 
4499 /*
4500 * Cycle ncp on list, ignore and do not move DUMMY
4501 * ncps.
These are temporary list iterators. 4502 * 4503 * We must cycle the ncp to the end of the list to 4504 * ensure that all ncp's have an equal chance of 4505 * being removed. 4506 */ 4507 spin_lock(&nchpp->spin); 4508 ncp = TAILQ_FIRST(&nchpp->list); 4509 while (ncp && (ncp->nc_flag & NCF_DUMMY)) 4510 ncp = TAILQ_NEXT(ncp, nc_hash); 4511 if (ncp) { 4512 TAILQ_REMOVE(&nchpp->list, ncp, nc_hash); 4513 TAILQ_INSERT_TAIL(&nchpp->list, ncp, nc_hash); 4514 _cache_hold(ncp); 4515 } 4516 spin_unlock(&nchpp->spin); 4517 4518 if (ncp) { 4519 if (_cache_lock_special(ncp) == 0) { 4520 cache_zap(ncp); 4521 } else { 4522 _cache_drop(ncp); 4523 } 4524 } 4525 --count; 4526 } 4527 } 4528 4529 /* 4530 * This is a kitchen sink function to clean out ncps which we 4531 * tried to zap from cache_drop() but failed because we were 4532 * unable to acquire the parent lock. 4533 * 4534 * Such entries can also be removed via cache_inval_vp(), such 4535 * as when unmounting. 4536 */ 4537 static void 4538 _cache_cleandefered(void) 4539 { 4540 struct nchash_head *nchpp; 4541 struct namecache *ncp; 4542 struct namecache dummy; 4543 int i; 4544 4545 /* 4546 * Create a list iterator. DUMMY indicates that this is a list 4547 * iterator, DESTROYED prevents matches by lookup functions. 4548 */ 4549 numdefered = 0; 4550 pcpu_ncache[mycpu->gd_cpuid].numdefered = 0; 4551 bzero(&dummy, sizeof(dummy)); 4552 dummy.nc_flag = NCF_DESTROYED | NCF_DUMMY; 4553 dummy.nc_refs = 1; 4554 4555 for (i = 0; i <= nchash; ++i) { 4556 nchpp = &nchashtbl[i]; 4557 4558 spin_lock(&nchpp->spin); 4559 TAILQ_INSERT_HEAD(&nchpp->list, &dummy, nc_hash); 4560 ncp = &dummy; 4561 while ((ncp = TAILQ_NEXT(ncp, nc_hash)) != NULL) { 4562 if ((ncp->nc_flag & NCF_DEFEREDZAP) == 0) 4563 continue; 4564 TAILQ_REMOVE(&nchpp->list, &dummy, nc_hash); 4565 TAILQ_INSERT_AFTER(&nchpp->list, ncp, &dummy, nc_hash); 4566 _cache_hold(ncp); 4567 spin_unlock(&nchpp->spin); 4568 if (_cache_lock_nonblock(ncp) == 0) { 4569 ncp->nc_flag &= ~NCF_DEFEREDZAP; 4570 _cache_unlock(ncp); 4571 } 4572 _cache_drop(ncp); 4573 spin_lock(&nchpp->spin); 4574 ncp = &dummy; 4575 } 4576 TAILQ_REMOVE(&nchpp->list, &dummy, nc_hash); 4577 spin_unlock(&nchpp->spin); 4578 } 4579 } 4580 4581 /* 4582 * Name cache initialization, from vfsinit() when we are booting 4583 */ 4584 void 4585 nchinit(void) 4586 { 4587 struct pcpu_ncache *pn; 4588 globaldata_t gd; 4589 int i; 4590 4591 /* 4592 * Per-cpu accounting and negative hit list 4593 */ 4594 pcpu_ncache = kmalloc(sizeof(*pcpu_ncache) * ncpus, 4595 M_VFSCACHEAUX, M_WAITOK|M_ZERO); 4596 for (i = 0; i < ncpus; ++i) { 4597 pn = &pcpu_ncache[i]; 4598 TAILQ_INIT(&pn->neg_list); 4599 spin_init(&pn->neg_spin, "ncneg"); 4600 spin_init(&pn->umount_spin, "ncumm"); 4601 } 4602 4603 /* 4604 * Initialise per-cpu namecache effectiveness statistics. 4605 */ 4606 for (i = 0; i < ncpus; ++i) { 4607 gd = globaldata_find(i); 4608 gd->gd_nchstats = &nchstats[i]; 4609 } 4610 4611 /* 4612 * Create a generous namecache hash table 4613 */ 4614 nchashtbl = hashinit_ext(vfs_inodehashsize(), 4615 sizeof(struct nchash_head), 4616 M_VFSCACHEAUX, &nchash); 4617 for (i = 0; i <= (int)nchash; ++i) { 4618 TAILQ_INIT(&nchashtbl[i].list); 4619 spin_init(&nchashtbl[i].spin, "nchinit_hash"); 4620 } 4621 for (i = 0; i < NCMOUNT_NUMCACHE; ++i) 4622 spin_init(&ncmount_cache[i].spin, "nchinit_cache"); 4623 nclockwarn = 5 * hz; 4624 } 4625 4626 /* 4627 * Called from start_init() to bootstrap the root filesystem. 
Returns
4628 * a referenced, unlocked namecache record to serve as a root or the
4629 * root of the system.
4630 *
4631 * Adjust our namecache counts
4632 */
4633 void
4634 cache_allocroot(struct nchandle *nch, struct mount *mp, struct vnode *vp)
4635 {
4636 struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid];
4637 
4638 atomic_add_long(&pn->vfscache_leafs, 1);
4639 atomic_add_long(&pn->vfscache_unres, 1);
4640 
4641 nch->ncp = cache_alloc(0);
4642 nch->mount = mp;
4643 _cache_mntref(mp);
4644 if (vp)
4645 _cache_setvp(nch->mount, nch->ncp, vp);
4646 }
4647 
4648 /*
4649 * vfs_cache_setroot()
4650 *
4651 * Create an association between the root of our namecache and
4652 * the root vnode. This routine may be called several times during
4653 * booting.
4654 *
4655 * If the caller intends to save the returned namecache pointer somewhere
4656 * it must cache_hold() it.
4657 */
4658 void
4659 vfs_cache_setroot(struct vnode *nvp, struct nchandle *nch)
4660 {
4661 struct vnode *ovp;
4662 struct nchandle onch;
4663 
4664 ovp = rootvnode;
4665 onch = rootnch;
4666 rootvnode = nvp;
4667 if (nch)
4668 rootnch = *nch;
4669 else
4670 cache_zero(&rootnch);
4671 if (ovp)
4672 vrele(ovp);
4673 if (onch.ncp)
4674 cache_drop(&onch);
4675 }
4676 
4677 /*
4678 * XXX OLD API COMPAT FUNCTION. This really messes up the new namecache
4679 * topology and is being removed as quickly as possible. The new VOP_N*()
4680 * API calls are required to make specific adjustments using the supplied
4681 * ncp pointers rather than just bogusly purging random vnodes.
4682 *
4683 * Invalidate all namecache entries to a particular vnode as well as
4684 * any direct children of that vnode in the namecache. This is a
4685 * 'catch all' purge used by filesystems that do not know any better.
4686 *
4687 * Note that the linkage between the vnode and its namecache entries will
4688 * be removed, but the namecache entries themselves might stay put due to
4689 * active references from elsewhere in the system or due to the existence of
4690 * the children. The namecache topology is left intact even if we do not
4691 * know what the vnode association is. Such entries will be marked
4692 * NCF_UNRESOLVED.
4693 */ 4694 void 4695 cache_purge(struct vnode *vp) 4696 { 4697 cache_inval_vp(vp, CINV_DESTROY | CINV_CHILDREN); 4698 } 4699 4700 __read_mostly static int disablecwd; 4701 SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0, 4702 "Disable getcwd"); 4703 4704 /* 4705 * MPALMOSTSAFE 4706 */ 4707 int 4708 sys___getcwd(struct sysmsg *sysmsg, const struct __getcwd_args *uap) 4709 { 4710 u_int buflen; 4711 int error; 4712 char *buf; 4713 char *bp; 4714 4715 if (disablecwd) 4716 return (ENODEV); 4717 4718 buflen = uap->buflen; 4719 if (buflen == 0) 4720 return (EINVAL); 4721 if (buflen > MAXPATHLEN) 4722 buflen = MAXPATHLEN; 4723 4724 buf = kmalloc(buflen, M_TEMP, M_WAITOK); 4725 bp = kern_getcwd(buf, buflen, &error); 4726 if (error == 0) 4727 error = copyout(bp, uap->buf, strlen(bp) + 1); 4728 kfree(buf, M_TEMP); 4729 return (error); 4730 } 4731 4732 char * 4733 kern_getcwd(char *buf, size_t buflen, int *error) 4734 { 4735 struct proc *p = curproc; 4736 char *bp; 4737 int i, slash_prefixed; 4738 struct filedesc *fdp; 4739 struct nchandle nch; 4740 struct namecache *ncp; 4741 4742 bp = buf; 4743 bp += buflen - 1; 4744 *bp = '\0'; 4745 fdp = p->p_fd; 4746 slash_prefixed = 0; 4747 4748 nch = fdp->fd_ncdir; 4749 ncp = nch.ncp; 4750 if (ncp) 4751 _cache_hold(ncp); 4752 4753 while (ncp && (ncp != fdp->fd_nrdir.ncp || 4754 nch.mount != fdp->fd_nrdir.mount) 4755 ) { 4756 if (ncp->nc_flag & NCF_DESTROYED) { 4757 _cache_drop(ncp); 4758 ncp = NULL; 4759 break; 4760 } 4761 /* 4762 * While traversing upwards if we encounter the root 4763 * of the current mount we have to skip to the mount point 4764 * in the underlying filesystem. 4765 */ 4766 if (ncp == nch.mount->mnt_ncmountpt.ncp) { 4767 nch = nch.mount->mnt_ncmounton; 4768 _cache_drop(ncp); 4769 ncp = nch.ncp; 4770 if (ncp) 4771 _cache_hold(ncp); 4772 continue; 4773 } 4774 4775 /* 4776 * Prepend the path segment 4777 */ 4778 for (i = ncp->nc_nlen - 1; i >= 0; i--) { 4779 if (bp == buf) { 4780 *error = ERANGE; 4781 bp = NULL; 4782 goto done; 4783 } 4784 *--bp = ncp->nc_name[i]; 4785 } 4786 if (bp == buf) { 4787 *error = ERANGE; 4788 bp = NULL; 4789 goto done; 4790 } 4791 *--bp = '/'; 4792 slash_prefixed = 1; 4793 4794 /* 4795 * Go up a directory. This isn't a mount point so we don't 4796 * have to check again. 4797 */ 4798 while ((nch.ncp = ncp->nc_parent) != NULL) { 4799 if (ncp_shared_lock_disable) 4800 _cache_lock(ncp); 4801 else 4802 _cache_lock_shared(ncp); 4803 if (nch.ncp != ncp->nc_parent) { 4804 _cache_unlock(ncp); 4805 continue; 4806 } 4807 _cache_hold(nch.ncp); 4808 _cache_unlock(ncp); 4809 break; 4810 } 4811 _cache_drop(ncp); 4812 ncp = nch.ncp; 4813 } 4814 if (ncp == NULL) { 4815 *error = ENOENT; 4816 bp = NULL; 4817 goto done; 4818 } 4819 if (!slash_prefixed) { 4820 if (bp == buf) { 4821 *error = ERANGE; 4822 bp = NULL; 4823 goto done; 4824 } 4825 *--bp = '/'; 4826 } 4827 *error = 0; 4828 done: 4829 if (ncp) 4830 _cache_drop(ncp); 4831 return (bp); 4832 } 4833 4834 /* 4835 * Thus begins the fullpath magic. 4836 * 4837 * The passed nchp is referenced but not locked. 
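 *
 * On success *retbuf points into a MAXPATHLEN buffer that is returned
 * via *freebuf; the caller is responsible for kfree()ing *freebuf
 * (M_TEMP) when it is done with the path.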
4838 */ 4839 __read_mostly static int disablefullpath; 4840 SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW, 4841 &disablefullpath, 0, 4842 "Disable fullpath lookups"); 4843 4844 int 4845 cache_fullpath(struct proc *p, struct nchandle *nchp, struct nchandle *nchbase, 4846 char **retbuf, char **freebuf, int guess) 4847 { 4848 struct nchandle fd_nrdir; 4849 struct nchandle nch; 4850 struct namecache *ncp; 4851 struct mount *mp, *new_mp; 4852 char *bp, *buf; 4853 int slash_prefixed; 4854 int error = 0; 4855 int i; 4856 4857 *retbuf = NULL; 4858 *freebuf = NULL; 4859 4860 buf = kmalloc(MAXPATHLEN, M_TEMP, M_WAITOK); 4861 bp = buf + MAXPATHLEN - 1; 4862 *bp = '\0'; 4863 if (nchbase) 4864 fd_nrdir = *nchbase; 4865 else if (p != NULL) 4866 fd_nrdir = p->p_fd->fd_nrdir; 4867 else 4868 fd_nrdir = rootnch; 4869 slash_prefixed = 0; 4870 nch = *nchp; 4871 ncp = nch.ncp; 4872 if (ncp) 4873 _cache_hold(ncp); 4874 mp = nch.mount; 4875 4876 while (ncp && (ncp != fd_nrdir.ncp || mp != fd_nrdir.mount)) { 4877 new_mp = NULL; 4878 4879 /* 4880 * If we are asked to guess the upwards path, we do so whenever 4881 * we encounter an ncp marked as a mountpoint. We try to find 4882 * the actual mountpoint by finding the mountpoint with this 4883 * ncp. 4884 */ 4885 if (guess && (ncp->nc_flag & NCF_ISMOUNTPT)) { 4886 new_mp = mount_get_by_nc(ncp); 4887 } 4888 /* 4889 * While traversing upwards if we encounter the root 4890 * of the current mount we have to skip to the mount point. 4891 */ 4892 if (ncp == mp->mnt_ncmountpt.ncp) { 4893 new_mp = mp; 4894 } 4895 if (new_mp) { 4896 nch = new_mp->mnt_ncmounton; 4897 _cache_drop(ncp); 4898 ncp = nch.ncp; 4899 if (ncp) 4900 _cache_hold(ncp); 4901 mp = nch.mount; 4902 continue; 4903 } 4904 4905 /* 4906 * Prepend the path segment 4907 */ 4908 for (i = ncp->nc_nlen - 1; i >= 0; i--) { 4909 if (bp == buf) { 4910 kfree(buf, M_TEMP); 4911 error = ENOMEM; 4912 goto done; 4913 } 4914 *--bp = ncp->nc_name[i]; 4915 } 4916 if (bp == buf) { 4917 kfree(buf, M_TEMP); 4918 error = ENOMEM; 4919 goto done; 4920 } 4921 *--bp = '/'; 4922 slash_prefixed = 1; 4923 4924 /* 4925 * Go up a directory. This isn't a mount point so we don't 4926 * have to check again. 4927 * 4928 * We can only safely access nc_parent with ncp held locked. 
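 * (The parent can change underneath us due to a concurrent rename,
 * which is why the loop below re-checks nc_parent after acquiring
 * the child's lock.)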
4929 */ 4930 while ((nch.ncp = ncp->nc_parent) != NULL) { 4931 _cache_lock_shared(ncp); 4932 if (nch.ncp != ncp->nc_parent) { 4933 _cache_unlock(ncp); 4934 continue; 4935 } 4936 _cache_hold(nch.ncp); 4937 _cache_unlock(ncp); 4938 break; 4939 } 4940 _cache_drop(ncp); 4941 ncp = nch.ncp; 4942 } 4943 if (ncp == NULL) { 4944 kfree(buf, M_TEMP); 4945 error = ENOENT; 4946 goto done; 4947 } 4948 4949 if (!slash_prefixed) { 4950 if (bp == buf) { 4951 kfree(buf, M_TEMP); 4952 error = ENOMEM; 4953 goto done; 4954 } 4955 *--bp = '/'; 4956 } 4957 *retbuf = bp; 4958 *freebuf = buf; 4959 error = 0; 4960 done: 4961 if (ncp) 4962 _cache_drop(ncp); 4963 return(error); 4964 } 4965 4966 int 4967 vn_fullpath(struct proc *p, struct vnode *vn, char **retbuf, 4968 char **freebuf, int guess) 4969 { 4970 struct namecache *ncp; 4971 struct nchandle nch; 4972 int error; 4973 4974 *freebuf = NULL; 4975 if (disablefullpath) 4976 return (ENODEV); 4977 4978 if (p == NULL) 4979 return (EINVAL); 4980 4981 /* vn is NULL, client wants us to use p->p_textvp */ 4982 if (vn == NULL) { 4983 if ((vn = p->p_textvp) == NULL) 4984 return (EINVAL); 4985 } 4986 spin_lock_shared(&vn->v_spin); 4987 TAILQ_FOREACH(ncp, &vn->v_namecache, nc_vnode) { 4988 if (ncp->nc_nlen) 4989 break; 4990 } 4991 if (ncp == NULL) { 4992 spin_unlock_shared(&vn->v_spin); 4993 return (EINVAL); 4994 } 4995 _cache_hold(ncp); 4996 spin_unlock_shared(&vn->v_spin); 4997 4998 nch.ncp = ncp; 4999 nch.mount = vn->v_mount; 5000 error = cache_fullpath(p, &nch, NULL, retbuf, freebuf, guess); 5001 _cache_drop(ncp); 5002 return (error); 5003 } 5004 5005 void 5006 vfscache_rollup_cpu(struct globaldata *gd) 5007 { 5008 struct pcpu_ncache *pn; 5009 long count; 5010 5011 if (pcpu_ncache == NULL) 5012 return; 5013 pn = &pcpu_ncache[gd->gd_cpuid]; 5014 5015 if (pn->vfscache_count) { 5016 count = atomic_swap_long(&pn->vfscache_count, 0); 5017 atomic_add_long(&vfscache_count, count); 5018 } 5019 if (pn->vfscache_leafs) { 5020 count = atomic_swap_long(&pn->vfscache_leafs, 0); 5021 atomic_add_long(&vfscache_leafs, count); 5022 } 5023 if (pn->vfscache_unres) { 5024 count = atomic_swap_long(&pn->vfscache_unres, 0); 5025 atomic_add_long(&vfscache_unres, count); 5026 } 5027 if (pn->vfscache_negs) { 5028 count = atomic_swap_long(&pn->vfscache_negs, 0); 5029 atomic_add_long(&vfscache_negs, count); 5030 } 5031 if (pn->numdefered) { 5032 count = atomic_swap_long(&pn->numdefered, 0); 5033 atomic_add_long(&numdefered, count); 5034 } 5035 } 5036
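
/*
 * Example (illustrative sketch only, intentionally not compiled): how a
 * hypothetical in-kernel consumer might use vn_fullpath() above to log the
 * path associated with a vnode.  The helper name and the use of curproc
 * are assumptions made for this example; the retbuf/freebuf handling
 * follows the cache_fullpath() contract.
 */
#if 0
static void
example_print_vnode_path(struct vnode *vp)
{
	char *retbuf;
	char *freebuf;

	if (vn_fullpath(curproc, vp, &retbuf, &freebuf, 0) == 0) {
		kprintf("vnode %p path %s\n", vp, retbuf);
		kfree(freebuf, M_TEMP);	/* retbuf points into freebuf */
	}
}
#endif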