1 /* $NetBSD: ufs_dirhash.c,v 1.34 2009/10/05 23:48:08 rmind Exp $ */ 2 3 /* 4 * Copyright (c) 2001, 2002 Ian Dowse. All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25 * SUCH DAMAGE. 26 * 27 * $FreeBSD: src/sys/ufs/ufs/ufs_dirhash.c,v 1.3.2.8 2004/12/08 11:54:13 dwmalone Exp $ 28 */ 29 30 #include <sys/cdefs.h> 31 __KERNEL_RCSID(0, "$NetBSD: ufs_dirhash.c,v 1.34 2009/10/05 23:48:08 rmind Exp $"); 32 33 /* 34 * This implements a hash-based lookup scheme for UFS directories. 35 */ 36 37 #include <sys/param.h> 38 #include <sys/systm.h> 39 #include <sys/kernel.h> 40 #include <sys/kmem.h> 41 #include <sys/types.h> 42 #include <sys/hash.h> 43 #include <sys/proc.h> 44 #include <sys/buf.h> 45 #include <sys/vnode.h> 46 #include <sys/mount.h> 47 #include <sys/pool.h> 48 #include <sys/sysctl.h> 49 #include <sys/atomic.h> 50 51 #include <ufs/ufs/inode.h> 52 #include <ufs/ufs/dir.h> 53 #include <ufs/ufs/dirhash.h> 54 #include <ufs/ufs/ufsmount.h> 55 #include <ufs/ufs/ufs_bswap.h> 56 #include <ufs/ufs/ufs_extern.h> 57 58 #define WRAPINCR(val, limit) (((val) + 1 == (limit)) ? 0 : ((val) + 1)) 59 #define WRAPDECR(val, limit) (((val) == 0) ? ((limit) - 1) : ((val) - 1)) 60 #define OFSFMT(ip) ((ip)->i_ump->um_maxsymlinklen <= 0) 61 #define BLKFREE2IDX(n) ((n) > DH_NFSTATS ? DH_NFSTATS : (n)) 62 63 static u_int ufs_dirhashminblks = 5; 64 static u_int ufs_dirhashmaxmem = 2 * 1024 * 1024; 65 static u_int ufs_dirhashmem; 66 static u_int ufs_dirhashcheck = 0; 67 68 static int ufsdirhash_hash(struct dirhash *dh, const char *name, int namelen); 69 static void ufsdirhash_adjfree(struct dirhash *dh, doff_t offset, int diff, 70 int dirblksiz); 71 static void ufsdirhash_delslot(struct dirhash *dh, int slot); 72 static int ufsdirhash_findslot(struct dirhash *dh, const char *name, 73 int namelen, doff_t offset); 74 static doff_t ufsdirhash_getprev(struct direct *dp, doff_t offset, 75 int dirblksiz); 76 static int ufsdirhash_recycle(int wanted); 77 78 static pool_cache_t ufsdirhashblk_cache; 79 static pool_cache_t ufsdirhash_cache; 80 81 #define DIRHASHLIST_LOCK() mutex_enter(&ufsdirhash_lock) 82 #define DIRHASHLIST_UNLOCK() mutex_exit(&ufsdirhash_lock) 83 #define DIRHASH_LOCK(dh) mutex_enter(&(dh)->dh_lock) 84 #define DIRHASH_UNLOCK(dh) mutex_exit(&(dh)->dh_lock) 85 #define DIRHASH_BLKALLOC() \ 86 pool_cache_get(ufsdirhashblk_cache, PR_NOWAIT) 87 #define DIRHASH_BLKFREE(ptr) \ 88 pool_cache_put(ufsdirhashblk_cache, ptr) 89 90 /* Dirhash list; recently-used entries are near the tail. */ 91 static TAILQ_HEAD(, dirhash) ufsdirhash_list; 92 93 /* Protects: ufsdirhash_list, `dh_list' field, ufs_dirhashmem. */ 94 static kmutex_t ufsdirhash_lock; 95 96 static struct sysctllog *ufsdirhash_sysctl_log; 97 98 /* 99 * Locking order: 100 * ufsdirhash_lock 101 * dh_lock 102 * 103 * The dh_lock mutex should be acquired either via the inode lock, or via 104 * ufsdirhash_lock. Only the owner of the inode may free the associated 105 * dirhash, but anything can steal its memory and set dh_hash to NULL. 106 */ 107 108 /* 109 * Attempt to build up a hash table for the directory contents in 110 * inode 'ip'. Returns 0 on success, or -1 of the operation failed. 111 */ 112 int 113 ufsdirhash_build(struct inode *ip) 114 { 115 struct dirhash *dh; 116 struct buf *bp = NULL; 117 struct direct *ep; 118 struct vnode *vp; 119 doff_t bmask, pos; 120 int dirblocks, i, j, memreqd, nblocks, narrays, nslots, slot; 121 const int needswap = UFS_MPNEEDSWAP(ip->i_ump); 122 int dirblksiz = ip->i_ump->um_dirblksiz; 123 124 /* Check if we can/should use dirhash. */ 125 if (ip->i_dirhash == NULL) { 126 if (ip->i_size < (ufs_dirhashminblks * dirblksiz) || OFSFMT(ip)) 127 return (-1); 128 } else { 129 /* Hash exists, but sysctls could have changed. */ 130 if (ip->i_size < (ufs_dirhashminblks * dirblksiz) || 131 ufs_dirhashmem > ufs_dirhashmaxmem) { 132 ufsdirhash_free(ip); 133 return (-1); 134 } 135 /* Check if hash exists and is intact (note: unlocked read). */ 136 if (ip->i_dirhash->dh_hash != NULL) 137 return (0); 138 /* Free the old, recycled hash and build a new one. */ 139 ufsdirhash_free(ip); 140 } 141 142 /* Don't hash removed directories. */ 143 if (ip->i_nlink == 0) 144 return (-1); 145 146 vp = ip->i_vnode; 147 /* Allocate 50% more entries than this dir size could ever need. */ 148 KASSERT(ip->i_size >= dirblksiz); 149 nslots = ip->i_size / DIRECTSIZ(1); 150 nslots = (nslots * 3 + 1) / 2; 151 narrays = howmany(nslots, DH_NBLKOFF); 152 nslots = narrays * DH_NBLKOFF; 153 dirblocks = howmany(ip->i_size, dirblksiz); 154 nblocks = (dirblocks * 3 + 1) / 2; 155 156 memreqd = sizeof(*dh) + narrays * sizeof(*dh->dh_hash) + 157 narrays * DH_NBLKOFF * sizeof(**dh->dh_hash) + 158 nblocks * sizeof(*dh->dh_blkfree); 159 160 while (atomic_add_int_nv(&ufs_dirhashmem, memreqd) > 161 ufs_dirhashmaxmem) { 162 atomic_add_int(&ufs_dirhashmem, -memreqd); 163 if (memreqd > ufs_dirhashmaxmem / 2) 164 return (-1); 165 /* Try to free some space. */ 166 if (ufsdirhash_recycle(memreqd) != 0) 167 return (-1); 168 else 169 DIRHASHLIST_UNLOCK(); 170 } 171 172 /* 173 * Use non-blocking mallocs so that we will revert to a linear 174 * lookup on failure rather than potentially blocking forever. 175 */ 176 dh = pool_cache_get(ufsdirhash_cache, PR_NOWAIT); 177 if (dh == NULL) { 178 atomic_add_int(&ufs_dirhashmem, -memreqd); 179 return (-1); 180 } 181 memset(dh, 0, sizeof(*dh)); 182 mutex_init(&dh->dh_lock, MUTEX_DEFAULT, IPL_NONE); 183 DIRHASH_LOCK(dh); 184 dh->dh_hashsz = narrays * sizeof(dh->dh_hash[0]); 185 dh->dh_hash = kmem_zalloc(dh->dh_hashsz, KM_NOSLEEP); 186 dh->dh_blkfreesz = nblocks * sizeof(dh->dh_blkfree[0]); 187 dh->dh_blkfree = kmem_zalloc(dh->dh_blkfreesz, KM_NOSLEEP); 188 if (dh->dh_hash == NULL || dh->dh_blkfree == NULL) 189 goto fail; 190 for (i = 0; i < narrays; i++) { 191 if ((dh->dh_hash[i] = DIRHASH_BLKALLOC()) == NULL) 192 goto fail; 193 for (j = 0; j < DH_NBLKOFF; j++) 194 dh->dh_hash[i][j] = DIRHASH_EMPTY; 195 } 196 197 /* Initialise the hash table and block statistics. */ 198 dh->dh_narrays = narrays; 199 dh->dh_hlen = nslots; 200 dh->dh_nblk = nblocks; 201 dh->dh_dirblks = dirblocks; 202 for (i = 0; i < dirblocks; i++) 203 dh->dh_blkfree[i] = dirblksiz / DIRALIGN; 204 for (i = 0; i < DH_NFSTATS; i++) 205 dh->dh_firstfree[i] = -1; 206 dh->dh_firstfree[DH_NFSTATS] = 0; 207 dh->dh_seqopt = 0; 208 dh->dh_seqoff = 0; 209 dh->dh_score = DH_SCOREINIT; 210 ip->i_dirhash = dh; 211 212 bmask = VFSTOUFS(vp->v_mount)->um_mountp->mnt_stat.f_iosize - 1; 213 pos = 0; 214 while (pos < ip->i_size) { 215 if ((curcpu()->ci_schedstate.spc_flags & SPCF_SHOULDYIELD) 216 != 0) { 217 preempt(); 218 } 219 /* If necessary, get the next directory block. */ 220 if ((pos & bmask) == 0) { 221 if (bp != NULL) 222 brelse(bp, 0); 223 if (ufs_blkatoff(vp, (off_t)pos, NULL, &bp, false) != 0) 224 goto fail; 225 } 226 227 /* Add this entry to the hash. */ 228 ep = (struct direct *)((char *)bp->b_data + (pos & bmask)); 229 if (ep->d_reclen == 0 || ep->d_reclen > 230 dirblksiz - (pos & (dirblksiz - 1))) { 231 /* Corrupted directory. */ 232 brelse(bp, 0); 233 goto fail; 234 } 235 if (ep->d_ino != 0) { 236 /* Add the entry (simplified ufsdirhash_add). */ 237 slot = ufsdirhash_hash(dh, ep->d_name, ep->d_namlen); 238 while (DH_ENTRY(dh, slot) != DIRHASH_EMPTY) 239 slot = WRAPINCR(slot, dh->dh_hlen); 240 dh->dh_hused++; 241 DH_ENTRY(dh, slot) = pos; 242 ufsdirhash_adjfree(dh, pos, -DIRSIZ(0, ep, needswap), 243 dirblksiz); 244 } 245 pos += ep->d_reclen; 246 } 247 248 if (bp != NULL) 249 brelse(bp, 0); 250 DIRHASHLIST_LOCK(); 251 TAILQ_INSERT_TAIL(&ufsdirhash_list, dh, dh_list); 252 dh->dh_onlist = 1; 253 DIRHASH_UNLOCK(dh); 254 DIRHASHLIST_UNLOCK(); 255 return (0); 256 257 fail: 258 DIRHASH_UNLOCK(dh); 259 if (dh->dh_hash != NULL) { 260 for (i = 0; i < narrays; i++) 261 if (dh->dh_hash[i] != NULL) 262 DIRHASH_BLKFREE(dh->dh_hash[i]); 263 kmem_free(dh->dh_hash, dh->dh_hashsz); 264 } 265 if (dh->dh_blkfree != NULL) 266 kmem_free(dh->dh_blkfree, dh->dh_blkfreesz); 267 mutex_destroy(&dh->dh_lock); 268 pool_cache_put(ufsdirhash_cache, dh); 269 ip->i_dirhash = NULL; 270 atomic_add_int(&ufs_dirhashmem, -memreqd); 271 return (-1); 272 } 273 274 /* 275 * Free any hash table associated with inode 'ip'. 276 */ 277 void 278 ufsdirhash_free(struct inode *ip) 279 { 280 struct dirhash *dh; 281 int i, mem; 282 283 if ((dh = ip->i_dirhash) == NULL) 284 return; 285 286 if (dh->dh_onlist) { 287 DIRHASHLIST_LOCK(); 288 if (dh->dh_onlist) 289 TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list); 290 DIRHASHLIST_UNLOCK(); 291 } 292 293 /* The dirhash pointed to by 'dh' is exclusively ours now. */ 294 mem = sizeof(*dh); 295 if (dh->dh_hash != NULL) { 296 for (i = 0; i < dh->dh_narrays; i++) 297 DIRHASH_BLKFREE(dh->dh_hash[i]); 298 kmem_free(dh->dh_hash, dh->dh_hashsz); 299 kmem_free(dh->dh_blkfree, dh->dh_blkfreesz); 300 mem += dh->dh_hashsz; 301 mem += dh->dh_narrays * DH_NBLKOFF * sizeof(**dh->dh_hash); 302 mem += dh->dh_nblk * sizeof(*dh->dh_blkfree); 303 } 304 mutex_destroy(&dh->dh_lock); 305 pool_cache_put(ufsdirhash_cache, dh); 306 ip->i_dirhash = NULL; 307 308 atomic_add_int(&ufs_dirhashmem, -mem); 309 } 310 311 /* 312 * Find the offset of the specified name within the given inode. 313 * Returns 0 on success, ENOENT if the entry does not exist, or 314 * EJUSTRETURN if the caller should revert to a linear search. 315 * 316 * If successful, the directory offset is stored in *offp, and a 317 * pointer to a struct buf containing the entry is stored in *bpp. If 318 * prevoffp is non-NULL, the offset of the previous entry within 319 * the DIRBLKSIZ-sized block is stored in *prevoffp (if the entry 320 * is the first in a block, the start of the block is used). 321 */ 322 int 323 ufsdirhash_lookup(struct inode *ip, const char *name, int namelen, doff_t *offp, 324 struct buf **bpp, doff_t *prevoffp) 325 { 326 struct dirhash *dh, *dh_next; 327 struct direct *dp; 328 struct vnode *vp; 329 struct buf *bp; 330 doff_t blkoff, bmask, offset, prevoff; 331 int i, slot; 332 const int needswap = UFS_MPNEEDSWAP(ip->i_ump); 333 int dirblksiz = ip->i_ump->um_dirblksiz; 334 335 if ((dh = ip->i_dirhash) == NULL) 336 return (EJUSTRETURN); 337 338 /* 339 * Move this dirhash towards the end of the list if it has a 340 * score higher than the next entry, and acquire the dh_lock. 341 * Optimise the case where it's already the last by performing 342 * an unlocked read of the TAILQ_NEXT pointer. 343 * 344 * In both cases, end up holding just dh_lock. 345 */ 346 if (TAILQ_NEXT(dh, dh_list) != NULL) { 347 DIRHASHLIST_LOCK(); 348 DIRHASH_LOCK(dh); 349 /* 350 * If the new score will be greater than that of the next 351 * entry, then move this entry past it. With both mutexes 352 * held, dh_next won't go away, but its dh_score could 353 * change; that's not important since it is just a hint. 354 */ 355 if (dh->dh_hash != NULL && 356 (dh_next = TAILQ_NEXT(dh, dh_list)) != NULL && 357 dh->dh_score >= dh_next->dh_score) { 358 KASSERT(dh->dh_onlist); 359 TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list); 360 TAILQ_INSERT_AFTER(&ufsdirhash_list, dh_next, dh, 361 dh_list); 362 } 363 DIRHASHLIST_UNLOCK(); 364 } else { 365 /* Already the last, though that could change as we wait. */ 366 DIRHASH_LOCK(dh); 367 } 368 if (dh->dh_hash == NULL) { 369 DIRHASH_UNLOCK(dh); 370 ufsdirhash_free(ip); 371 return (EJUSTRETURN); 372 } 373 374 /* Update the score. */ 375 if (dh->dh_score < DH_SCOREMAX) 376 dh->dh_score++; 377 378 vp = ip->i_vnode; 379 bmask = VFSTOUFS(vp->v_mount)->um_mountp->mnt_stat.f_iosize - 1; 380 blkoff = -1; 381 bp = NULL; 382 restart: 383 slot = ufsdirhash_hash(dh, name, namelen); 384 385 if (dh->dh_seqopt) { 386 /* 387 * Sequential access optimisation. dh_seqoff contains the 388 * offset of the directory entry immediately following 389 * the last entry that was looked up. Check if this offset 390 * appears in the hash chain for the name we are looking for. 391 */ 392 for (i = slot; (offset = DH_ENTRY(dh, i)) != DIRHASH_EMPTY; 393 i = WRAPINCR(i, dh->dh_hlen)) 394 if (offset == dh->dh_seqoff) 395 break; 396 if (offset == dh->dh_seqoff) { 397 /* 398 * We found an entry with the expected offset. This 399 * is probably the entry we want, but if not, the 400 * code below will turn off seqoff and retry. 401 */ 402 slot = i; 403 } else 404 dh->dh_seqopt = 0; 405 } 406 407 for (; (offset = DH_ENTRY(dh, slot)) != DIRHASH_EMPTY; 408 slot = WRAPINCR(slot, dh->dh_hlen)) { 409 if (offset == DIRHASH_DEL) 410 continue; 411 412 if (offset < 0 || offset >= ip->i_size) 413 panic("ufsdirhash_lookup: bad offset in hash array"); 414 if ((offset & ~bmask) != blkoff) { 415 if (bp != NULL) 416 brelse(bp, 0); 417 blkoff = offset & ~bmask; 418 if (ufs_blkatoff(vp, (off_t)blkoff, 419 NULL, &bp, false) != 0) { 420 DIRHASH_UNLOCK(dh); 421 return (EJUSTRETURN); 422 } 423 } 424 dp = (struct direct *)((char *)bp->b_data + (offset & bmask)); 425 if (dp->d_reclen == 0 || dp->d_reclen > 426 dirblksiz - (offset & (dirblksiz - 1))) { 427 /* Corrupted directory. */ 428 DIRHASH_UNLOCK(dh); 429 brelse(bp, 0); 430 return (EJUSTRETURN); 431 } 432 if (dp->d_namlen == namelen && 433 memcmp(dp->d_name, name, namelen) == 0) { 434 /* Found. Get the prev offset if needed. */ 435 if (prevoffp != NULL) { 436 if (offset & (dirblksiz - 1)) { 437 prevoff = ufsdirhash_getprev(dp, 438 offset, dirblksiz); 439 if (prevoff == -1) { 440 brelse(bp, 0); 441 return (EJUSTRETURN); 442 } 443 } else 444 prevoff = offset; 445 *prevoffp = prevoff; 446 } 447 448 /* Check for sequential access, and update offset. */ 449 if (dh->dh_seqopt == 0 && dh->dh_seqoff == offset) 450 dh->dh_seqopt = 1; 451 dh->dh_seqoff = offset + DIRSIZ(0, dp, needswap); 452 DIRHASH_UNLOCK(dh); 453 454 *bpp = bp; 455 *offp = offset; 456 return (0); 457 } 458 459 if (dh->dh_hash == NULL) { 460 DIRHASH_UNLOCK(dh); 461 if (bp != NULL) 462 brelse(bp, 0); 463 ufsdirhash_free(ip); 464 return (EJUSTRETURN); 465 } 466 /* 467 * When the name doesn't match in the seqopt case, go back 468 * and search normally. 469 */ 470 if (dh->dh_seqopt) { 471 dh->dh_seqopt = 0; 472 goto restart; 473 } 474 } 475 DIRHASH_UNLOCK(dh); 476 if (bp != NULL) 477 brelse(bp, 0); 478 return (ENOENT); 479 } 480 481 /* 482 * Find a directory block with room for 'slotneeded' bytes. Returns 483 * the offset of the directory entry that begins the free space. 484 * This will either be the offset of an existing entry that has free 485 * space at the end, or the offset of an entry with d_ino == 0 at 486 * the start of a DIRBLKSIZ block. 487 * 488 * To use the space, the caller may need to compact existing entries in 489 * the directory. The total number of bytes in all of the entries involved 490 * in the compaction is stored in *slotsize. In other words, all of 491 * the entries that must be compacted are exactly contained in the 492 * region beginning at the returned offset and spanning *slotsize bytes. 493 * 494 * Returns -1 if no space was found, indicating that the directory 495 * must be extended. 496 */ 497 doff_t 498 ufsdirhash_findfree(struct inode *ip, int slotneeded, int *slotsize) 499 { 500 struct direct *dp; 501 struct dirhash *dh; 502 struct buf *bp; 503 doff_t pos, slotstart; 504 int dirblock, error, freebytes, i; 505 const int needswap = UFS_MPNEEDSWAP(ip->i_ump); 506 int dirblksiz = ip->i_ump->um_dirblksiz; 507 508 if ((dh = ip->i_dirhash) == NULL) 509 return (-1); 510 511 DIRHASH_LOCK(dh); 512 if (dh->dh_hash == NULL) { 513 DIRHASH_UNLOCK(dh); 514 ufsdirhash_free(ip); 515 return (-1); 516 } 517 518 /* Find a directory block with the desired free space. */ 519 dirblock = -1; 520 for (i = howmany(slotneeded, DIRALIGN); i <= DH_NFSTATS; i++) 521 if ((dirblock = dh->dh_firstfree[i]) != -1) 522 break; 523 if (dirblock == -1) { 524 DIRHASH_UNLOCK(dh); 525 return (-1); 526 } 527 528 KASSERT(dirblock < dh->dh_nblk && 529 dh->dh_blkfree[dirblock] >= howmany(slotneeded, DIRALIGN)); 530 pos = dirblock * dirblksiz; 531 error = ufs_blkatoff(ip->i_vnode, (off_t)pos, (void *)&dp, &bp, false); 532 if (error) { 533 DIRHASH_UNLOCK(dh); 534 return (-1); 535 } 536 /* Find the first entry with free space. */ 537 for (i = 0; i < dirblksiz; ) { 538 if (dp->d_reclen == 0) { 539 DIRHASH_UNLOCK(dh); 540 brelse(bp, 0); 541 return (-1); 542 } 543 if (dp->d_ino == 0 || dp->d_reclen > DIRSIZ(0, dp, needswap)) 544 break; 545 i += dp->d_reclen; 546 dp = (struct direct *)((char *)dp + dp->d_reclen); 547 } 548 if (i > dirblksiz) { 549 DIRHASH_UNLOCK(dh); 550 brelse(bp, 0); 551 return (-1); 552 } 553 slotstart = pos + i; 554 555 /* Find the range of entries needed to get enough space */ 556 freebytes = 0; 557 while (i < dirblksiz && freebytes < slotneeded) { 558 freebytes += dp->d_reclen; 559 if (dp->d_ino != 0) 560 freebytes -= DIRSIZ(0, dp, needswap); 561 if (dp->d_reclen == 0) { 562 DIRHASH_UNLOCK(dh); 563 brelse(bp, 0); 564 return (-1); 565 } 566 i += dp->d_reclen; 567 dp = (struct direct *)((char *)dp + dp->d_reclen); 568 } 569 if (i > dirblksiz) { 570 DIRHASH_UNLOCK(dh); 571 brelse(bp, 0); 572 return (-1); 573 } 574 if (freebytes < slotneeded) 575 panic("ufsdirhash_findfree: free mismatch"); 576 DIRHASH_UNLOCK(dh); 577 brelse(bp, 0); 578 *slotsize = pos + i - slotstart; 579 return (slotstart); 580 } 581 582 /* 583 * Return the start of the unused space at the end of a directory, or 584 * -1 if there are no trailing unused blocks. 585 */ 586 doff_t 587 ufsdirhash_enduseful(struct inode *ip) 588 { 589 struct dirhash *dh; 590 int i; 591 int dirblksiz = ip->i_ump->um_dirblksiz; 592 593 if ((dh = ip->i_dirhash) == NULL) 594 return (-1); 595 596 DIRHASH_LOCK(dh); 597 if (dh->dh_hash == NULL) { 598 DIRHASH_UNLOCK(dh); 599 ufsdirhash_free(ip); 600 return (-1); 601 } 602 603 if (dh->dh_blkfree[dh->dh_dirblks - 1] != dirblksiz / DIRALIGN) { 604 DIRHASH_UNLOCK(dh); 605 return (-1); 606 } 607 608 for (i = dh->dh_dirblks - 1; i >= 0; i--) 609 if (dh->dh_blkfree[i] != dirblksiz / DIRALIGN) 610 break; 611 DIRHASH_UNLOCK(dh); 612 return ((doff_t)(i + 1) * dirblksiz); 613 } 614 615 /* 616 * Insert information into the hash about a new directory entry. dirp 617 * points to a struct direct containing the entry, and offset specifies 618 * the offset of this entry. 619 */ 620 void 621 ufsdirhash_add(struct inode *ip, struct direct *dirp, doff_t offset) 622 { 623 struct dirhash *dh; 624 int slot; 625 const int needswap = UFS_MPNEEDSWAP(ip->i_ump); 626 int dirblksiz = ip->i_ump->um_dirblksiz; 627 628 if ((dh = ip->i_dirhash) == NULL) 629 return; 630 631 DIRHASH_LOCK(dh); 632 if (dh->dh_hash == NULL) { 633 DIRHASH_UNLOCK(dh); 634 ufsdirhash_free(ip); 635 return; 636 } 637 638 KASSERT(offset < dh->dh_dirblks * dirblksiz); 639 /* 640 * Normal hash usage is < 66%. If the usage gets too high then 641 * remove the hash entirely and let it be rebuilt later. 642 */ 643 if (dh->dh_hused >= (dh->dh_hlen * 3) / 4) { 644 DIRHASH_UNLOCK(dh); 645 ufsdirhash_free(ip); 646 return; 647 } 648 649 /* Find a free hash slot (empty or deleted), and add the entry. */ 650 slot = ufsdirhash_hash(dh, dirp->d_name, dirp->d_namlen); 651 while (DH_ENTRY(dh, slot) >= 0) 652 slot = WRAPINCR(slot, dh->dh_hlen); 653 if (DH_ENTRY(dh, slot) == DIRHASH_EMPTY) 654 dh->dh_hused++; 655 DH_ENTRY(dh, slot) = offset; 656 657 /* Update the per-block summary info. */ 658 ufsdirhash_adjfree(dh, offset, -DIRSIZ(0, dirp, needswap), dirblksiz); 659 DIRHASH_UNLOCK(dh); 660 } 661 662 /* 663 * Remove the specified directory entry from the hash. The entry to remove 664 * is defined by the name in `dirp', which must exist at the specified 665 * `offset' within the directory. 666 */ 667 void 668 ufsdirhash_remove(struct inode *ip, struct direct *dirp, doff_t offset) 669 { 670 struct dirhash *dh; 671 int slot; 672 const int needswap = UFS_MPNEEDSWAP(ip->i_ump); 673 int dirblksiz = ip->i_ump->um_dirblksiz; 674 675 if ((dh = ip->i_dirhash) == NULL) 676 return; 677 678 DIRHASH_LOCK(dh); 679 if (dh->dh_hash == NULL) { 680 DIRHASH_UNLOCK(dh); 681 ufsdirhash_free(ip); 682 return; 683 } 684 685 KASSERT(offset < dh->dh_dirblks * dirblksiz); 686 /* Find the entry */ 687 slot = ufsdirhash_findslot(dh, dirp->d_name, dirp->d_namlen, offset); 688 689 /* Remove the hash entry. */ 690 ufsdirhash_delslot(dh, slot); 691 692 /* Update the per-block summary info. */ 693 ufsdirhash_adjfree(dh, offset, DIRSIZ(0, dirp, needswap), dirblksiz); 694 DIRHASH_UNLOCK(dh); 695 } 696 697 /* 698 * Change the offset associated with a directory entry in the hash. Used 699 * when compacting directory blocks. 700 */ 701 void 702 ufsdirhash_move(struct inode *ip, struct direct *dirp, doff_t oldoff, 703 doff_t newoff) 704 { 705 struct dirhash *dh; 706 int slot; 707 708 if ((dh = ip->i_dirhash) == NULL) 709 return; 710 DIRHASH_LOCK(dh); 711 if (dh->dh_hash == NULL) { 712 DIRHASH_UNLOCK(dh); 713 ufsdirhash_free(ip); 714 return; 715 } 716 717 KASSERT(oldoff < dh->dh_dirblks * ip->i_ump->um_dirblksiz && 718 newoff < dh->dh_dirblks * ip->i_ump->um_dirblksiz); 719 /* Find the entry, and update the offset. */ 720 slot = ufsdirhash_findslot(dh, dirp->d_name, dirp->d_namlen, oldoff); 721 DH_ENTRY(dh, slot) = newoff; 722 DIRHASH_UNLOCK(dh); 723 } 724 725 /* 726 * Inform dirhash that the directory has grown by one block that 727 * begins at offset (i.e. the new length is offset + DIRBLKSIZ). 728 */ 729 void 730 ufsdirhash_newblk(struct inode *ip, doff_t offset) 731 { 732 struct dirhash *dh; 733 int block; 734 int dirblksiz = ip->i_ump->um_dirblksiz; 735 736 if ((dh = ip->i_dirhash) == NULL) 737 return; 738 DIRHASH_LOCK(dh); 739 if (dh->dh_hash == NULL) { 740 DIRHASH_UNLOCK(dh); 741 ufsdirhash_free(ip); 742 return; 743 } 744 745 KASSERT(offset == dh->dh_dirblks * dirblksiz); 746 block = offset / dirblksiz; 747 if (block >= dh->dh_nblk) { 748 /* Out of space; must rebuild. */ 749 DIRHASH_UNLOCK(dh); 750 ufsdirhash_free(ip); 751 return; 752 } 753 dh->dh_dirblks = block + 1; 754 755 /* Account for the new free block. */ 756 dh->dh_blkfree[block] = dirblksiz / DIRALIGN; 757 if (dh->dh_firstfree[DH_NFSTATS] == -1) 758 dh->dh_firstfree[DH_NFSTATS] = block; 759 DIRHASH_UNLOCK(dh); 760 } 761 762 /* 763 * Inform dirhash that the directory is being truncated. 764 */ 765 void 766 ufsdirhash_dirtrunc(struct inode *ip, doff_t offset) 767 { 768 struct dirhash *dh; 769 int block, i; 770 int dirblksiz = ip->i_ump->um_dirblksiz; 771 772 if ((dh = ip->i_dirhash) == NULL) 773 return; 774 775 DIRHASH_LOCK(dh); 776 if (dh->dh_hash == NULL) { 777 DIRHASH_UNLOCK(dh); 778 ufsdirhash_free(ip); 779 return; 780 } 781 782 KASSERT(offset <= dh->dh_dirblks * dirblksiz); 783 block = howmany(offset, dirblksiz); 784 /* 785 * If the directory shrinks to less than 1/8 of dh_nblk blocks 786 * (about 20% of its original size due to the 50% extra added in 787 * ufsdirhash_build) then free it, and let the caller rebuild 788 * if necessary. 789 */ 790 if (block < dh->dh_nblk / 8 && dh->dh_narrays > 1) { 791 DIRHASH_UNLOCK(dh); 792 ufsdirhash_free(ip); 793 return; 794 } 795 796 /* 797 * Remove any `first free' information pertaining to the 798 * truncated blocks. All blocks we're removing should be 799 * completely unused. 800 */ 801 if (dh->dh_firstfree[DH_NFSTATS] >= block) 802 dh->dh_firstfree[DH_NFSTATS] = -1; 803 for (i = block; i < dh->dh_dirblks; i++) 804 if (dh->dh_blkfree[i] != dirblksiz / DIRALIGN) 805 panic("ufsdirhash_dirtrunc: blocks in use"); 806 for (i = 0; i < DH_NFSTATS; i++) 807 if (dh->dh_firstfree[i] >= block) 808 panic("ufsdirhash_dirtrunc: first free corrupt"); 809 dh->dh_dirblks = block; 810 DIRHASH_UNLOCK(dh); 811 } 812 813 /* 814 * Debugging function to check that the dirhash information about 815 * a directory block matches its actual contents. Panics if a mismatch 816 * is detected. 817 * 818 * On entry, `sbuf' should point to the start of an in-core 819 * DIRBLKSIZ-sized directory block, and `offset' should contain the 820 * offset from the start of the directory of that block. 821 */ 822 void 823 ufsdirhash_checkblock(struct inode *ip, char *sbuf, doff_t offset) 824 { 825 struct dirhash *dh; 826 struct direct *dp; 827 int block, ffslot, i, nfree; 828 const int needswap = UFS_MPNEEDSWAP(ip->i_ump); 829 int dirblksiz = ip->i_ump->um_dirblksiz; 830 831 if (!ufs_dirhashcheck) 832 return; 833 if ((dh = ip->i_dirhash) == NULL) 834 return; 835 836 DIRHASH_LOCK(dh); 837 if (dh->dh_hash == NULL) { 838 DIRHASH_UNLOCK(dh); 839 ufsdirhash_free(ip); 840 return; 841 } 842 843 block = offset / dirblksiz; 844 if ((offset & (dirblksiz - 1)) != 0 || block >= dh->dh_dirblks) 845 panic("ufsdirhash_checkblock: bad offset"); 846 847 nfree = 0; 848 for (i = 0; i < dirblksiz; i += dp->d_reclen) { 849 dp = (struct direct *)(sbuf + i); 850 if (dp->d_reclen == 0 || i + dp->d_reclen > dirblksiz) 851 panic("ufsdirhash_checkblock: bad dir"); 852 853 if (dp->d_ino == 0) { 854 #if 0 855 /* 856 * XXX entries with d_ino == 0 should only occur 857 * at the start of a DIRBLKSIZ block. However the 858 * ufs code is tolerant of such entries at other 859 * offsets, and fsck does not fix them. 860 */ 861 if (i != 0) 862 panic("ufsdirhash_checkblock: bad dir inode"); 863 #endif 864 nfree += dp->d_reclen; 865 continue; 866 } 867 868 /* Check that the entry exists (will panic if it doesn't). */ 869 ufsdirhash_findslot(dh, dp->d_name, dp->d_namlen, offset + i); 870 871 nfree += dp->d_reclen - DIRSIZ(0, dp, needswap); 872 } 873 if (i != dirblksiz) 874 panic("ufsdirhash_checkblock: bad dir end"); 875 876 if (dh->dh_blkfree[block] * DIRALIGN != nfree) 877 panic("ufsdirhash_checkblock: bad free count"); 878 879 ffslot = BLKFREE2IDX(nfree / DIRALIGN); 880 for (i = 0; i <= DH_NFSTATS; i++) 881 if (dh->dh_firstfree[i] == block && i != ffslot) 882 panic("ufsdirhash_checkblock: bad first-free"); 883 if (dh->dh_firstfree[ffslot] == -1) 884 panic("ufsdirhash_checkblock: missing first-free entry"); 885 DIRHASH_UNLOCK(dh); 886 } 887 888 /* 889 * Hash the specified filename into a dirhash slot. 890 */ 891 static int 892 ufsdirhash_hash(struct dirhash *dh, const char *name, int namelen) 893 { 894 u_int32_t hash; 895 896 /* 897 * We hash the name and then some other bit of data that is 898 * invariant over the dirhash's lifetime. Otherwise names 899 * differing only in the last byte are placed close to one 900 * another in the table, which is bad for linear probing. 901 */ 902 hash = hash32_buf(name, namelen, HASH32_BUF_INIT); 903 hash = hash32_buf(&dh, sizeof(dh), hash); 904 return (hash % dh->dh_hlen); 905 } 906 907 /* 908 * Adjust the number of free bytes in the block containing `offset' 909 * by the value specified by `diff'. 910 * 911 * The caller must ensure we have exclusive access to `dh'; normally 912 * that means that dh_lock should be held, but this is also called 913 * from ufsdirhash_build() where exclusive access can be assumed. 914 */ 915 static void 916 ufsdirhash_adjfree(struct dirhash *dh, doff_t offset, int diff, int dirblksiz) 917 { 918 int block, i, nfidx, ofidx; 919 920 KASSERT(mutex_owned(&dh->dh_lock)); 921 922 /* Update the per-block summary info. */ 923 block = offset / dirblksiz; 924 KASSERT(block < dh->dh_nblk && block < dh->dh_dirblks); 925 ofidx = BLKFREE2IDX(dh->dh_blkfree[block]); 926 dh->dh_blkfree[block] = (int)dh->dh_blkfree[block] + (diff / DIRALIGN); 927 nfidx = BLKFREE2IDX(dh->dh_blkfree[block]); 928 929 /* Update the `first free' list if necessary. */ 930 if (ofidx != nfidx) { 931 /* If removing, scan forward for the next block. */ 932 if (dh->dh_firstfree[ofidx] == block) { 933 for (i = block + 1; i < dh->dh_dirblks; i++) 934 if (BLKFREE2IDX(dh->dh_blkfree[i]) == ofidx) 935 break; 936 dh->dh_firstfree[ofidx] = (i < dh->dh_dirblks) ? i : -1; 937 } 938 939 /* Make this the new `first free' if necessary */ 940 if (dh->dh_firstfree[nfidx] > block || 941 dh->dh_firstfree[nfidx] == -1) 942 dh->dh_firstfree[nfidx] = block; 943 } 944 } 945 946 /* 947 * Find the specified name which should have the specified offset. 948 * Returns a slot number, and panics on failure. 949 * 950 * `dh' must be locked on entry and remains so on return. 951 */ 952 static int 953 ufsdirhash_findslot(struct dirhash *dh, const char *name, int namelen, 954 doff_t offset) 955 { 956 int slot; 957 958 KASSERT(mutex_owned(&dh->dh_lock)); 959 960 /* Find the entry. */ 961 KASSERT(dh->dh_hused < dh->dh_hlen); 962 slot = ufsdirhash_hash(dh, name, namelen); 963 while (DH_ENTRY(dh, slot) != offset && 964 DH_ENTRY(dh, slot) != DIRHASH_EMPTY) 965 slot = WRAPINCR(slot, dh->dh_hlen); 966 if (DH_ENTRY(dh, slot) != offset) 967 panic("ufsdirhash_findslot: '%.*s' not found", namelen, name); 968 969 return (slot); 970 } 971 972 /* 973 * Remove the entry corresponding to the specified slot from the hash array. 974 * 975 * `dh' must be locked on entry and remains so on return. 976 */ 977 static void 978 ufsdirhash_delslot(struct dirhash *dh, int slot) 979 { 980 int i; 981 982 KASSERT(mutex_owned(&dh->dh_lock)); 983 984 /* Mark the entry as deleted. */ 985 DH_ENTRY(dh, slot) = DIRHASH_DEL; 986 987 /* If this is the end of a chain of DIRHASH_DEL slots, remove them. */ 988 for (i = slot; DH_ENTRY(dh, i) == DIRHASH_DEL; ) 989 i = WRAPINCR(i, dh->dh_hlen); 990 if (DH_ENTRY(dh, i) == DIRHASH_EMPTY) { 991 i = WRAPDECR(i, dh->dh_hlen); 992 while (DH_ENTRY(dh, i) == DIRHASH_DEL) { 993 DH_ENTRY(dh, i) = DIRHASH_EMPTY; 994 dh->dh_hused--; 995 i = WRAPDECR(i, dh->dh_hlen); 996 } 997 KASSERT(dh->dh_hused >= 0); 998 } 999 } 1000 1001 /* 1002 * Given a directory entry and its offset, find the offset of the 1003 * previous entry in the same DIRBLKSIZ-sized block. Returns an 1004 * offset, or -1 if there is no previous entry in the block or some 1005 * other problem occurred. 1006 */ 1007 static doff_t 1008 ufsdirhash_getprev(struct direct *dirp, doff_t offset, int dirblksiz) 1009 { 1010 struct direct *dp; 1011 char *blkbuf; 1012 doff_t blkoff, prevoff; 1013 int entrypos, i; 1014 1015 blkoff = offset & ~(dirblksiz - 1); /* offset of start of block */ 1016 entrypos = offset & (dirblksiz - 1); /* entry relative to block */ 1017 blkbuf = (char *)dirp - entrypos; 1018 prevoff = blkoff; 1019 1020 /* If `offset' is the start of a block, there is no previous entry. */ 1021 if (entrypos == 0) 1022 return (-1); 1023 1024 /* Scan from the start of the block until we get to the entry. */ 1025 for (i = 0; i < entrypos; i += dp->d_reclen) { 1026 dp = (struct direct *)(blkbuf + i); 1027 if (dp->d_reclen == 0 || i + dp->d_reclen > entrypos) 1028 return (-1); /* Corrupted directory. */ 1029 prevoff = blkoff + i; 1030 } 1031 return (prevoff); 1032 } 1033 1034 /* 1035 * Try to free up `wanted' bytes by stealing memory from existing 1036 * dirhashes. Returns zero with list locked if successful. 1037 */ 1038 static int 1039 ufsdirhash_recycle(int wanted) 1040 { 1041 struct dirhash *dh; 1042 doff_t **hash; 1043 u_int8_t *blkfree; 1044 int i, mem, narrays; 1045 size_t hashsz, blkfreesz; 1046 1047 DIRHASHLIST_LOCK(); 1048 while (wanted + ufs_dirhashmem > ufs_dirhashmaxmem) { 1049 /* Find a dirhash, and lock it. */ 1050 if ((dh = TAILQ_FIRST(&ufsdirhash_list)) == NULL) { 1051 DIRHASHLIST_UNLOCK(); 1052 return (-1); 1053 } 1054 DIRHASH_LOCK(dh); 1055 KASSERT(dh->dh_hash != NULL); 1056 1057 /* Decrement the score; only recycle if it becomes zero. */ 1058 if (--dh->dh_score > 0) { 1059 DIRHASH_UNLOCK(dh); 1060 DIRHASHLIST_UNLOCK(); 1061 return (-1); 1062 } 1063 1064 /* Remove it from the list and detach its memory. */ 1065 TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list); 1066 dh->dh_onlist = 0; 1067 hash = dh->dh_hash; 1068 hashsz = dh->dh_hashsz; 1069 dh->dh_hash = NULL; 1070 blkfree = dh->dh_blkfree; 1071 blkfreesz = dh->dh_blkfreesz; 1072 dh->dh_blkfree = NULL; 1073 narrays = dh->dh_narrays; 1074 mem = narrays * sizeof(*dh->dh_hash) + 1075 narrays * DH_NBLKOFF * sizeof(**dh->dh_hash) + 1076 dh->dh_nblk * sizeof(*dh->dh_blkfree); 1077 1078 /* Unlock everything, free the detached memory. */ 1079 DIRHASH_UNLOCK(dh); 1080 DIRHASHLIST_UNLOCK(); 1081 1082 for (i = 0; i < narrays; i++) 1083 DIRHASH_BLKFREE(hash[i]); 1084 kmem_free(hash, hashsz); 1085 kmem_free(blkfree, blkfreesz); 1086 1087 /* Account for the returned memory, and repeat if necessary. */ 1088 DIRHASHLIST_LOCK(); 1089 atomic_add_int(&ufs_dirhashmem, -mem); 1090 } 1091 /* Success. */ 1092 return (0); 1093 } 1094 1095 static void 1096 ufsdirhash_sysctl_init(void) 1097 { 1098 const struct sysctlnode *rnode, *cnode; 1099 1100 sysctl_createv(&ufsdirhash_sysctl_log, 0, NULL, &rnode, 1101 CTLFLAG_PERMANENT, 1102 CTLTYPE_NODE, "vfs", NULL, 1103 NULL, 0, NULL, 0, 1104 CTL_VFS, CTL_EOL); 1105 1106 sysctl_createv(&ufsdirhash_sysctl_log, 0, &rnode, &rnode, 1107 CTLFLAG_PERMANENT, 1108 CTLTYPE_NODE, "ufs", 1109 SYSCTL_DESCR("ufs"), 1110 NULL, 0, NULL, 0, 1111 CTL_CREATE, CTL_EOL); 1112 1113 sysctl_createv(&ufsdirhash_sysctl_log, 0, &rnode, &rnode, 1114 CTLFLAG_PERMANENT, 1115 CTLTYPE_NODE, "dirhash", 1116 SYSCTL_DESCR("dirhash"), 1117 NULL, 0, NULL, 0, 1118 CTL_CREATE, CTL_EOL); 1119 1120 sysctl_createv(&ufsdirhash_sysctl_log, 0, &rnode, &cnode, 1121 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 1122 CTLTYPE_INT, "minblocks", 1123 SYSCTL_DESCR("minimum hashed directory size in blocks"), 1124 NULL, 0, &ufs_dirhashminblks, 0, 1125 CTL_CREATE, CTL_EOL); 1126 1127 sysctl_createv(&ufsdirhash_sysctl_log, 0, &rnode, &cnode, 1128 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 1129 CTLTYPE_INT, "maxmem", 1130 SYSCTL_DESCR("maximum dirhash memory usage"), 1131 NULL, 0, &ufs_dirhashmaxmem, 0, 1132 CTL_CREATE, CTL_EOL); 1133 1134 sysctl_createv(&ufsdirhash_sysctl_log, 0, &rnode, &cnode, 1135 CTLFLAG_PERMANENT|CTLFLAG_READONLY, 1136 CTLTYPE_INT, "memused", 1137 SYSCTL_DESCR("current dirhash memory usage"), 1138 NULL, 0, &ufs_dirhashmem, 0, 1139 CTL_CREATE, CTL_EOL); 1140 1141 sysctl_createv(&ufsdirhash_sysctl_log, 0, &rnode, &cnode, 1142 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 1143 CTLTYPE_INT, "docheck", 1144 SYSCTL_DESCR("enable extra sanity checks"), 1145 NULL, 0, &ufs_dirhashcheck, 0, 1146 CTL_CREATE, CTL_EOL); 1147 } 1148 1149 void 1150 ufsdirhash_init(void) 1151 { 1152 1153 mutex_init(&ufsdirhash_lock, MUTEX_DEFAULT, IPL_NONE); 1154 ufsdirhashblk_cache = pool_cache_init(DH_NBLKOFF * sizeof(daddr_t), 0, 1155 0, 0, "dirhashblk", NULL, IPL_NONE, NULL, NULL, NULL); 1156 ufsdirhash_cache = pool_cache_init(sizeof(struct dirhash), 0, 1157 0, 0, "dirhash", NULL, IPL_NONE, NULL, NULL, NULL); 1158 TAILQ_INIT(&ufsdirhash_list); 1159 ufsdirhash_sysctl_init(); 1160 } 1161 1162 void 1163 ufsdirhash_done(void) 1164 { 1165 1166 KASSERT(TAILQ_EMPTY(&ufsdirhash_list)); 1167 pool_cache_destroy(ufsdirhashblk_cache); 1168 pool_cache_destroy(ufsdirhash_cache); 1169 mutex_destroy(&ufsdirhash_lock); 1170 sysctl_teardown(&ufsdirhash_sysctl_log); 1171 } 1172