xref: /freebsd/sys/ufs/ufs/ufs_dirhash.c (revision 7cc42f6d)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2001, 2002 Ian Dowse.  All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  */
27 
28 /*
29  * This implements a hash-based lookup scheme for UFS directories.
30  */
31 
32 #include <sys/cdefs.h>
33 __FBSDID("$FreeBSD$");
34 
35 #include "opt_ufs.h"
36 
37 #ifdef UFS_DIRHASH
38 
39 #include <sys/param.h>
40 #include <sys/systm.h>
41 #include <sys/kernel.h>
42 #include <sys/lock.h>
43 #include <sys/mutex.h>
44 #include <sys/malloc.h>
45 #include <sys/fnv_hash.h>
46 #include <sys/proc.h>
47 #include <sys/bio.h>
48 #include <sys/buf.h>
49 #include <sys/vnode.h>
50 #include <sys/mount.h>
51 #include <sys/refcount.h>
52 #include <sys/sysctl.h>
53 #include <sys/sx.h>
54 #include <sys/eventhandler.h>
55 #include <sys/time.h>
56 #include <vm/uma.h>
57 
58 #include <ufs/ufs/quota.h>
59 #include <ufs/ufs/inode.h>
60 #include <ufs/ufs/dir.h>
61 #include <ufs/ufs/dirhash.h>
62 #include <ufs/ufs/extattr.h>
63 #include <ufs/ufs/ufsmount.h>
64 #include <ufs/ufs/ufs_extern.h>
65 
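/*
 * WRAPINCR() and WRAPDECR() step an index forwards or backwards with
 * wrap-around at `limit', OFSFMT() detects a file system using the old
 * directory format, and BLKFREE2IDX() clamps a per-block free-space
 * count to the last index of the dh_firstfree[] summary array.
 */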
66 #define WRAPINCR(val, limit)	(((val) + 1 == (limit)) ? 0 : ((val) + 1))
67 #define WRAPDECR(val, limit)	(((val) == 0) ? ((limit) - 1) : ((val) - 1))
68 #define OFSFMT(vp)		((vp)->v_mount->mnt_maxsymlinklen <= 0)
69 #define BLKFREE2IDX(n)		((n) > DH_NFSTATS ? DH_NFSTATS : (n))
70 
71 static MALLOC_DEFINE(M_DIRHASH, "ufs_dirhash", "UFS directory hash tables");
72 
73 static int ufs_mindirhashsize = DIRBLKSIZ * 5;
74 SYSCTL_INT(_vfs_ufs, OID_AUTO, dirhash_minsize, CTLFLAG_RW,
75     &ufs_mindirhashsize,
76     0, "minimum directory size in bytes for which to use hashed lookup");
77 static int ufs_dirhashmaxmem = 2 * 1024 * 1024;	/* NOTE: initial value. It is
78 						   tuned in ufsdirhash_init() */
79 SYSCTL_INT(_vfs_ufs, OID_AUTO, dirhash_maxmem, CTLFLAG_RW, &ufs_dirhashmaxmem,
80     0, "maximum allowed dirhash memory usage");
81 static int ufs_dirhashmem;
82 SYSCTL_INT(_vfs_ufs, OID_AUTO, dirhash_mem, CTLFLAG_RD, &ufs_dirhashmem,
83     0, "current dirhash memory usage");
84 static int ufs_dirhashcheck = 0;
85 SYSCTL_INT(_vfs_ufs, OID_AUTO, dirhash_docheck, CTLFLAG_RW, &ufs_dirhashcheck,
86     0, "enable extra sanity tests");
87 static int ufs_dirhashlowmemcount = 0;
88 SYSCTL_INT(_vfs_ufs, OID_AUTO, dirhash_lowmemcount, CTLFLAG_RD,
89     &ufs_dirhashlowmemcount, 0, "number of times low memory hook called");
90 static int ufs_dirhashreclaimpercent = 10;
91 static int ufsdirhash_set_reclaimpercent(SYSCTL_HANDLER_ARGS);
92 SYSCTL_PROC(_vfs_ufs, OID_AUTO, dirhash_reclaimpercent,
93     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
94     0, 0, ufsdirhash_set_reclaimpercent, "I",
95     "set percentage of dirhash cache to be removed in low VM events");
96 
97 static int ufsdirhash_hash(struct dirhash *dh, char *name, int namelen);
98 static void ufsdirhash_adjfree(struct dirhash *dh, doff_t offset, int diff);
99 static void ufsdirhash_delslot(struct dirhash *dh, int slot);
100 static int ufsdirhash_findslot(struct dirhash *dh, char *name, int namelen,
101 	   doff_t offset);
102 static doff_t ufsdirhash_getprev(struct direct *dp, doff_t offset);
103 static int ufsdirhash_recycle(int wanted);
104 static void ufsdirhash_lowmem(void);
105 static void ufsdirhash_free_locked(struct inode *ip);
106 
107 static uma_zone_t	ufsdirhash_zone;
108 
109 #define DIRHASHLIST_LOCK() 		mtx_lock(&ufsdirhash_mtx)
110 #define DIRHASHLIST_UNLOCK() 		mtx_unlock(&ufsdirhash_mtx)
111 #define DIRHASH_BLKALLOC_WAITOK() 	uma_zalloc(ufsdirhash_zone, M_WAITOK)
112 #define DIRHASH_BLKFREE(ptr) 		uma_zfree(ufsdirhash_zone, (ptr))
113 #define	DIRHASH_ASSERT_LOCKED(dh)					\
114     sx_assert(&(dh)->dh_lock, SA_LOCKED)
115 
116 /* Dirhash list; recently-used entries are near the tail. */
117 static TAILQ_HEAD(, dirhash) ufsdirhash_list;
118 
119 /* Protects: ufsdirhash_list, `dh_list' field, ufs_dirhashmem. */
120 static struct mtx	ufsdirhash_mtx;
121 
122 /*
123  * Locking:
124  *
125  * The relationship between inode and dirhash is protected either by an
126  * exclusive vnode lock or the vnode interlock where a shared vnode lock
127  * may be used.  The dirhash_mtx is acquired after the dirhash lock.  To
128  * handle teardown races, code wishing to lock the dirhash for an inode
129  * when using a shared vnode lock must obtain a private reference on the
130  * dirhash while holding the vnode interlock.  They can drop it once they
131  * have obtained the dirhash lock and verified that the dirhash wasn't
132  * recycled while they waited for the dirhash lock.
133  *
134  * ufsdirhash_build() acquires a shared lock on the dirhash when it is
135  * successful.  This lock is released after a call to ufsdirhash_lookup().
136  *
137  * Functions requiring exclusive access use ufsdirhash_acquire() which may
138  * free a dirhash structure that was recycled by ufsdirhash_recycle().
139  *
140  * The dirhash lock may be held across I/O operations.
141  *
142  * WITNESS reports a lock order reversal between the "bufwait" lock
143  * and the "dirhash" lock.  However, this specific reversal will not
144  * cause a deadlock.  To get a deadlock, one would have to lock a
145  * buffer followed by the dirhash while a second thread locked a
146  * buffer while holding the dirhash lock.  The second order can happen
147  * under a shared or exclusive vnode lock for the associated directory
148  * in lookup().  The first order, however, can only happen under an
149  * exclusive vnode lock (e.g. unlink(), rename(), etc.).  Thus, for
150  * a thread to be doing a "bufwait" -> "dirhash" order, it has to hold
151  * an exclusive vnode lock.  That exclusive vnode lock will prevent
152  * any other threads from doing a "dirhash" -> "bufwait" order.
153  */
154 
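/*
 * Take a reference on a dirhash structure so that it cannot be freed
 * while the caller sleeps waiting for its lock.
 */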
155 static void
156 ufsdirhash_hold(struct dirhash *dh)
157 {
158 
159 	refcount_acquire(&dh->dh_refcount);
160 }
161 
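/*
 * Drop a reference on a dirhash structure; the final release destroys
 * the lock and frees the memory.
 */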
162 static void
163 ufsdirhash_drop(struct dirhash *dh)
164 {
165 
166 	if (refcount_release(&dh->dh_refcount)) {
167 		sx_destroy(&dh->dh_lock);
168 		free(dh, M_DIRHASH);
169 	}
170 }
171 
172 /*
173  * Release the lock on a dirhash.
174  */
175 static void
176 ufsdirhash_release(struct dirhash *dh)
177 {
178 
179 	sx_unlock(&dh->dh_lock);
180 }
181 
182 /*
183  * Either acquire an existing hash locked shared or create a new hash and
184  * return it exclusively locked.  May return NULL if the allocation fails.
185  *
186  * The vnode interlock is used to protect the i_dirhash pointer from
187  * simultaneous access while only a shared vnode lock is held.
188  */
189 static struct dirhash *
190 ufsdirhash_create(struct inode *ip)
191 {
192 	struct dirhash *ndh;
193 	struct dirhash *dh;
194 	struct vnode *vp;
195 	bool excl;
196 
197 	ndh = dh = NULL;
198 	vp = ip->i_vnode;
199 	excl = false;
200 	for (;;) {
201 		/* Racy check for i_dirhash to prefetch a dirhash structure. */
202 		if (ip->i_dirhash == NULL && ndh == NULL) {
203 			ndh = malloc(sizeof *dh, M_DIRHASH,
204 			    M_NOWAIT | M_ZERO);
205 			if (ndh == NULL)
206 				return (NULL);
207 			refcount_init(&ndh->dh_refcount, 1);
208 
209 			/*
210 			 * The DUPOK is to prevent warnings from the
211 			 * sx_slock() a few lines down which is safe
212 			 * since the duplicate lock in that case is
213 			 * the one for this dirhash we are creating
214 			 * now which has no external references until
215 			 * after this function returns.
216 			 */
217 			sx_init_flags(&ndh->dh_lock, "dirhash", SX_DUPOK);
218 			sx_xlock(&ndh->dh_lock);
219 		}
220 		/*
221 		 * Check i_dirhash.  If it's NULL just try to use a
222 		 * preallocated structure.  If none exists loop and try again.
223 		 */
224 		VI_LOCK(vp);
225 		dh = ip->i_dirhash;
226 		if (dh == NULL) {
227 			ip->i_dirhash = ndh;
228 			VI_UNLOCK(vp);
229 			if (ndh == NULL)
230 				continue;
231 			return (ndh);
232 		}
233 		ufsdirhash_hold(dh);
234 		VI_UNLOCK(vp);
235 
236 		/* Acquire a lock on existing hashes. */
237 		if (excl)
238 			sx_xlock(&dh->dh_lock);
239 		else
240 			sx_slock(&dh->dh_lock);
241 
242 		/* The hash could've been recycled while we were waiting. */
243 		VI_LOCK(vp);
244 		if (ip->i_dirhash != dh) {
245 			VI_UNLOCK(vp);
246 			ufsdirhash_release(dh);
247 			ufsdirhash_drop(dh);
248 			continue;
249 		}
250 		VI_UNLOCK(vp);
251 		ufsdirhash_drop(dh);
252 
253 		/* If the hash is still valid we've succeeded. */
254 		if (dh->dh_hash != NULL)
255 			break;
256 		/*
257 		 * If the hash is NULL it has been recycled.  Try to upgrade
258 		 * so we can recreate it.  If we fail the upgrade, drop our
259 		 * lock and try again.
260 		 */
261 		if (excl || sx_try_upgrade(&dh->dh_lock))
262 			break;
263 		sx_sunlock(&dh->dh_lock);
264 		excl = true;
265 	}
266 	/* Free the preallocated structure if it was not necessary. */
267 	if (ndh) {
268 		ufsdirhash_release(ndh);
269 		ufsdirhash_drop(ndh);
270 	}
271 	return (dh);
272 }
273 
274 /*
275  * Acquire an exclusive lock on an existing hash.  Requires an exclusive
276  * vnode lock to protect the i_dirhash pointer.  Hashes that have been
277  * recycled are reclaimed here and NULL is returned.
278  */
279 static struct dirhash *
280 ufsdirhash_acquire(struct inode *ip)
281 {
282 	struct dirhash *dh;
283 
284 	ASSERT_VOP_ELOCKED(ip->i_vnode, __FUNCTION__);
285 
286 	dh = ip->i_dirhash;
287 	if (dh == NULL)
288 		return (NULL);
289 	sx_xlock(&dh->dh_lock);
290 	if (dh->dh_hash != NULL)
291 		return (dh);
292 	ufsdirhash_free_locked(ip);
293 	return (NULL);
294 }
295 
296 /*
297  * Acquire exclusively and free the hash pointed to by ip.  Works with a
298  * shared or exclusive vnode lock.
299  */
300 void
301 ufsdirhash_free(struct inode *ip)
302 {
303 	struct dirhash *dh;
304 	struct vnode *vp;
305 
306 	vp = ip->i_vnode;
307 	for (;;) {
308 		/* Grab a reference on this inode's dirhash if it has one. */
309 		VI_LOCK(vp);
310 		dh = ip->i_dirhash;
311 		if (dh == NULL) {
312 			VI_UNLOCK(vp);
313 			return;
314 		}
315 		ufsdirhash_hold(dh);
316 		VI_UNLOCK(vp);
317 
318 		/* Exclusively lock the dirhash. */
319 		sx_xlock(&dh->dh_lock);
320 
321 		/* If this dirhash still belongs to this inode, then free it. */
322 		VI_LOCK(vp);
323 		if (ip->i_dirhash == dh) {
324 			VI_UNLOCK(vp);
325 			ufsdirhash_drop(dh);
326 			break;
327 		}
328 		VI_UNLOCK(vp);
329 
330 		/*
331 		 * This inode's dirhash has changed while we were
332 		 * waiting for the dirhash lock, so try again.
333 		 */
334 		ufsdirhash_release(dh);
335 		ufsdirhash_drop(dh);
336 	}
337 	ufsdirhash_free_locked(ip);
338 }
339 
340 /*
341  * Attempt to build up a hash table for the directory contents in
342  * inode 'ip'. Returns 0 on success, or -1 if the operation failed.
343  */
344 int
345 ufsdirhash_build(struct inode *ip)
346 {
347 	struct dirhash *dh;
348 	struct buf *bp = NULL;
349 	struct direct *ep;
350 	struct vnode *vp;
351 	doff_t bmask, pos;
352 	u_int dirblocks, i, narrays, nblocks, nslots;
353 	int j, memreqd, slot;
354 
355 	/* Take care of a decreased sysctl value. */
356 	while (ufs_dirhashmem > ufs_dirhashmaxmem) {
357 		if (ufsdirhash_recycle(0) != 0)
358 			return (-1);
359 		/* Recycled enough memory, so unlock the list. */
360 		DIRHASHLIST_UNLOCK();
361 	}
362 
363 	/* Check if we can/should use dirhash. */
364 	if (ip->i_size < ufs_mindirhashsize || OFSFMT(ip->i_vnode) ||
365 	    ip->i_effnlink == 0) {
366 		if (ip->i_dirhash)
367 			ufsdirhash_free(ip);
368 		return (-1);
369 	}
370 	dh = ufsdirhash_create(ip);
371 	if (dh == NULL)
372 		return (-1);
373 	if (dh->dh_hash != NULL)
374 		return (0);
375 
376 	vp = ip->i_vnode;
377 	/* Allocate 50% more entries than this dir size could ever need. */
378 	KASSERT(ip->i_size >= DIRBLKSIZ, ("ufsdirhash_build size"));
379 	nslots = ip->i_size / DIRECTSIZ(1);
380 	nslots = (nslots * 3 + 1) / 2;
381 	narrays = howmany(nslots, DH_NBLKOFF);
382 	nslots = narrays * DH_NBLKOFF;
383 	dirblocks = howmany(ip->i_size, DIRBLKSIZ);
384 	nblocks = (dirblocks * 3 + 1) / 2;
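	/*
	 * Estimate the memory this hash will consume: the dirhash structure
	 * itself, the array of hash block pointers, the hash blocks, and the
	 * per-block free-space summary array.
	 */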
385 	memreqd = sizeof(*dh) + narrays * sizeof(*dh->dh_hash) +
386 	    narrays * DH_NBLKOFF * sizeof(**dh->dh_hash) +
387 	    nblocks * sizeof(*dh->dh_blkfree);
388 	DIRHASHLIST_LOCK();
389 	if (memreqd + ufs_dirhashmem > ufs_dirhashmaxmem) {
390 		DIRHASHLIST_UNLOCK();
391 		if (memreqd > ufs_dirhashmaxmem / 2)
392 			goto fail;
393 		/* Try to free some space. */
394 		if (ufsdirhash_recycle(memreqd) != 0)
395 			goto fail;
396 		/* Enough was freed, and list has been locked. */
397 	}
398 	ufs_dirhashmem += memreqd;
399 	DIRHASHLIST_UNLOCK();
400 
401 	/* Initialise the hash table and block statistics. */
402 	dh->dh_memreq = memreqd;
403 	dh->dh_narrays = narrays;
404 	dh->dh_hlen = nslots;
405 	dh->dh_nblk = nblocks;
406 	dh->dh_dirblks = dirblocks;
407 	for (i = 0; i < DH_NFSTATS; i++)
408 		dh->dh_firstfree[i] = -1;
409 	dh->dh_firstfree[DH_NFSTATS] = 0;
410 	dh->dh_hused = 0;
411 	dh->dh_seqoff = -1;
412 	dh->dh_score = DH_SCOREINIT;
413 	dh->dh_lastused = time_second;
414 
415 	/*
416 	 * Use non-blocking mallocs so that we will revert to a linear
417 	 * lookup on failure rather than potentially blocking forever.
418 	 */
419 	dh->dh_hash = malloc(narrays * sizeof(dh->dh_hash[0]),
420 	    M_DIRHASH, M_NOWAIT | M_ZERO);
421 	if (dh->dh_hash == NULL)
422 		goto fail;
423 	dh->dh_blkfree = malloc(nblocks * sizeof(dh->dh_blkfree[0]),
424 	    M_DIRHASH, M_NOWAIT);
425 	if (dh->dh_blkfree == NULL)
426 		goto fail;
427 	for (i = 0; i < narrays; i++) {
428 		if ((dh->dh_hash[i] = DIRHASH_BLKALLOC_WAITOK()) == NULL)
429 			goto fail;
430 		for (j = 0; j < DH_NBLKOFF; j++)
431 			dh->dh_hash[i][j] = DIRHASH_EMPTY;
432 	}
433 	for (i = 0; i < dirblocks; i++)
434 		dh->dh_blkfree[i] = DIRBLKSIZ / DIRALIGN;
435 	bmask = vp->v_mount->mnt_stat.f_iosize - 1;
436 	pos = 0;
437 	while (pos < ip->i_size) {
438 		/* If necessary, get the next directory block. */
439 		if ((pos & bmask) == 0) {
440 			if (bp != NULL)
441 				brelse(bp);
442 			if (UFS_BLKATOFF(vp, (off_t)pos, NULL, &bp) != 0)
443 				goto fail;
444 		}
445 
446 		/* Add this entry to the hash. */
447 		ep = (struct direct *)((char *)bp->b_data + (pos & bmask));
448 		if (ep->d_reclen == 0 || ep->d_reclen >
449 		    DIRBLKSIZ - (pos & (DIRBLKSIZ - 1))) {
450 			/* Corrupted directory. */
451 			brelse(bp);
452 			goto fail;
453 		}
454 		if (ep->d_ino != 0) {
455 			/* Add the entry (simplified ufsdirhash_add). */
456 			slot = ufsdirhash_hash(dh, ep->d_name, ep->d_namlen);
457 			while (DH_ENTRY(dh, slot) != DIRHASH_EMPTY)
458 				slot = WRAPINCR(slot, dh->dh_hlen);
459 			dh->dh_hused++;
460 			DH_ENTRY(dh, slot) = pos;
461 			ufsdirhash_adjfree(dh, pos, -DIRSIZ(0, ep));
462 		}
463 		pos += ep->d_reclen;
464 	}
465 
466 	if (bp != NULL)
467 		brelse(bp);
468 	DIRHASHLIST_LOCK();
469 	TAILQ_INSERT_TAIL(&ufsdirhash_list, dh, dh_list);
470 	dh->dh_onlist = 1;
471 	DIRHASHLIST_UNLOCK();
472 	sx_downgrade(&dh->dh_lock);
473 	return (0);
474 
475 fail:
476 	ufsdirhash_free_locked(ip);
477 	return (-1);
478 }
479 
480 /*
481  * Free any hash table associated with inode 'ip'.
482  */
483 static void
484 ufsdirhash_free_locked(struct inode *ip)
485 {
486 	struct dirhash *dh;
487 	struct vnode *vp;
488 	int i;
489 
490 	DIRHASH_ASSERT_LOCKED(ip->i_dirhash);
491 
492 	/*
493 	 * Clear the pointer in the inode to prevent new threads from
494 	 * finding the dead structure.
495 	 */
496 	vp = ip->i_vnode;
497 	VI_LOCK(vp);
498 	dh = ip->i_dirhash;
499 	ip->i_dirhash = NULL;
500 	VI_UNLOCK(vp);
501 
502 	/*
503 	 * Remove the hash from the list since we are going to free its
504 	 * memory.
505 	 */
506 	DIRHASHLIST_LOCK();
507 	if (dh->dh_onlist)
508 		TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list);
509 	ufs_dirhashmem -= dh->dh_memreq;
510 	DIRHASHLIST_UNLOCK();
511 
512 	/*
513 	 * At this point, any waiters for the lock should hold their
514 	 * own reference on the dirhash structure.  They will drop
515 	 * that reference once they grab the vnode interlock and see
516 	 * that ip->i_dirhash is NULL.
517 	 */
518 	sx_xunlock(&dh->dh_lock);
519 
520 	/*
521 	 * Handle partially recycled as well as fully constructed hashes.
522 	 */
523 	if (dh->dh_hash != NULL) {
524 		for (i = 0; i < dh->dh_narrays; i++)
525 			if (dh->dh_hash[i] != NULL)
526 				DIRHASH_BLKFREE(dh->dh_hash[i]);
527 		free(dh->dh_hash, M_DIRHASH);
528 		if (dh->dh_blkfree != NULL)
529 			free(dh->dh_blkfree, M_DIRHASH);
530 	}
531 
532 	/*
533 	 * Drop the inode's reference to the data structure.
534 	 */
535 	ufsdirhash_drop(dh);
536 }
537 
538 /*
539  * Find the offset of the specified name within the given inode.
540  * Returns 0 on success, ENOENT if the entry does not exist, or
541  * EJUSTRETURN if the caller should revert to a linear search.
542  *
543  * If successful, the directory offset is stored in *offp, and a
544  * pointer to a struct buf containing the entry is stored in *bpp. If
545  * prevoffp is non-NULL, the offset of the previous entry within
546  * the DIRBLKSIZ-sized block is stored in *prevoffp (if the entry
547  * is the first in a block, the start of the block is used).
548  *
549  * Must be called with the hash locked.  Returns with the hash unlocked.
550  */
551 int
552 ufsdirhash_lookup(struct inode *ip, char *name, int namelen, doff_t *offp,
553     struct buf **bpp, doff_t *prevoffp)
554 {
555 	struct dirhash *dh, *dh_next;
556 	struct direct *dp;
557 	struct vnode *vp;
558 	struct buf *bp;
559 	doff_t blkoff, bmask, offset, prevoff, seqoff;
560 	int i, slot;
561 	int error;
562 
563 	dh = ip->i_dirhash;
564 	KASSERT(dh != NULL && dh->dh_hash != NULL,
565 	    ("ufsdirhash_lookup: Invalid dirhash %p\n", dh));
566 	DIRHASH_ASSERT_LOCKED(dh);
567 	/*
568 	 * Move this dirhash towards the end of the list if it has a
569 	 * score higher than the next entry, and acquire the dh_lock.
570 	 */
571 	DIRHASHLIST_LOCK();
572 	if (TAILQ_NEXT(dh, dh_list) != NULL) {
573 		/*
574 		 * If the new score will be greater than that of the next
575 		 * entry, then move this entry past it. With both mutexes
576 		 * held, dh_next won't go away, but its dh_score could
577 		 * change; that's not important since it is just a hint.
578 		 */
579 		if ((dh_next = TAILQ_NEXT(dh, dh_list)) != NULL &&
580 		    dh->dh_score >= dh_next->dh_score) {
581 			KASSERT(dh->dh_onlist, ("dirhash: not on list"));
582 			TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list);
583 			TAILQ_INSERT_AFTER(&ufsdirhash_list, dh_next, dh,
584 			    dh_list);
585 		}
586 	}
587 	/* Update the score. */
588 	if (dh->dh_score < DH_SCOREMAX)
589 		dh->dh_score++;
590 
591 	/* Update last used time. */
592 	dh->dh_lastused = time_second;
593 	DIRHASHLIST_UNLOCK();
594 
595 	vp = ip->i_vnode;
596 	bmask = vp->v_mount->mnt_stat.f_iosize - 1;
597 	blkoff = -1;
598 	bp = NULL;
599 	seqoff = dh->dh_seqoff;
600 restart:
601 	slot = ufsdirhash_hash(dh, name, namelen);
602 
603 	if (seqoff != -1) {
604 		/*
605 		 * Sequential access optimisation. seqoff contains the
606 		 * offset of the directory entry immediately following
607 		 * the last entry that was looked up. Check if this offset
608 		 * appears in the hash chain for the name we are looking for.
609 		 */
610 		for (i = slot; (offset = DH_ENTRY(dh, i)) != DIRHASH_EMPTY;
611 		    i = WRAPINCR(i, dh->dh_hlen))
612 			if (offset == seqoff)
613 				break;
614 		if (offset == seqoff) {
615 			/*
616 			 * We found an entry with the expected offset. This
617 			 * is probably the entry we want, but if not, the
618 			 * code below will retry.
619 			 */
620 			slot = i;
621 		} else
622 			seqoff = -1;
623 	}
624 
625 	for (; (offset = DH_ENTRY(dh, slot)) != DIRHASH_EMPTY;
626 	    slot = WRAPINCR(slot, dh->dh_hlen)) {
627 		if (offset == DIRHASH_DEL)
628 			continue;
629 		if (offset < 0 || offset >= ip->i_size)
630 			panic("ufsdirhash_lookup: bad offset in hash array");
631 		if ((offset & ~bmask) != blkoff) {
632 			if (bp != NULL)
633 				brelse(bp);
634 			blkoff = offset & ~bmask;
635 			if (UFS_BLKATOFF(vp, (off_t)blkoff, NULL, &bp) != 0) {
636 				error = EJUSTRETURN;
637 				goto fail;
638 			}
639 		}
640 		KASSERT(bp != NULL, ("no buffer allocated"));
641 		dp = (struct direct *)(bp->b_data + (offset & bmask));
642 		if (dp->d_reclen == 0 || dp->d_reclen >
643 		    DIRBLKSIZ - (offset & (DIRBLKSIZ - 1))) {
644 			/* Corrupted directory. */
645 			error = EJUSTRETURN;
646 			goto fail;
647 		}
648 		if (dp->d_namlen == namelen &&
649 		    bcmp(dp->d_name, name, namelen) == 0) {
650 			/* Found. Get the prev offset if needed. */
651 			if (prevoffp != NULL) {
652 				if (offset & (DIRBLKSIZ - 1)) {
653 					prevoff = ufsdirhash_getprev(dp,
654 					    offset);
655 					if (prevoff == -1) {
656 						error = EJUSTRETURN;
657 						goto fail;
658 					}
659 				} else
660 					prevoff = offset;
661 				*prevoffp = prevoff;
662 			}
663 
664 			/* Update offset. */
665 			dh->dh_seqoff = offset + DIRSIZ(0, dp);
666 			*bpp = bp;
667 			*offp = offset;
668 			ufsdirhash_release(dh);
669 			return (0);
670 		}
671 
672 		/*
673 		 * When the name doesn't match in the sequential
674 		 * optimization case, go back and search normally.
675 		 */
676 		if (seqoff != -1) {
677 			seqoff = -1;
678 			goto restart;
679 		}
680 	}
681 	error = ENOENT;
682 fail:
683 	ufsdirhash_release(dh);
684 	if (bp != NULL)
685 		brelse(bp);
686 	return (error);
687 }
688 
689 /*
690  * Find a directory block with room for 'slotneeded' bytes. Returns
691  * the offset of the directory entry that begins the free space.
692  * This will either be the offset of an existing entry that has free
693  * space at the end, or the offset of an entry with d_ino == 0 at
694  * the start of a DIRBLKSIZ block.
695  *
696  * To use the space, the caller may need to compact existing entries in
697  * the directory. The total number of bytes in all of the entries involved
698  * in the compaction is stored in *slotsize. In other words, all of
699  * the entries that must be compacted are exactly contained in the
700  * region beginning at the returned offset and spanning *slotsize bytes.
701  *
702  * Returns -1 if no space was found, indicating that the directory
703  * must be extended.
704  */
705 doff_t
706 ufsdirhash_findfree(struct inode *ip, int slotneeded, int *slotsize)
707 {
708 	struct direct *dp;
709 	struct dirhash *dh;
710 	struct buf *bp;
711 	doff_t pos, slotstart;
712 	int dirblock, error, freebytes, i;
713 
714 	dh = ip->i_dirhash;
715 	KASSERT(dh != NULL && dh->dh_hash != NULL,
716 	    ("ufsdirhash_findfree: Invalid dirhash %p\n", dh));
717 	DIRHASH_ASSERT_LOCKED(dh);
718 
719 	/* Find a directory block with the desired free space. */
720 	dirblock = -1;
721 	for (i = howmany(slotneeded, DIRALIGN); i <= DH_NFSTATS; i++)
722 		if ((dirblock = dh->dh_firstfree[i]) != -1)
723 			break;
724 	if (dirblock == -1)
725 		return (-1);
726 
727 	KASSERT(dirblock < dh->dh_nblk &&
728 	    dh->dh_blkfree[dirblock] >= howmany(slotneeded, DIRALIGN),
729 	    ("ufsdirhash_findfree: bad stats"));
730 	pos = dirblock * DIRBLKSIZ;
731 	error = UFS_BLKATOFF(ip->i_vnode, (off_t)pos, (char **)&dp, &bp);
732 	if (error)
733 		return (-1);
734 
735 	/* Find the first entry with free space. */
736 	for (i = 0; i < DIRBLKSIZ; ) {
737 		if (dp->d_reclen == 0) {
738 			brelse(bp);
739 			return (-1);
740 		}
741 		if (dp->d_ino == 0 || dp->d_reclen > DIRSIZ(0, dp))
742 			break;
743 		i += dp->d_reclen;
744 		dp = (struct direct *)((char *)dp + dp->d_reclen);
745 	}
746 	if (i > DIRBLKSIZ) {
747 		brelse(bp);
748 		return (-1);
749 	}
750 	slotstart = pos + i;
751 
752 	/* Find the range of entries needed to get enough space */
753 	freebytes = 0;
754 	while (i < DIRBLKSIZ && freebytes < slotneeded) {
755 		freebytes += dp->d_reclen;
756 		if (dp->d_ino != 0)
757 			freebytes -= DIRSIZ(0, dp);
758 		if (dp->d_reclen == 0) {
759 			brelse(bp);
760 			return (-1);
761 		}
762 		i += dp->d_reclen;
763 		dp = (struct direct *)((char *)dp + dp->d_reclen);
764 	}
765 	if (i > DIRBLKSIZ) {
766 		brelse(bp);
767 		return (-1);
768 	}
769 	if (freebytes < slotneeded)
770 		panic("ufsdirhash_findfree: free mismatch");
771 	brelse(bp);
772 	*slotsize = pos + i - slotstart;
773 	return (slotstart);
774 }
775 
776 /*
777  * Return the start of the unused space at the end of a directory, or
778  * -1 if there are no trailing unused blocks.
779  */
780 doff_t
781 ufsdirhash_enduseful(struct inode *ip)
782 {
783 
784 	struct dirhash *dh;
785 	int i;
786 
787 	dh = ip->i_dirhash;
788 	DIRHASH_ASSERT_LOCKED(dh);
789 	KASSERT(dh != NULL && dh->dh_hash != NULL,
790 	    ("ufsdirhash_enduseful: Invalid dirhash %p\n", dh));
791 
792 	if (dh->dh_blkfree[dh->dh_dirblks - 1] != DIRBLKSIZ / DIRALIGN)
793 		return (-1);
794 
795 	for (i = dh->dh_dirblks - 1; i >= 0; i--)
796 		if (dh->dh_blkfree[i] != DIRBLKSIZ / DIRALIGN)
797 			break;
798 
799 	return ((doff_t)(i + 1) * DIRBLKSIZ);
800 }
801 
802 /*
803  * Insert information into the hash about a new directory entry. dirp
804  * points to a struct direct containing the entry, and offset specifies
805  * the offset of this entry.
806  */
807 void
808 ufsdirhash_add(struct inode *ip, struct direct *dirp, doff_t offset)
809 {
810 	struct dirhash *dh;
811 	int slot;
812 
813 	if ((dh = ufsdirhash_acquire(ip)) == NULL)
814 		return;
815 
816 	KASSERT(offset < dh->dh_dirblks * DIRBLKSIZ,
817 	    ("ufsdirhash_add: bad offset"));
818 	/*
819 	 * Normal hash usage is < 66%. If the usage gets too high then
820 	 * remove the hash entirely and let it be rebuilt later.
821 	 */
822 	if (dh->dh_hused >= (dh->dh_hlen * 3) / 4) {
823 		ufsdirhash_free_locked(ip);
824 		return;
825 	}
826 
827 	/* Find a free hash slot (empty or deleted), and add the entry. */
828 	slot = ufsdirhash_hash(dh, dirp->d_name, dirp->d_namlen);
829 	while (DH_ENTRY(dh, slot) >= 0)
830 		slot = WRAPINCR(slot, dh->dh_hlen);
831 	if (DH_ENTRY(dh, slot) == DIRHASH_EMPTY)
832 		dh->dh_hused++;
833 	DH_ENTRY(dh, slot) = offset;
834 
835 	/* Update last used time. */
836 	dh->dh_lastused = time_second;
837 
838 	/* Update the per-block summary info. */
839 	ufsdirhash_adjfree(dh, offset, -DIRSIZ(0, dirp));
840 	ufsdirhash_release(dh);
841 }
842 
843 /*
844  * Remove the specified directory entry from the hash. The entry to remove
845  * is defined by the name in `dirp', which must exist at the specified
846  * `offset' within the directory.
847  */
848 void
849 ufsdirhash_remove(struct inode *ip, struct direct *dirp, doff_t offset)
850 {
851 	struct dirhash *dh;
852 	int slot;
853 
854 	if ((dh = ufsdirhash_acquire(ip)) == NULL)
855 		return;
856 
857 	KASSERT(offset < dh->dh_dirblks * DIRBLKSIZ,
858 	    ("ufsdirhash_remove: bad offset"));
859 	/* Find the entry */
860 	slot = ufsdirhash_findslot(dh, dirp->d_name, dirp->d_namlen, offset);
861 
862 	/* Remove the hash entry. */
863 	ufsdirhash_delslot(dh, slot);
864 
865 	/* Update the per-block summary info. */
866 	ufsdirhash_adjfree(dh, offset, DIRSIZ(0, dirp));
867 	ufsdirhash_release(dh);
868 }
869 
870 /*
871  * Change the offset associated with a directory entry in the hash. Used
872  * when compacting directory blocks.
873  */
874 void
875 ufsdirhash_move(struct inode *ip, struct direct *dirp, doff_t oldoff,
876     doff_t newoff)
877 {
878 	struct dirhash *dh;
879 	int slot;
880 
881 	if ((dh = ufsdirhash_acquire(ip)) == NULL)
882 		return;
883 
884 	KASSERT(oldoff < dh->dh_dirblks * DIRBLKSIZ &&
885 	    newoff < dh->dh_dirblks * DIRBLKSIZ,
886 	    ("ufsdirhash_move: bad offset"));
887 	/* Find the entry, and update the offset. */
888 	slot = ufsdirhash_findslot(dh, dirp->d_name, dirp->d_namlen, oldoff);
889 	DH_ENTRY(dh, slot) = newoff;
890 	ufsdirhash_release(dh);
891 }
892 
893 /*
894  * Inform dirhash that the directory has grown by one block that
895  * begins at offset (i.e. the new length is offset + DIRBLKSIZ).
896  */
897 void
898 ufsdirhash_newblk(struct inode *ip, doff_t offset)
899 {
900 	struct dirhash *dh;
901 	int block;
902 
903 	if ((dh = ufsdirhash_acquire(ip)) == NULL)
904 		return;
905 
906 	KASSERT(offset == dh->dh_dirblks * DIRBLKSIZ,
907 	    ("ufsdirhash_newblk: bad offset"));
908 	block = offset / DIRBLKSIZ;
909 	if (block >= dh->dh_nblk) {
910 		/* Out of space; must rebuild. */
911 		ufsdirhash_free_locked(ip);
912 		return;
913 	}
914 	dh->dh_dirblks = block + 1;
915 
916 	/* Account for the new free block. */
917 	dh->dh_blkfree[block] = DIRBLKSIZ / DIRALIGN;
918 	if (dh->dh_firstfree[DH_NFSTATS] == -1)
919 		dh->dh_firstfree[DH_NFSTATS] = block;
920 	ufsdirhash_release(dh);
921 }
922 
923 /*
924  * Inform dirhash that the directory is being truncated.
925  */
926 void
927 ufsdirhash_dirtrunc(struct inode *ip, doff_t offset)
928 {
929 	struct dirhash *dh;
930 	int block, i;
931 
932 	if ((dh = ufsdirhash_acquire(ip)) == NULL)
933 		return;
934 
935 	KASSERT(offset <= dh->dh_dirblks * DIRBLKSIZ,
936 	    ("ufsdirhash_dirtrunc: bad offset"));
937 	block = howmany(offset, DIRBLKSIZ);
938 	/*
939 	 * If the directory shrinks to less than 1/8 of dh_nblk blocks
940 	 * (about 20% of its original size due to the 50% extra added in
941 	 * ufsdirhash_build) then free it, and let the caller rebuild
942 	 * if necessary.
943 	 */
944 	if (block < dh->dh_nblk / 8 && dh->dh_narrays > 1) {
945 		ufsdirhash_free_locked(ip);
946 		return;
947 	}
948 
949 	/*
950 	 * Remove any `first free' information pertaining to the
951 	 * truncated blocks. All blocks we're removing should be
952 	 * completely unused.
953 	 */
954 	if (dh->dh_firstfree[DH_NFSTATS] >= block)
955 		dh->dh_firstfree[DH_NFSTATS] = -1;
956 	for (i = block; i < dh->dh_dirblks; i++)
957 		if (dh->dh_blkfree[i] != DIRBLKSIZ / DIRALIGN)
958 			panic("ufsdirhash_dirtrunc: blocks in use");
959 	for (i = 0; i < DH_NFSTATS; i++)
960 		if (dh->dh_firstfree[i] >= block)
961 			panic("ufsdirhash_dirtrunc: first free corrupt");
962 	dh->dh_dirblks = block;
963 	ufsdirhash_release(dh);
964 }
965 
966 /*
967  * Debugging function to check that the dirhash information about
968  * a directory block matches its actual contents. Panics if a mismatch
969  * is detected.
970  *
971  * On entry, `buf' should point to the start of an in-core
972  * DIRBLKSIZ-sized directory block, and `offset' should contain the
973  * offset from the start of the directory of that block.
974  */
975 void
976 ufsdirhash_checkblock(struct inode *ip, char *buf, doff_t offset)
977 {
978 	struct dirhash *dh;
979 	struct direct *dp;
980 	int block, ffslot, i, nfree;
981 
982 	if (!ufs_dirhashcheck)
983 		return;
984 	if ((dh = ufsdirhash_acquire(ip)) == NULL)
985 		return;
986 
987 	block = offset / DIRBLKSIZ;
988 	if ((offset & (DIRBLKSIZ - 1)) != 0 || block >= dh->dh_dirblks)
989 		panic("ufsdirhash_checkblock: bad offset");
990 
991 	nfree = 0;
992 	for (i = 0; i < DIRBLKSIZ; i += dp->d_reclen) {
993 		dp = (struct direct *)(buf + i);
994 		if (dp->d_reclen == 0 || i + dp->d_reclen > DIRBLKSIZ)
995 			panic("ufsdirhash_checkblock: bad dir");
996 
997 		if (dp->d_ino == 0) {
998 #if 0
999 			/*
1000 			 * XXX entries with d_ino == 0 should only occur
1001 			 * at the start of a DIRBLKSIZ block. However the
1002 			 * ufs code is tolerant of such entries at other
1003 			 * offsets, and fsck does not fix them.
1004 			 */
1005 			if (i != 0)
1006 				panic("ufsdirhash_checkblock: bad dir inode");
1007 #endif
1008 			nfree += dp->d_reclen;
1009 			continue;
1010 		}
1011 
1012 		/* Check that the entry exists (will panic if it doesn't). */
1013 		ufsdirhash_findslot(dh, dp->d_name, dp->d_namlen, offset + i);
1014 
1015 		nfree += dp->d_reclen - DIRSIZ(0, dp);
1016 	}
1017 	if (i != DIRBLKSIZ)
1018 		panic("ufsdirhash_checkblock: bad dir end");
1019 
1020 	if (dh->dh_blkfree[block] * DIRALIGN != nfree)
1021 		panic("ufsdirhash_checkblock: bad free count");
1022 
1023 	ffslot = BLKFREE2IDX(nfree / DIRALIGN);
1024 	for (i = 0; i <= DH_NFSTATS; i++)
1025 		if (dh->dh_firstfree[i] == block && i != ffslot)
1026 			panic("ufsdirhash_checkblock: bad first-free");
1027 	if (dh->dh_firstfree[ffslot] == -1)
1028 		panic("ufsdirhash_checkblock: missing first-free entry");
1029 	ufsdirhash_release(dh);
1030 }
1031 
1032 /*
1033  * Hash the specified filename into a dirhash slot.
1034  */
1035 static int
1036 ufsdirhash_hash(struct dirhash *dh, char *name, int namelen)
1037 {
1038 	u_int32_t hash;
1039 
1040 	/*
1041 	 * We hash the name and then some other bit of data that is
1042 	 * invariant over the dirhash's lifetime. Otherwise names
1043 	 * differing only in the last byte are placed close to one
1044 	 * another in the table, which is bad for linear probing.
1045 	 */
1046 	hash = fnv_32_buf(name, namelen, FNV1_32_INIT);
1047 	hash = fnv_32_buf(&dh, sizeof(dh), hash);
1048 	return (hash % dh->dh_hlen);
1049 }
1050 
1051 /*
1052  * Adjust the number of free bytes in the block containing `offset'
1053  * by the value specified by `diff'.
1054  *
1055  * The caller must ensure we have exclusive access to `dh'; normally
1056  * that means that dh_lock should be held, but this is also called
1057  * from ufsdirhash_build() where exclusive access can be assumed.
1058  */
1059 static void
1060 ufsdirhash_adjfree(struct dirhash *dh, doff_t offset, int diff)
1061 {
1062 	int block, i, nfidx, ofidx;
1063 
1064 	/* Update the per-block summary info. */
1065 	block = offset / DIRBLKSIZ;
1066 	KASSERT(block < dh->dh_nblk && block < dh->dh_dirblks,
1067 	     ("dirhash bad offset"));
1068 	ofidx = BLKFREE2IDX(dh->dh_blkfree[block]);
1069 	dh->dh_blkfree[block] = (int)dh->dh_blkfree[block] + (diff / DIRALIGN);
1070 	nfidx = BLKFREE2IDX(dh->dh_blkfree[block]);
1071 
1072 	/* Update the `first free' list if necessary. */
1073 	if (ofidx != nfidx) {
1074 		/* If removing, scan forward for the next block. */
1075 		if (dh->dh_firstfree[ofidx] == block) {
1076 			for (i = block + 1; i < dh->dh_dirblks; i++)
1077 				if (BLKFREE2IDX(dh->dh_blkfree[i]) == ofidx)
1078 					break;
1079 			dh->dh_firstfree[ofidx] = (i < dh->dh_dirblks) ? i : -1;
1080 		}
1081 
1082 		/* Make this the new `first free' if necessary */
1083 		if (dh->dh_firstfree[nfidx] > block ||
1084 		    dh->dh_firstfree[nfidx] == -1)
1085 			dh->dh_firstfree[nfidx] = block;
1086 	}
1087 }
1088 
1089 /*
1090  * Find the specified name which should have the specified offset.
1091  * Returns a slot number, and panics on failure.
1092  *
1093  * `dh' must be locked on entry and remains so on return.
1094  */
1095 static int
1096 ufsdirhash_findslot(struct dirhash *dh, char *name, int namelen, doff_t offset)
1097 {
1098 	int slot;
1099 
1100 	DIRHASH_ASSERT_LOCKED(dh);
1101 
1102 	/* Find the entry. */
1103 	KASSERT(dh->dh_hused < dh->dh_hlen, ("dirhash find full"));
1104 	slot = ufsdirhash_hash(dh, name, namelen);
1105 	while (DH_ENTRY(dh, slot) != offset &&
1106 	    DH_ENTRY(dh, slot) != DIRHASH_EMPTY)
1107 		slot = WRAPINCR(slot, dh->dh_hlen);
1108 	if (DH_ENTRY(dh, slot) != offset)
1109 		panic("ufsdirhash_findslot: '%.*s' not found", namelen, name);
1110 
1111 	return (slot);
1112 }
1113 
1114 /*
1115  * Remove the entry corresponding to the specified slot from the hash array.
1116  *
1117  * `dh' must be locked on entry and remains so on return.
1118  */
1119 static void
1120 ufsdirhash_delslot(struct dirhash *dh, int slot)
1121 {
1122 	int i;
1123 
1124 	DIRHASH_ASSERT_LOCKED(dh);
1125 
1126 	/* Mark the entry as deleted. */
1127 	DH_ENTRY(dh, slot) = DIRHASH_DEL;
1128 
1129 	/* If this is the end of a chain of DIRHASH_DEL slots, remove them. */
1130 	for (i = slot; DH_ENTRY(dh, i) == DIRHASH_DEL; )
1131 		i = WRAPINCR(i, dh->dh_hlen);
1132 	if (DH_ENTRY(dh, i) == DIRHASH_EMPTY) {
1133 		i = WRAPDECR(i, dh->dh_hlen);
1134 		while (DH_ENTRY(dh, i) == DIRHASH_DEL) {
1135 			DH_ENTRY(dh, i) = DIRHASH_EMPTY;
1136 			dh->dh_hused--;
1137 			i = WRAPDECR(i, dh->dh_hlen);
1138 		}
1139 		KASSERT(dh->dh_hused >= 0, ("ufsdirhash_delslot neg hlen"));
1140 	}
1141 }
1142 
1143 /*
1144  * Given a directory entry and its offset, find the offset of the
1145  * previous entry in the same DIRBLKSIZ-sized block. Returns an
1146  * offset, or -1 if there is no previous entry in the block or some
1147  * other problem occurred.
1148  */
1149 static doff_t
1150 ufsdirhash_getprev(struct direct *dirp, doff_t offset)
1151 {
1152 	struct direct *dp;
1153 	char *blkbuf;
1154 	doff_t blkoff, prevoff;
1155 	int entrypos, i;
1156 
1157 	blkoff = rounddown2(offset, DIRBLKSIZ);	/* offset of start of block */
1158 	entrypos = offset & (DIRBLKSIZ - 1);	/* entry relative to block */
1159 	blkbuf = (char *)dirp - entrypos;
1160 	prevoff = blkoff;
1161 
1162 	/* If `offset' is the start of a block, there is no previous entry. */
1163 	if (entrypos == 0)
1164 		return (-1);
1165 
1166 	/* Scan from the start of the block until we get to the entry. */
1167 	for (i = 0; i < entrypos; i += dp->d_reclen) {
1168 		dp = (struct direct *)(blkbuf + i);
1169 		if (dp->d_reclen == 0 || i + dp->d_reclen > entrypos)
1170 			return (-1);	/* Corrupted directory. */
1171 		prevoff = blkoff + i;
1172 	}
1173 	return (prevoff);
1174 }
1175 
1176 /*
1177  * Delete the given dirhash and reclaim its memory. Assumes that
1178  * ufsdirhash_list is locked, and leaves it locked. Also assumes
1179  * that dh is locked. Returns the amount of memory freed.
1180  */
1181 static int
1182 ufsdirhash_destroy(struct dirhash *dh)
1183 {
1184 	doff_t **hash;
1185 	u_int8_t *blkfree;
1186 	int i, mem, narrays;
1187 
1188 	KASSERT(dh->dh_hash != NULL, ("dirhash: NULL hash on list"));
1189 
1190 	/* Remove it from the list and detach its memory. */
1191 	TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list);
1192 	dh->dh_onlist = 0;
1193 	hash = dh->dh_hash;
1194 	dh->dh_hash = NULL;
1195 	blkfree = dh->dh_blkfree;
1196 	dh->dh_blkfree = NULL;
1197 	narrays = dh->dh_narrays;
1198 	mem = dh->dh_memreq;
1199 	dh->dh_memreq = 0;
1200 
1201 	/* Unlock dirhash and free the detached memory. */
1202 	ufsdirhash_release(dh);
1203 	for (i = 0; i < narrays; i++)
1204 		DIRHASH_BLKFREE(hash[i]);
1205 	free(hash, M_DIRHASH);
1206 	free(blkfree, M_DIRHASH);
1207 
1208 	/* Account for the returned memory. */
1209 	ufs_dirhashmem -= mem;
1210 
1211 	return (mem);
1212 }
1213 
1214 /*
1215  * Try to free up `wanted' bytes by stealing memory from existing
1216  * dirhashes. Returns zero with list locked if successful.
1217  */
1218 static int
1219 ufsdirhash_recycle(int wanted)
1220 {
1221 	struct dirhash *dh;
1222 
1223 	DIRHASHLIST_LOCK();
1224 	dh = TAILQ_FIRST(&ufsdirhash_list);
1225 	while (wanted + ufs_dirhashmem > ufs_dirhashmaxmem) {
1226 		/* Decrement the score; only recycle if it becomes zero. */
1227 		if (dh == NULL || --dh->dh_score > 0) {
1228 			DIRHASHLIST_UNLOCK();
1229 			return (-1);
1230 		}
1231 		/*
1232 		 * If we can't lock it, it's in use and we don't want to
1233 		 * recycle it anyway.
1234 		 */
1235 		if (!sx_try_xlock(&dh->dh_lock)) {
1236 			dh = TAILQ_NEXT(dh, dh_list);
1237 			continue;
1238 		}
1239 
1240 		ufsdirhash_destroy(dh);
1241 
1242 		/* Repeat if necessary. */
1243 		dh = TAILQ_FIRST(&ufsdirhash_list);
1244 	}
1245 	/* Success; return with list locked. */
1246 	return (0);
1247 }
1248 
1249 /*
1250  * Callback that frees some dirhashes when the system is low on virtual memory.
1251  */
1252 static void
1253 ufsdirhash_lowmem(void)
1254 {
1255 	struct dirhash *dh, *dh_temp;
1256 	int memfreed, memwanted;
1257 
1258 	ufs_dirhashlowmemcount++;
1259 	memfreed = 0;
1260 	memwanted = ufs_dirhashmem * ufs_dirhashreclaimpercent / 100;
1261 
1262 	DIRHASHLIST_LOCK();
1263 
1264 	/*
1265 	 * Reclaim up to memwanted from the oldest dirhashes. This will allow
1266 	 * us to make some progress when the system is running out of memory
1267  * without compromising the dynamic nature of the maximum age. If the
1268  * situation does not improve, lowmem will eventually be retriggered and
1269  * will free some other entry in the cache. The entries at the head of
1270  * the list should be the oldest. If during list traversal we can't get
1271  * a lock on the dirhash, it is skipped.
1272 	 */
1273 	TAILQ_FOREACH_SAFE(dh, &ufsdirhash_list, dh_list, dh_temp) {
1274 		if (sx_try_xlock(&dh->dh_lock))
1275 			memfreed += ufsdirhash_destroy(dh);
1276 		if (memfreed >= memwanted)
1277 			break;
1278 	}
1279 	DIRHASHLIST_UNLOCK();
1280 }
1281 
1282 static int
1283 ufsdirhash_set_reclaimpercent(SYSCTL_HANDLER_ARGS)
1284 {
1285 	int error, v;
1286 
1287 	v = ufs_dirhashreclaimpercent;
1288 	error = sysctl_handle_int(oidp, &v, v, req);
1289 	if (error)
1290 		return (error);
1291 	if (req->newptr == NULL)
1292 		return (error);
1293 	if (v == ufs_dirhashreclaimpercent)
1294 		return (0);
1295 
1296 	/* Refuse invalid percentages */
1297 	if (v < 0 || v > 100)
1298 		return (EINVAL);
1299 	ufs_dirhashreclaimpercent = v;
1300 	return (0);
1301 }
1302 
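/*
 * Module initialization: scale the dirhash memory limit with the buffer
 * cache (hibufspace / 64, but at least 2MB), create the UMA zone used
 * for hash block arrays, and register for VM low-memory events.
 */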
1303 void
1304 ufsdirhash_init(void)
1305 {
1306 	ufs_dirhashmaxmem = lmax(roundup(hibufspace / 64, PAGE_SIZE),
1307 	    2 * 1024 * 1024);
1308 
1309 	ufsdirhash_zone = uma_zcreate("DIRHASH", DH_NBLKOFF * sizeof(doff_t),
1310 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
1311 	mtx_init(&ufsdirhash_mtx, "dirhash list", NULL, MTX_DEF);
1312 	TAILQ_INIT(&ufsdirhash_list);
1313 
1314 	/* Register a callback function to handle low memory signals */
1315 	EVENTHANDLER_REGISTER(vm_lowmem, ufsdirhash_lowmem, NULL,
1316 	    EVENTHANDLER_PRI_FIRST);
1317 }
1318 
1319 void
1320 ufsdirhash_uninit(void)
1321 {
1322 	KASSERT(TAILQ_EMPTY(&ufsdirhash_list), ("ufsdirhash_uninit"));
1323 	uma_zdestroy(ufsdirhash_zone);
1324 	mtx_destroy(&ufsdirhash_mtx);
1325 }
1326 
1327 #endif /* UFS_DIRHASH */
1328