xref: /freebsd/sys/kern/subr_blist.c (revision 6419bb52)
/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1998 Matthew Dillon.  All Rights Reserved.
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
 * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
/*
 * BLIST.C -	Bitmap allocator/deallocator, using a radix tree with hinting
 *
 *	This module implements a general bitmap allocator/deallocator.  The
 *	allocator eats around 2 bits per 'block'.  The module does not
 *	try to interpret the meaning of a 'block' other than to return
 *	SWAPBLK_NONE on an allocation failure.
 *
 *	A radix tree controls access to pieces of the bitmap, and includes
 *	auxiliary information at each interior node about the availability of
 *	contiguous free blocks in the subtree rooted at that node.  Two radix
 *	constants are involved: one for the size of the bitmaps contained in the
 *	leaf nodes (BLIST_BMAP_RADIX), and one for the number of descendants of
 *	each of the meta (interior) nodes (BLIST_META_RADIX).  Each subtree is
 *	associated with a range of blocks.  The root of any subtree stores a
 *	hint field that defines an upper bound on the size of the largest
 *	allocation that can begin in the associated block range.  A hint is an
 *	upper bound on a potential allocation, but not necessarily a tight upper
 *	bound.
 *
 *	The bitmap field in each node directs the search for available blocks.
 *	For a leaf node, a bit is set if the corresponding block is free.  For a
 *	meta node, a bit is set if the corresponding subtree contains a free
 *	block somewhere within it.  The search at a meta node considers only
 *	those children whose range includes a free block.
 *
 *	The hinting greatly increases code efficiency for allocations while
 *	the general radix structure optimizes both allocations and frees.  The
 *	radix tree should be able to operate well no matter how much
 *	fragmentation there is and no matter how large a bitmap is used.
 *
 *	The blist code wires all necessary memory at creation time.  Neither
 *	allocations nor frees require interaction with the memory subsystem.
 *	The non-blocking nature of allocations and frees is required by swap
 *	code (vm/swap_pager.c).
 *
 *	LAYOUT: The radix tree is laid out recursively using a linear array.
 *	Each meta node is immediately followed (laid out sequentially in
 *	memory) by BLIST_META_RADIX lower-level nodes.  This is a recursive
 *	structure, but one that can be easily scanned through with a very
 *	simple 'skip' calculation.  The memory allocation is only large enough
 *	to cover the number of blocks requested at creation time.  Nodes that
 *	represent blocks beyond that limit, nodes that would never be read
 *	or written, are not allocated, so the last of the BLIST_META_RADIX
 *	lower-level nodes of some meta nodes may not be allocated.
 *
 *	NOTE: the allocator cannot currently allocate more than
 *	BLIST_BMAP_RADIX blocks per call.  It will panic with 'allocation too
 *	large' if you try.  This is an area that could use improvement.  The
 *	radix is large enough that this restriction does not affect the swap
 *	system, though.  Currently only the allocation code is affected by
 *	this algorithmic unfeature.  The freeing code can handle arbitrary
 *	ranges.
 *
 *	This code can be compiled stand-alone for debugging.
 */
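
/*
 *	Illustrative layout (with BLIST_META_RADIX = 4 purely for
 *	readability; the real constants come from sys/blist.h): a tree
 *	with 16 leaves occupies 21 array slots,
 *
 *	    [root][m][l][l][l][l][m][l][l][l][l][m][l][l][l][l][m][l][l][l][l]
 *
 *	where each meta node 'm' is immediately followed by its 4 children,
 *	so the 'skip' distance from one child of the root to the next is 5.
 */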

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#ifdef _KERNEL

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/blist.h>
#include <sys/malloc.h>
#include <sys/sbuf.h>
#include <sys/proc.h>
#include <sys/mutex.h>

#else

#ifndef BLIST_NO_DEBUG
#define BLIST_DEBUG
#endif

#include <sys/errno.h>
#include <sys/types.h>
#include <sys/malloc.h>
#include <sys/sbuf.h>
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <stddef.h>
#include <stdlib.h>
#include <stdarg.h>
#include <stdbool.h>

#define	bitcount64(x)	__bitcount64((uint64_t)(x))
#define malloc(a,b,c)	calloc(a, 1)
#define free(a,b)	free(a)
#define ummin(a,b)	((a) < (b) ? (a) : (b))
#define imin(a,b)	((a) < (b) ? (a) : (b))
#define KASSERT(a,b)	assert(a)

#include <sys/blist.h>

#endif

/*
 * static support functions
 */
static daddr_t	blst_leaf_alloc(blmeta_t *scan, daddr_t blk,
    int *count, int maxcount);
static daddr_t	blst_meta_alloc(blmeta_t *scan, daddr_t cursor, int *count,
    int maxcount, u_daddr_t radix);
static void blst_leaf_free(blmeta_t *scan, daddr_t relblk, int count);
static void blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count,
		    u_daddr_t radix);
static void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix,
		    blist_t dest, daddr_t count);
static daddr_t blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count);
static daddr_t blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count,
		    u_daddr_t radix);
#ifndef _KERNEL
static void	blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix,
		    int tab);
#endif

#ifdef _KERNEL
static MALLOC_DEFINE(M_SWAP, "SWAP", "Swap space");
#endif

_Static_assert(BLIST_BMAP_RADIX % BLIST_META_RADIX == 0,
    "radix divisibility error");
#define	BLIST_BMAP_MASK	(BLIST_BMAP_RADIX - 1)
#define	BLIST_META_MASK	(BLIST_META_RADIX - 1)

/*
 * For a subtree that can represent the state of up to 'radix' blocks, the
 * number of leaf nodes of the subtree is L=radix/BLIST_BMAP_RADIX.  If 'm'
 * is short for BLIST_META_RADIX, then for a tree of height h with L=m**h
 * leaf nodes, the total number of tree nodes is 1 + m + m**2 + ... + m**h,
 * or, equivalently, (m**(h+1)-1)/(m-1).  This quantity is called 'skip'
 * in the 'meta' functions that process subtrees.  Since integer division
 * discards remainders, we can express this computation as
 * skip = (m * m**h) / (m - 1)
 * skip = (m * (radix / BLIST_BMAP_RADIX)) / (m - 1)
 * and since m divides BLIST_BMAP_RADIX, we can simplify further to
 * skip = (radix / (BLIST_BMAP_RADIX / m)) / (m - 1)
 * skip = radix / ((BLIST_BMAP_RADIX / m) * (m - 1))
 * so that simple integer division by a constant can safely be used for the
 * calculation.
 */
static inline daddr_t
radix_to_skip(daddr_t radix)
{

	return (radix /
	    ((BLIST_BMAP_RADIX / BLIST_META_RADIX) * BLIST_META_MASK));
}
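
/*
 * A worked instance of the formula above (with illustrative constants,
 * not necessarily the configured ones): if BLIST_BMAP_RADIX were 64 and
 * BLIST_META_RADIX were 16, a subtree with radix 1024 would have
 * L = 1024/64 = 16 leaves and height h = 1, so
 * skip = 1024 / ((64/16) * 15) = 17, matching 1 meta node + 16 leaves.
 */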

/*
 * Provide a mask with count bits set, starting at position n.
 */
static inline u_daddr_t
bitrange(int n, int count)
{

	return (((u_daddr_t)-1 << n) &
	    ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - (n + count))));
}
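
/*
 * For example, with a 64-bit u_daddr_t, bitrange(4, 3) evaluates to
 * (~0 << 4) & (~0 >> 57) = 0x70, i.e. bits 4, 5 and 6 set.
 */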

/*
 * Find the first bit set in a u_daddr_t.
 */
static inline int
generic_bitpos(u_daddr_t mask)
{
	int hi, lo, mid;

	lo = 0;
	hi = BLIST_BMAP_RADIX;
	while (lo + 1 < hi) {
		mid = (lo + hi) >> 1;
		if (mask & bitrange(0, mid))
			hi = mid;
		else
			lo = mid;
	}
	return (lo);
}

static inline int
bitpos(u_daddr_t mask)
{

	switch (sizeof(mask)) {
#ifdef HAVE_INLINE_FFSLL
	case sizeof(long long):
		return (ffsll(mask) - 1);
#endif
#ifdef HAVE_INLINE_FFS
	case sizeof(int):
		return (ffs(mask) - 1);
#endif
	default:
		return (generic_bitpos(mask));
	}
}
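
/*
 * For example, bitpos(0x28) is 3, the lowest set bit of binary 101000.
 * generic_bitpos() arrives at the same answer by binary search:
 * mask & bitrange(0, mid) is nonzero exactly when mid > 3, so the
 * search converges on lo = 3.
 */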

/*
 * blist_create() - create a blist capable of handling up to the specified
 *		    number of blocks
 *
 *	blocks - must be greater than 0
 * 	flags  - malloc flags
 *
 *	The smallest blist consists of a single leaf node capable of
 *	managing BLIST_BMAP_RADIX blocks.
 */
blist_t
blist_create(daddr_t blocks, int flags)
{
	blist_t bl;
	u_daddr_t nodes, radix;

	KASSERT(blocks > 0, ("invalid block count"));

	/*
	 * Calculate the radix and node count used for scanning.
	 */
	nodes = 1;
	radix = BLIST_BMAP_RADIX;
	while (radix <= blocks) {
		nodes += 1 + (blocks - 1) / radix;
		radix *= BLIST_META_RADIX;
	}

	bl = malloc(offsetof(struct blist, bl_root[nodes]), M_SWAP, flags |
	    M_ZERO);
	if (bl == NULL)
		return (NULL);

	bl->bl_blocks = blocks;
	bl->bl_radix = radix;

#if defined(BLIST_DEBUG)
	printf(
		"BLIST representing %lld blocks (%lld MB of swap)"
		", requiring %lldK of ram\n",
		(long long)bl->bl_blocks,
		(long long)bl->bl_blocks * 4 / 1024,
		(long long)(nodes * sizeof(blmeta_t) + 1023) / 1024
	);
	printf("BLIST raw radix tree contains %lld records\n",
	    (long long)nodes);
#endif

	return (bl);
}
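
/*
 * Because of the M_ZERO above, a freshly created blist has every block
 * marked allocated; the caller frees whatever range should start out
 * available.  A minimal usage sketch, with a hypothetical 'nblocks':
 *
 *	blist_t bl = blist_create(nblocks, M_WAITOK);
 *	blist_free(bl, 0, nblocks);
 *
 * The debug main() at the bottom of this file follows this pattern.
 */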

void
blist_destroy(blist_t bl)
{

	free(bl, M_SWAP);
}

/*
 * blist_alloc() -   reserve space in the block bitmap.  Return the base
 *		     of a contiguous region or SWAPBLK_NONE if space could
 *		     not be allocated.
 */
daddr_t
blist_alloc(blist_t bl, int *count, int maxcount)
{
	daddr_t blk, cursor;

	KASSERT(*count <= maxcount,
	    ("invalid parameters %d > %d", *count, maxcount));
	KASSERT(*count <= BLIST_MAX_ALLOC,
	    ("minimum allocation too large: %d", *count));

	/*
	 * This loop iterates at most twice.  An allocation failure in the
	 * first iteration leads to a second iteration only if the cursor was
	 * non-zero.  When the cursor is zero, an allocation failure will
	 * stop further iterations.
	 */
	for (cursor = bl->bl_cursor;; cursor = 0) {
		blk = blst_meta_alloc(bl->bl_root, cursor, count, maxcount,
		    bl->bl_radix);
		if (blk != SWAPBLK_NONE) {
			bl->bl_avail -= *count;
			bl->bl_cursor = blk + *count;
			if (bl->bl_cursor == bl->bl_blocks)
				bl->bl_cursor = 0;
			return (blk);
		}
		if (cursor == 0)
			return (SWAPBLK_NONE);
	}
}
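
/*
 * On success, *count is updated to the number of blocks actually
 * reserved, between the requested minimum and maxcount.  For example,
 * to take at least 1 and at most 32 contiguous blocks:
 *
 *	int count = 1;
 *	daddr_t blk = blist_alloc(bl, &count, 32);
 *	if (blk != SWAPBLK_NONE) {
 *		... use blocks [blk, blk + count) ...
 *		blist_free(bl, blk, count);
 *	}
 */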

/*
 * blist_avail() -	return the number of free blocks.
 */
daddr_t
blist_avail(blist_t bl)
{

	return (bl->bl_avail);
}

/*
 * blist_free() -	free up space in the block bitmap.  The range must
 *			lie entirely within the bitmap.
 */
void
blist_free(blist_t bl, daddr_t blkno, daddr_t count)
{

	KASSERT(blkno >= 0 && blkno + count <= bl->bl_blocks,
	    ("freeing invalid range: blkno %jx, count %d, blocks %jd",
	    (uintmax_t)blkno, (int)count, (uintmax_t)bl->bl_blocks));
	blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix);
	bl->bl_avail += count;
}

/*
 * blist_fill() -	mark a region in the block bitmap as off-limits
 *			to the allocator (i.e. allocate it), ignoring any
 *			existing allocations.  Return the number of blocks
 *			actually filled that were free before the call.
 */
daddr_t
blist_fill(blist_t bl, daddr_t blkno, daddr_t count)
{
	daddr_t filled;

	KASSERT(blkno >= 0 && blkno + count <= bl->bl_blocks,
	    ("filling invalid range: blkno %jx, count %d, blocks %jd",
	    (uintmax_t)blkno, (int)count, (uintmax_t)bl->bl_blocks));
	filled = blst_meta_fill(bl->bl_root, blkno, count, bl->bl_radix);
	bl->bl_avail -= filled;
	return (filled);
}
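
/*
 * For example, blist_fill(bl, 0, 2) takes blocks 0 and 1 out of
 * circulation whether or not they were free, and returns how many of
 * the two were free beforehand, so the caller can adjust its own
 * accounting; a later blist_free() can return them.
 */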

/*
 * blist_resize() -	resize an existing radix tree to handle the
 *			specified number of blocks.  This will reallocate
 *			the tree and transfer the previous bitmap to the new
 *			one.  When extending the tree you can specify whether
 *			the new blocks are to be left allocated or freed.
 */
void
blist_resize(blist_t *pbl, daddr_t count, int freenew, int flags)
{
	blist_t newbl = blist_create(count, flags);
	blist_t save = *pbl;

	*pbl = newbl;
	if (count > save->bl_blocks)
		count = save->bl_blocks;
	blst_copy(save->bl_root, 0, save->bl_radix, newbl, count);

	/*
	 * If resizing upwards, should we free the new space or not?
	 */
	if (freenew && count < newbl->bl_blocks) {
		blist_free(newbl, count, newbl->bl_blocks - count);
	}
	blist_destroy(save);
}

#ifdef BLIST_DEBUG

/*
 * blist_print()    - dump radix tree
 */
void
blist_print(blist_t bl)
{
	printf("BLIST avail = %jd, cursor = %08jx {\n",
	    (uintmax_t)bl->bl_avail, (uintmax_t)bl->bl_cursor);

	if (bl->bl_root->bm_bitmap != 0)
		blst_radix_print(bl->bl_root, 0, bl->bl_radix, 4);
	printf("}\n");
}

#endif

static const u_daddr_t fib[] = {
	1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584,
	4181, 6765, 10946, 17711, 28657, 46368, 75025, 121393, 196418, 317811,
	514229, 832040, 1346269, 2178309, 3524578,
};

/*
 * Use 'gap' to describe a maximal range of unallocated blocks/bits.
 */
struct gap_stats {
	daddr_t	start;		/* current gap start, or SWAPBLK_NONE */
	daddr_t	num;		/* number of gaps observed */
	daddr_t	max;		/* largest gap size */
	daddr_t	avg;		/* average gap size */
	daddr_t	err;		/* sum - num * avg */
	daddr_t	histo[nitems(fib)]; /* # gaps in each size range */
	int	max_bucket;	/* last histo elt with nonzero val */
};

/*
 * gap_stats_counting()    - is the state 'counting 1 bits',
 *                           as opposed to 'skipping 0 bits'?
 */
static inline bool
gap_stats_counting(const struct gap_stats *stats)
{

	return (stats->start != SWAPBLK_NONE);
}

/*
 * init_gap_stats()    - initialize stats on gap sizes
 */
static inline void
init_gap_stats(struct gap_stats *stats)
{

	bzero(stats, sizeof(*stats));
	stats->start = SWAPBLK_NONE;
}

/*
 * update_gap_stats()    - update stats on gap sizes
 */
static void
update_gap_stats(struct gap_stats *stats, daddr_t posn)
{
	daddr_t size;
	int hi, lo, mid;

	if (!gap_stats_counting(stats)) {
		stats->start = posn;
		return;
	}
	size = posn - stats->start;
	stats->start = SWAPBLK_NONE;
	if (size > stats->max)
		stats->max = size;

	/*
	 * Find the fibonacci range that contains size,
	 * expecting to find it in an early range.
	 */
	lo = 0;
	hi = 1;
	while (hi < nitems(fib) && fib[hi] <= size) {
		lo = hi;
		hi *= 2;
	}
	if (hi >= nitems(fib))
		hi = nitems(fib);
	while (lo + 1 != hi) {
		mid = (lo + hi) >> 1;
		if (fib[mid] <= size)
			lo = mid;
		else
			hi = mid;
	}
	stats->histo[lo]++;
	if (lo > stats->max_bucket)
		stats->max_bucket = lo;
	stats->err += size - stats->avg;
	stats->num++;
	stats->avg += stats->err / stats->num;
	stats->err %= stats->num;
}
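
/*
 * The last four statements above maintain a running integer average
 * without keeping a (potentially overflowing) sum; the invariant is
 * err == sum - num * avg.  For example, after gaps of sizes 3 and 5:
 * the first update leaves num = 1, avg = 3, err = 0; the second does
 * err += 5 - 3, num = 2, then avg rises by 2 / 2 to 4 and
 * err = 2 % 2 = 0, which is the exact mean.
 */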

/*
 * dump_gap_stats()    - print stats on gap sizes
 */
static inline void
dump_gap_stats(const struct gap_stats *stats, struct sbuf *s)
{
	int i;

	sbuf_printf(s, "number of maximal free ranges: %jd\n",
	    (intmax_t)stats->num);
	sbuf_printf(s, "largest free range: %jd\n", (intmax_t)stats->max);
	sbuf_printf(s, "average maximal free range size: %jd\n",
	    (intmax_t)stats->avg);
	sbuf_printf(s, "number of maximal free ranges of different sizes:\n");
	sbuf_printf(s, "               count  |  size range\n");
	sbuf_printf(s, "               -----  |  ----------\n");
	for (i = 0; i < stats->max_bucket; i++) {
		if (stats->histo[i] != 0) {
			sbuf_printf(s, "%20jd  |  ",
			    (intmax_t)stats->histo[i]);
			if (fib[i] != fib[i + 1] - 1)
				sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
				    (intmax_t)fib[i + 1] - 1);
			else
				sbuf_printf(s, "%jd\n", (intmax_t)fib[i]);
		}
	}
	sbuf_printf(s, "%20jd  |  ", (intmax_t)stats->histo[i]);
	if (stats->histo[i] > 1)
		sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
		    (intmax_t)stats->max);
	else
		sbuf_printf(s, "%jd\n", (intmax_t)stats->max);
}

/*
 * blist_stats()    - dump radix tree stats
 */
void
blist_stats(blist_t bl, struct sbuf *s)
{
	struct gap_stats gstats;
	struct gap_stats *stats = &gstats;
	daddr_t i, nodes, radix;
	u_daddr_t diff, mask;
	int digit;

	init_gap_stats(stats);
	nodes = 0;
	i = bl->bl_radix;
	while (i < bl->bl_radix + bl->bl_blocks) {
		/*
		 * Find max size subtree starting at i.
		 */
		radix = BLIST_BMAP_RADIX;
		while (((i / radix) & BLIST_META_MASK) == 0)
			radix *= BLIST_META_RADIX;

		/*
		 * Check for skippable subtrees starting at i.
		 */
		while (radix > BLIST_BMAP_RADIX) {
			if (bl->bl_root[nodes].bm_bitmap == 0) {
				if (gap_stats_counting(stats))
					update_gap_stats(stats, i);
				break;
			}

			/*
			 * Skip subtree root.
			 */
			nodes++;
			radix /= BLIST_META_RADIX;
		}
		if (radix == BLIST_BMAP_RADIX) {
			/*
			 * Scan leaf.
			 */
			mask = bl->bl_root[nodes].bm_bitmap;
			diff = mask ^ (mask << 1);
			if (gap_stats_counting(stats))
				diff ^= 1;
			while (diff != 0) {
				digit = bitpos(diff);
				update_gap_stats(stats, i + digit);
				diff ^= bitrange(digit, 1);
			}
		}
		nodes += radix_to_skip(radix);
		i += radix;
	}
	update_gap_stats(stats, i);
	dump_gap_stats(stats, s);
}

/************************************************************************
 *			  ALLOCATION SUPPORT FUNCTIONS			*
 ************************************************************************
 *
 *	These support functions do all the actual work.  They may seem
 *	rather longish, but that's because I've commented them up.  The
 *	actual code is straightforward.
 *
 */

/*
 * BLST_NEXT_LEAF_ALLOC() - allocate the blocks starting with the next leaf.
 *
 *	'scan' is a leaf node, and its first block is at address 'start'.  The
 *	next leaf node could be adjacent, or several nodes away if the least
 *	common ancestor of 'scan' and its neighbor is several levels up.  Use
 *	addresses to determine how many meta-nodes lie between the leaves.  If
 *	the sequence of leaves starting with the next one has enough initial
 *	bits set, clear them and clear the bits in the meta nodes on the path
 *	up to the least common ancestor to mark any subtrees made completely
 *	empty.
 */
static int
blst_next_leaf_alloc(blmeta_t *scan, daddr_t start, int count, int maxcount)
{
	u_daddr_t radix;
	daddr_t blk;
	int avail, digit;

	start += BLIST_BMAP_RADIX;
	for (blk = start; blk - start < maxcount; blk += BLIST_BMAP_RADIX) {
		/* Skip meta-nodes, as long as they promise more free blocks. */
		radix = BLIST_BMAP_RADIX;
		while (((++scan)->bm_bitmap & 1) == 1 &&
		    ((blk / radix) & BLIST_META_MASK) == 0)
			radix *= BLIST_META_RADIX;
		if (~scan->bm_bitmap != 0) {
			/*
			 * Either there is no next leaf with any free blocks,
			 * or we've reached the next leaf and found that some
			 * of its blocks are not free.  In the first case,
			 * bitpos() returns zero here.
			 */
			avail = blk - start + bitpos(~scan->bm_bitmap);
			if (avail < count || avail == 0) {
				/*
				 * There isn't a next leaf with enough free
				 * blocks at its beginning to bother
				 * allocating.
				 */
				return (avail);
			}
			maxcount = imin(avail, maxcount);
			if (maxcount % BLIST_BMAP_RADIX == 0) {
				/*
				 * There was no next leaf.  Back scan up to
				 * last leaf.
				 */
				--scan;
				while (radix != BLIST_BMAP_RADIX) {
					radix /= BLIST_META_RADIX;
					--scan;
				}
				blk -= BLIST_BMAP_RADIX;
			}
		}
	}

	/*
	 * 'scan' is the last leaf that provides blocks.  Clear from 1 to
	 * BLIST_BMAP_RADIX bits to represent the allocation of those last
	 * blocks.
	 */
	if (maxcount % BLIST_BMAP_RADIX != 0)
		scan->bm_bitmap &= ~bitrange(0, maxcount % BLIST_BMAP_RADIX);
	else
		scan->bm_bitmap = 0;

	for (;;) {
		/* Back up over meta-nodes, clearing bits if necessary. */
		blk -= BLIST_BMAP_RADIX;
		radix = BLIST_BMAP_RADIX;
		while ((digit = ((blk / radix) & BLIST_META_MASK)) == 0) {
			if ((scan--)->bm_bitmap == 0)
				scan->bm_bitmap ^= 1;
			radix *= BLIST_META_RADIX;
		}
		if ((scan--)->bm_bitmap == 0)
			scan[-digit * radix_to_skip(radix)].bm_bitmap ^=
			    (u_daddr_t)1 << digit;

		if (blk == start)
			break;
		/* Clear all the bits of this leaf. */
		scan->bm_bitmap = 0;
	}
	return (maxcount);
}

/*
 * BLST_LEAF_ALLOC() -	allocate at a leaf in the radix tree (a bitmap).
 *
 *	This function is the core of the allocator.  Its execution time is
 *	proportional to log(count), plus height of the tree if the allocation
 *	crosses a leaf boundary.
 */
static daddr_t
blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int *count, int maxcount)
{
	u_daddr_t mask;
	int bighint, count1, hi, lo, num_shifts;

	count1 = *count - 1;
	num_shifts = fls(count1);
	mask = ~scan->bm_bitmap;
	while ((mask & (mask + 1)) != 0 && num_shifts > 0) {
		/*
		 * If bit i is 0 in mask, then bits in [i, i + (count1 >>
		 * num_shifts)] are 1 in scan->bm_bitmap.  Reduce num_shifts to
		 * 0, while preserving this invariant.  The updates to mask
		 * leave fewer bits 0, but each bit that remains 0 represents a
		 * longer string of consecutive 1-bits in scan->bm_bitmap.  If
		 * more updates to mask cannot set more bits, because mask is
		 * partitioned with all 1 bits following all 0 bits, the loop
		 * terminates immediately.
		 */
		num_shifts--;
		mask |= mask >> ((count1 >> num_shifts) + 1) / 2;
	}
	bighint = count1 >> num_shifts;
	if (~mask == 0) {
		/*
		 * Update bighint.  There is no allocation bigger than
		 * count1 >> num_shifts starting in this leaf.
		 */
		scan->bm_bighint = bighint;
		return (SWAPBLK_NONE);
	}

	/* Discard any candidates that appear before blk. */
	if ((blk & BLIST_BMAP_MASK) != 0) {
		if ((~mask & bitrange(0, blk & BLIST_BMAP_MASK)) != 0) {
			/* Grow bighint in case all discarded bits are set. */
			bighint += blk & BLIST_BMAP_MASK;
			mask |= bitrange(0, blk & BLIST_BMAP_MASK);
			if (~mask == 0) {
				scan->bm_bighint = bighint;
				return (SWAPBLK_NONE);
			}
		}
		blk -= blk & BLIST_BMAP_MASK;
	}

	/*
	 * The least significant zero bit in mask (i.e. the least significant
	 * set bit in ~mask) marks the start of the first available range of
	 * sufficient size.  Find its position.
	 */
	lo = bitpos(~mask);

	/*
	 * Find how much space is available starting at that position.
	 */
	if ((mask & (mask + 1)) != 0) {
		/* Count the 1 bits starting at position lo. */
		hi = bitpos(mask & (mask + 1)) + count1;
		if (maxcount < hi - lo)
			hi = lo + maxcount;
		*count = hi - lo;
		mask = ~bitrange(lo, *count);
	} else if (maxcount <= BLIST_BMAP_RADIX - lo) {
		/* All the blocks we can use are available here. */
		hi = lo + maxcount;
		*count = maxcount;
		mask = ~bitrange(lo, *count);
		if (hi == BLIST_BMAP_RADIX)
			scan->bm_bighint = bighint;
	} else {
		/* Check next leaf for some of the blocks we want or need. */
		count1 = *count - (BLIST_BMAP_RADIX - lo);
		maxcount -= BLIST_BMAP_RADIX - lo;
		hi = blst_next_leaf_alloc(scan, blk, count1, maxcount);
		if (hi < count1)
			/*
			 * The next leaf cannot supply enough blocks to reach
			 * the minimum required allocation.  The hint cannot be
			 * updated, because the same allocation request could
			 * be satisfied later, by this leaf, if the state of
			 * the next leaf changes, and without any changes to
			 * this leaf.
			 */
			return (SWAPBLK_NONE);
		*count = BLIST_BMAP_RADIX - lo + hi;
		scan->bm_bighint = bighint;
	}

	/* Clear the allocated bits from this leaf. */
	scan->bm_bitmap &= mask;
	return (blk + lo);
}
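
/*
 * A worked instance of the mask-folding loop above, shrunk to an 8-bit
 * bitmap for readability (real bitmaps are BLIST_BMAP_RADIX bits wide):
 * to find *count = maxcount = 4 free blocks in bm_bitmap = 00111101,
 * start with mask = ~bm_bitmap = 11000010, count1 = 3, and
 * num_shifts = fls(3) = 2.  The first pass ORs in mask >> 1, giving
 * 11100011; the second ORs in mask >> 2, giving 11111011.  The lone
 * zero at position 2 certifies that bits [2, 5] of bm_bitmap are all
 * set, so lo = bitpos(~mask) = 2 and blocks 2 through 5 are allocated.
 */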

/*
 * BLST_META_ALLOC() -	allocate at a meta node in the radix tree.
 *
 *	Attempt to allocate at a meta node.  If we can't, we update
 *	bighint and return a failure.  Updating bighint optimizes future
 *	calls that hit this node.  We have to check for our collapse cases
 *	and we have a few optimizations strewn in as well.
 */
static daddr_t
blst_meta_alloc(blmeta_t *scan, daddr_t cursor, int *count,
    int maxcount, u_daddr_t radix)
{
	daddr_t blk, i, r, skip;
	u_daddr_t mask;
	bool scan_from_start;
	int digit;

	if (radix == BLIST_BMAP_RADIX)
		return (blst_leaf_alloc(scan, cursor, count, maxcount));
	blk = cursor & -radix;
	scan_from_start = (cursor == blk);
	radix /= BLIST_META_RADIX;
	skip = radix_to_skip(radix);
	mask = scan->bm_bitmap;

	/* Discard any candidates that appear before cursor. */
	digit = (cursor / radix) & BLIST_META_MASK;
	mask &= (u_daddr_t)-1 << digit;
	if (mask == 0)
		return (SWAPBLK_NONE);

	/*
	 * If the first try is for a block that includes the cursor, pre-undo
	 * the digit * radix offset in the first call; otherwise, ignore the
	 * cursor entirely.
	 */
	if (((mask >> digit) & 1) == 1)
		cursor -= digit * radix;
	else
		cursor = blk;

	/*
	 * Examine the nonempty subtree associated with each bit set in mask.
	 */
	do {
		digit = bitpos(mask);
		i = 1 + digit * skip;
		if (*count <= scan[i].bm_bighint) {
			/*
			 * The allocation might fit beginning in the i'th
			 * subtree.
			 */
			r = blst_meta_alloc(&scan[i], cursor + digit * radix,
			    count, maxcount, radix);
			if (r != SWAPBLK_NONE) {
				if (scan[i].bm_bitmap == 0)
					scan->bm_bitmap ^= bitrange(digit, 1);
				return (r);
			}
		}
		cursor = blk;
	} while ((mask ^= bitrange(digit, 1)) != 0);

	/*
	 * We couldn't allocate count in this subtree.  If the whole tree was
	 * scanned, and the last tree node is allocated, update bighint.
	 */
	if (scan_from_start && !(digit == BLIST_META_RADIX - 1 &&
	    scan[i].bm_bighint == BLIST_MAX_ALLOC))
		scan->bm_bighint = *count - 1;

	return (SWAPBLK_NONE);
}

/*
 * BLST_LEAF_FREE() -	free allocated blocks from leaf bitmap
 */
static void
blst_leaf_free(blmeta_t *scan, daddr_t blk, int count)
{
	u_daddr_t mask;

	/*
	 * free some data in this bitmap
	 * mask=0000111111111110000
	 *          \_________/\__/
	 *		count   n
	 */
	mask = bitrange(blk & BLIST_BMAP_MASK, count);
	KASSERT((scan->bm_bitmap & mask) == 0,
	    ("freeing free block: %jx, size %d, mask %jx",
	    (uintmax_t)blk, count, (uintmax_t)scan->bm_bitmap & mask));
	scan->bm_bitmap |= mask;
}

/*
 * BLST_META_FREE() - free allocated blocks from radix tree meta info
 *
 *	This support routine frees a range of blocks from the bitmap.
 *	The range must be entirely enclosed by this radix node.  If a
 *	meta node, we break the range down recursively to free blocks
 *	in subnodes (which means that this code can free an arbitrary
 *	range whereas the allocation code cannot allocate an arbitrary
 *	range).
 */
static void
blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, u_daddr_t radix)
{
	daddr_t blk, endBlk, i, skip;
	int digit, endDigit;

	/*
	 * We could probably do a better job here.  We are required to make
	 * bighint at least as large as the biggest allocable block of data.
	 * If we just shoehorn it, a little extra overhead will be incurred
	 * on the next allocation (but only that one typically).
	 */
	scan->bm_bighint = BLIST_MAX_ALLOC;

	if (radix == BLIST_BMAP_RADIX)
		return (blst_leaf_free(scan, freeBlk, count));

	endBlk = ummin(freeBlk + count, (freeBlk + radix) & -radix);
	radix /= BLIST_META_RADIX;
	skip = radix_to_skip(radix);
	blk = freeBlk & -radix;
	digit = (blk / radix) & BLIST_META_MASK;
	endDigit = 1 + (((endBlk - 1) / radix) & BLIST_META_MASK);
	scan->bm_bitmap |= bitrange(digit, endDigit - digit);
	for (i = 1 + digit * skip; blk < endBlk; i += skip) {
		blk += radix;
		count = ummin(blk, endBlk) - freeBlk;
		blst_meta_free(&scan[i], freeBlk, count, radix);
		freeBlk = blk;
	}
}

/*
 * BLST_COPY() - copy one radix tree to another
 *
 *	Locates free space in the source tree and frees it in the destination
 *	tree.  The space may not already be free in the destination.
 */
static void
blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, blist_t dest,
    daddr_t count)
{
	daddr_t endBlk, i, skip;

	/*
	 * Leaf node
	 */

	if (radix == BLIST_BMAP_RADIX) {
		u_daddr_t v = scan->bm_bitmap;

		if (v == (u_daddr_t)-1) {
			blist_free(dest, blk, count);
		} else if (v != 0) {
			int i;

			for (i = 0; i < count; ++i) {
				if (v & ((u_daddr_t)1 << i))
					blist_free(dest, blk + i, 1);
			}
		}
		return;
	}

	/*
	 * Meta node
	 */

	if (scan->bm_bitmap == 0) {
		/*
		 * Source all allocated, leave dest allocated
		 */
		return;
	}

	endBlk = blk + count;
	radix /= BLIST_META_RADIX;
	skip = radix_to_skip(radix);
	for (i = 1; blk < endBlk; i += skip) {
		blk += radix;
		count = radix;
		if (blk >= endBlk)
			count -= blk - endBlk;
		blst_copy(&scan[i], blk - radix, radix, dest, count);
	}
}

/*
 * BLST_LEAF_FILL() -	allocate specific blocks in leaf bitmap
 *
 *	This routine allocates all blocks in the specified range
 *	regardless of any existing allocations in that range.  Returns
 *	the number of blocks allocated by the call.
 */
static daddr_t
blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count)
{
	daddr_t nblks;
	u_daddr_t mask;

	mask = bitrange(blk & BLIST_BMAP_MASK, count);

	/* Count the number of blocks that we are allocating. */
	nblks = bitcount64(scan->bm_bitmap & mask);

	scan->bm_bitmap &= ~mask;
	return (nblks);
}

/*
 * BLST_META_FILL() -	allocate specific blocks at a meta node
 *
 *	This routine allocates the specified range of blocks,
 *	regardless of any existing allocations in the range.  The
 *	range must be within the extent of this node.  Returns the
 *	number of blocks allocated by the call.
 */
static daddr_t
blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count, u_daddr_t radix)
{
	daddr_t blk, endBlk, i, nblks, skip;
	int digit;

	if (radix == BLIST_BMAP_RADIX)
		return (blst_leaf_fill(scan, allocBlk, count));

	endBlk = ummin(allocBlk + count, (allocBlk + radix) & -radix);
	radix /= BLIST_META_RADIX;
	skip = radix_to_skip(radix);
	blk = allocBlk & -radix;
	nblks = 0;
	while (blk < endBlk) {
		digit = (blk / radix) & BLIST_META_MASK;
		i = 1 + digit * skip;
		blk += radix;
		count = ummin(blk, endBlk) - allocBlk;
		nblks += blst_meta_fill(&scan[i], allocBlk, count, radix);
		if (scan[i].bm_bitmap == 0)
			scan->bm_bitmap &= ~((u_daddr_t)1 << digit);
		allocBlk = blk;
	}
	return (nblks);
}

#ifdef BLIST_DEBUG

static void
blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int tab)
{
	daddr_t skip;
	u_daddr_t mask;
	int digit;

	if (radix == BLIST_BMAP_RADIX) {
		printf(
		    "%*.*s(%08llx,%lld): bitmap %0*llx big=%lld\n",
		    tab, tab, "",
		    (long long)blk, (long long)radix,
		    1 + (BLIST_BMAP_RADIX - 1) / 4,
		    (long long)scan->bm_bitmap,
		    (long long)scan->bm_bighint
		);
		return;
	}

	printf(
	    "%*.*s(%08llx): subtree (%lld/%lld) bitmap %0*llx big=%lld {\n",
	    tab, tab, "",
	    (long long)blk, (long long)radix,
	    (long long)radix,
	    1 + (BLIST_META_RADIX - 1) / 4,
	    (long long)scan->bm_bitmap,
	    (long long)scan->bm_bighint
	);

	radix /= BLIST_META_RADIX;
	skip = radix_to_skip(radix);
	tab += 4;

	mask = scan->bm_bitmap;
	/* Examine the nonempty subtree associated with each bit set in mask */
	do {
		digit = bitpos(mask);
		blst_radix_print(&scan[1 + digit * skip], blk + digit * radix,
		    radix, tab);
	} while ((mask ^= bitrange(digit, 1)) != 0);
	tab -= 4;

	printf(
	    "%*.*s}\n",
	    tab, tab, ""
	);
}

#endif

#ifdef BLIST_DEBUG

int
main(int ac, char **av)
{
	int size = BLIST_META_RADIX * BLIST_BMAP_RADIX;
	int i;
	blist_t bl;
	struct sbuf *s;

	for (i = 1; i < ac; ++i) {
		const char *ptr = av[i];
		if (*ptr != '-') {
			size = strtol(ptr, NULL, 0);
			continue;
		}
		ptr += 2;
		fprintf(stderr, "Bad option: %s\n", ptr - 2);
		exit(1);
	}
	bl = blist_create(size, M_WAITOK);
	blist_free(bl, 0, size);

	for (;;) {
		char buf[1024];
		long long da = 0;
		int count = 0, maxcount = 0;

		printf("%lld/%lld/%lld> ", (long long)blist_avail(bl),
		    (long long)size, (long long)bl->bl_radix);
		fflush(stdout);
		if (fgets(buf, sizeof(buf), stdin) == NULL)
			break;
		switch (buf[0]) {
		case 'r':
			if (sscanf(buf + 1, "%d", &count) == 1) {
				blist_resize(&bl, count, 1, M_WAITOK);
			} else {
				printf("?\n");
			}
			/* FALLTHROUGH: print the tree after a resize. */
		case 'p':
			blist_print(bl);
			break;
		case 's':
			s = sbuf_new_auto();
			blist_stats(bl, s);
			sbuf_finish(s);
			printf("%s", sbuf_data(s));
			sbuf_delete(s);
			break;
		case 'a':
			if (sscanf(buf + 1, "%d%d", &count, &maxcount) == 2) {
				daddr_t blk = blist_alloc(bl, &count, maxcount);
				printf("    R=%08llx, c=%08d\n",
				    (long long)blk, count);
			} else {
				printf("?\n");
			}
			break;
		case 'f':
			if (sscanf(buf + 1, "%llx %d", &da, &count) == 2) {
				blist_free(bl, da, count);
			} else {
				printf("?\n");
			}
			break;
		case 'l':
			if (sscanf(buf + 1, "%llx %d", &da, &count) == 2) {
				printf("    n=%jd\n",
				    (intmax_t)blist_fill(bl, da, count));
			} else {
				printf("?\n");
			}
			break;
		case '?':
		case 'h':
			puts(
			    "p          -print\n"
			    "s          -stats\n"
			    "a %d %d    -allocate\n"
			    "f %x %d    -free\n"
			    "l %x %d    -fill\n"
			    "r %d       -resize\n"
			    "h/?        -help\n"
			    "q          -quit"
			);
			break;
		case 'q':
			break;
		default:
			printf("?\n");
			break;
		}
		if (buf[0] == 'q')
			break;
	}
	return (0);
}

#endif