xref: /freebsd/sys/kern/subr_blist.c (revision 09b380a1)
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1998 Matthew Dillon.  All Rights Reserved.
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. Neither the name of the University nor the names of its contributors
14  *    may be used to endorse or promote products derived from this software
15  *    without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
18  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
21  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
23  * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
25  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
26  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28  */
29 /*
30  * BLIST.C -	Bitmap allocator/deallocator, using a radix tree with hinting
31  *
32  *	This module implements a general bitmap allocator/deallocator.  The
33  *	allocator eats around 2 bits per 'block'.  The module does not
34  *	try to interpret the meaning of a 'block' other than to return
35  *	SWAPBLK_NONE on an allocation failure.
36  *
37  *	A radix tree controls access to pieces of the bitmap, and includes
38  *	auxiliary information at each interior node about the availability of
39  *	contiguous free blocks in the subtree rooted at that node.  Two radix
40  *	constants are involved: one for the size of the bitmaps contained in the
41  *	leaf nodes (BLIST_BMAP_RADIX), and one for the number of descendants of
42  *	each of the meta (interior) nodes (BLIST_META_RADIX).  Each subtree is
43  *	associated with a range of blocks.  The root of any subtree stores a
44  *	hint field that defines an upper bound on the size of the largest
45  *	allocation that can begin in the associated block range.  A hint is an
46  *	upper bound on a potential allocation, but not necessarily a tight upper
47  *	bound.
48  *
49  *	The bitmap field in each node directs the search for available blocks.
50  *	For a leaf node, a bit is set if the corresponding block is free.  For a
51  *	meta node, a bit is set if the corresponding subtree contains a free
52  *	block somewhere within it.  The search at a meta node considers only
53  *	children of that node that represent a range that includes a free block.
54  *
55  * 	The hinting greatly increases code efficiency for allocations while
56  *	the general radix structure optimizes both allocations and frees.  The
57  *	radix tree should be able to operate well no matter how much
58  *	fragmentation there is and no matter how large a bitmap is used.
59  *
60  *	The blist code wires all necessary memory at creation time.  Neither
61  *	allocations nor frees require interaction with the memory subsystem.
62  *	The non-blocking nature of allocations and frees is required by swap
63  *	code (vm/swap_pager.c).
64  *
65  *	LAYOUT: The radix tree is laid out recursively using a linear array.
66  *	Each meta node is immediately followed (laid out sequentially in
67  *	memory) by BLIST_META_RADIX lower level nodes.  This is a recursive
68  *	structure but one that can be easily scanned through a very simple
69  *	'skip' calculation.  The memory allocation is only large enough to
70  *	cover the number of blocks requested at creation time.  Nodes that
71  *	represent blocks beyond that limit, nodes that would never be read
72  *	or written, are not allocated, so that the last of the
73  *	BLIST_META_RADIX lower level nodes of some meta nodes may not be
74  *	allocated.
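 *
 *	For example, with BLIST_META_RADIX == 16 and a tree of height 1,
 *	the root meta node occupies array slot 0 and its 16 leaves occupy
 *	slots 1 through 16.  At height 2, the root is followed by the
 *	first child meta node (slot 1) and that child's 16 leaves (slots
 *	2 through 17), then by the second child subtree starting at slot
 *	18 (a 'skip' of 17), and so on.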
75  *
76  *	NOTE: the allocator cannot currently allocate more than
77  *	BLIST_BMAP_RADIX blocks per call.  It will panic with 'allocation too
78  *	large' if you try.  This is an area that could use improvement.  The
79  *	radix is large enough that this restriction does not affect the swap
80  *	system, though.  Currently only the allocation code is affected by
81  *	this algorithmic unfeature.  The freeing code can handle arbitrary
82  *	ranges.
83  *
84  *	This code can be compiled stand-alone for debugging.
85  */
86 
87 #include <sys/cdefs.h>
88 __FBSDID("$FreeBSD$");
89 
90 #ifdef _KERNEL
91 
92 #include <sys/param.h>
93 #include <sys/systm.h>
94 #include <sys/lock.h>
95 #include <sys/kernel.h>
96 #include <sys/blist.h>
97 #include <sys/malloc.h>
98 #include <sys/sbuf.h>
99 #include <sys/proc.h>
100 #include <sys/mutex.h>
101 
102 #else
103 
104 #ifndef BLIST_NO_DEBUG
105 #define BLIST_DEBUG
106 #endif
107 
108 #include <sys/errno.h>
109 #include <sys/types.h>
110 #include <sys/malloc.h>
111 #include <sys/sbuf.h>
112 #include <assert.h>
113 #include <stdio.h>
114 #include <string.h>
115 #include <stddef.h>
116 #include <stdlib.h>
117 #include <stdarg.h>
118 #include <stdbool.h>
119 
120 #define	bitcount64(x)	__bitcount64((uint64_t)(x))
121 #define malloc(a,b,c)	calloc(a, 1)
122 #define free(a,b)	free(a)
123 #define ummin(a,b)	((a) < (b) ? (a) : (b))
124 #define KASSERT(a,b)	assert(a)
125 
126 #include <sys/blist.h>
127 
128 #endif
129 
130 /*
131  * static support functions
132  */
133 static daddr_t	blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count);
134 static daddr_t	blst_meta_alloc(blmeta_t *scan, daddr_t cursor, daddr_t count,
135 		    u_daddr_t radix);
136 static void blst_leaf_free(blmeta_t *scan, daddr_t relblk, int count);
137 static void blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count,
138 		    u_daddr_t radix);
139 static void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix,
140 		    blist_t dest, daddr_t count);
141 static daddr_t blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count);
142 static daddr_t blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count,
143 		    u_daddr_t radix);
144 #ifndef _KERNEL
145 static void	blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix,
146 		    int tab);
147 #endif
148 
149 #ifdef _KERNEL
150 static MALLOC_DEFINE(M_SWAP, "SWAP", "Swap space");
151 #endif
152 
153 _Static_assert(BLIST_BMAP_RADIX % BLIST_META_RADIX == 0,
154     "radix divisibility error");
155 #define	BLIST_BMAP_MASK	(BLIST_BMAP_RADIX - 1)
156 #define	BLIST_META_MASK	(BLIST_META_RADIX - 1)
157 
158 /*
159  * For a subtree that can represent the state of up to 'radix' blocks, the
160  * number of leaf nodes of the subtree is L=radix/BLIST_BMAP_RADIX.  If 'm'
161  * is short for BLIST_META_RADIX, then for a tree of height h with L=m**h
162  * leaf nodes, the total number of tree nodes is 1 + m + m**2 + ... + m**h,
163  * or, equivalently, (m**(h+1)-1)/(m-1).  This quantity is called 'skip'
164  * in the 'meta' functions that process subtrees.  Since integer division
165  * discards remainders, we can express this computation as
166  * skip = (m * m**h) / (m - 1)
167  * skip = (m * (radix / BLIST_BMAP_RADIX)) / (m - 1)
168  * and since m divides BLIST_BMAP_RADIX, we can simplify further to
169  * skip = (radix / (BLIST_BMAP_RADIX / m)) / (m - 1)
170  * skip = radix / ((BLIST_BMAP_RADIX / m) * (m - 1))
171  * so that simple integer division by a constant can safely be used for the
172  * calculation.
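 *
 * For example, assuming BLIST_BMAP_RADIX == 64 and BLIST_META_RADIX == 16
 * (so the constant divisor is (64 / 16) * 15 == 60): radix == 1024
 * describes a subtree of height 1 with 16 leaves, and
 * skip == 1024 / 60 == 17 == 1 + 16; radix == 16384 describes a subtree
 * of height 2, and skip == 16384 / 60 == 273 == 1 + 16 + 256.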
173  */
174 static inline daddr_t
175 radix_to_skip(daddr_t radix)
176 {
177 
178 	return (radix /
179 	    ((BLIST_BMAP_RADIX / BLIST_META_RADIX) * BLIST_META_MASK));
180 }
181 
182 /*
183  * Provide a mask with count bits set, starting at position n.
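 *
 * For example, bitrange(4, 8) sets bits 4 through 11 (0xff0 for a
 * 64-bit u_daddr_t).  The shifts are well defined only for 0 < count
 * and n + count <= BLIST_BMAP_RADIX, which every caller below
 * guarantees.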
184  */
185 static inline u_daddr_t
186 bitrange(int n, int count)
187 {
188 
189 	return (((u_daddr_t)-1 << n) &
190 	    ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - (n + count))));
191 }
192 
194 /*
195  * Use binary search, or a faster method, to find the 1 bit in a u_daddr_t.
196  * Assumes that the argument has only one bit set.
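 *
 * For example, bitpos((u_daddr_t)1 << 42) == 42.  The binary search
 * needs log2(BLIST_BMAP_RADIX) iterations (6 for a 64-bit u_daddr_t)
 * when ffsll() is not available.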
197  */
198 static inline int
199 bitpos(u_daddr_t mask)
200 {
201 	int hi, lo, mid;
202 
203 	switch (sizeof(mask)) {
204 #ifdef HAVE_INLINE_FFSLL
205 	case sizeof(long long):
206 		return (ffsll(mask) - 1);
207 #endif
208 	default:
209 		lo = 0;
210 		hi = BLIST_BMAP_RADIX;
211 		while (lo + 1 < hi) {
212 			mid = (lo + hi) >> 1;
213 			if ((mask >> mid) != 0)
214 				lo = mid;
215 			else
216 				hi = mid;
217 		}
218 		return (lo);
219 	}
220 }
221 
222 /*
223  * blist_create() - create a blist capable of handling up to the specified
224  *		    number of blocks
225  *
226  *	blocks - must be greater than 0
227  * 	flags  - malloc flags
228  *
229  *	The smallest blist consists of a single leaf node capable of
230  *	managing BLIST_BMAP_RADIX blocks.
231  */
232 blist_t
233 blist_create(daddr_t blocks, int flags)
234 {
235 	blist_t bl;
236 	u_daddr_t nodes, radix;
237 
238 	KASSERT(blocks > 0, ("invalid block count"));
239 
240 	/*
241 	 * Calculate the radix and node count used for scanning.
242 	 */
243 	nodes = 1;
244 	radix = BLIST_BMAP_RADIX;
245 	while (radix <= blocks) {
246 		nodes += 1 + (blocks - 1) / radix;
247 		radix *= BLIST_META_RADIX;
248 	}
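
	/*
	 * For example, assuming BLIST_BMAP_RADIX == 64 and
	 * BLIST_META_RADIX == 16: blocks == 2048 makes the loop run
	 * twice, adding the 32 leaves on the first pass and the 2
	 * interior nodes on the second, so nodes == 1 + 32 + 2 == 35
	 * (the 1 being the root) and the loop exits with
	 * radix == 16384, the radix of the root.
	 */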
249 
250 	bl = malloc(offsetof(struct blist, bl_root[nodes]), M_SWAP, flags |
251 	    M_ZERO);
252 	if (bl == NULL)
253 		return (NULL);
254 
255 	bl->bl_blocks = blocks;
256 	bl->bl_radix = radix;
257 
258 #if defined(BLIST_DEBUG)
259 	printf(
260 		"BLIST representing %lld blocks (%lld MB of swap)"
261 		", requiring %lldK of ram\n",
262 		(long long)bl->bl_blocks,
263 		(long long)bl->bl_blocks * 4 / 1024,
264 		(long long)(nodes * sizeof(blmeta_t) + 1023) / 1024
265 	);
266 	printf("BLIST raw radix tree contains %lld records\n",
267 	    (long long)nodes);
268 #endif
269 
270 	return (bl);
271 }
272 
273 void
274 blist_destroy(blist_t bl)
275 {
276 
277 	free(bl, M_SWAP);
278 }
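
/*
 * A minimal usage sketch (illustrative only, excluded from the build):
 * a freshly created blist has every block marked allocated, so the
 * creator frees the managed range before the first allocation, just as
 * the test main() below does.
 */
#if 0
	blist_t bl;
	daddr_t blk;

	bl = blist_create(2048, M_WAITOK);	/* all blocks start allocated */
	blist_free(bl, 0, 2048);		/* make the whole range free */
	blk = blist_alloc(bl, 16);		/* reserve 16 contiguous blocks */
	if (blk != SWAPBLK_NONE)
		blist_free(bl, blk, 16);	/* and return them */
	blist_destroy(bl);
#endif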
279 
280 /*
281  * blist_alloc() -   reserve space in the block bitmap.  Return the base
282  *		     of a contiguous region or SWAPBLK_NONE if space could
283  *		     not be allocated.
284  */
285 daddr_t
286 blist_alloc(blist_t bl, daddr_t count)
287 {
288 	daddr_t blk, cursor;
289 
290 	KASSERT(count <= BLIST_MAX_ALLOC,
291 	    ("allocation too large: %d", (int)count));
292 
293 	/*
294 	 * This loop iterates at most twice.  An allocation failure in the
295 	 * first iteration leads to a second iteration only if the cursor was
296 	 * non-zero.  When the cursor is zero, an allocation failure will
297 	 * stop further iterations.
298 	 */
299 	cursor = bl->bl_cursor;
300 	for (;;) {
301 		blk = blst_meta_alloc(bl->bl_root, cursor, count,
302 		    bl->bl_radix);
303 		if (blk != SWAPBLK_NONE) {
304 			bl->bl_avail -= count;
305 			bl->bl_cursor = blk + count;
306 			if (bl->bl_cursor == bl->bl_blocks)
307 				bl->bl_cursor = 0;
308 			return (blk);
309 		} else if (cursor == 0)
310 			return (SWAPBLK_NONE);
311 		cursor = 0;
312 	}
313 }
314 
315 /*
316  * blist_avail() -	return the number of free blocks.
317  */
318 daddr_t
319 blist_avail(blist_t bl)
320 {
321 
322 	return (bl->bl_avail);
323 }
324 
325 /*
326  * blist_free() -	free up space in the block bitmap.  The range
327  *		     	must have been previously allocated.
328  */
329 void
330 blist_free(blist_t bl, daddr_t blkno, daddr_t count)
331 {
332 
333 	KASSERT(blkno >= 0 && blkno + count <= bl->bl_blocks,
334 	    ("freeing invalid range: blkno %jx, count %d, blocks %jd",
335 	    (uintmax_t)blkno, (int)count, (uintmax_t)bl->bl_blocks));
336 	blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix);
337 	bl->bl_avail += count;
338 }
339 
340 /*
341  * blist_fill() -	mark a region in the block bitmap as off-limits
342  *			to the allocator (i.e. allocate it), ignoring any
343  *			existing allocations.  Return the number of blocks
344  *			actually filled that were free before the call.
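 *
 *			For example, if blocks 10..19 are free and blocks
 *			20..29 are already allocated, blist_fill(bl, 10, 20)
 *			marks blocks 10..29 allocated and returns 10.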
345  */
346 daddr_t
347 blist_fill(blist_t bl, daddr_t blkno, daddr_t count)
348 {
349 	daddr_t filled;
350 
351 	KASSERT(blkno >= 0 && blkno + count <= bl->bl_blocks,
352 	    ("filling invalid range: blkno %jx, count %d, blocks %jd",
353 	    (uintmax_t)blkno, (int)count, (uintmax_t)bl->bl_blocks));
354 	filled = blst_meta_fill(bl->bl_root, blkno, count, bl->bl_radix);
355 	bl->bl_avail -= filled;
356 	return (filled);
357 }
358 
359 /*
360  * blist_resize() -	resize an existing radix tree to handle the
361  *			specified number of blocks.  This will reallocate
362  *			the tree and transfer the previous bitmap to the new
363  *			one.  When extending the tree you can specify whether
364  *			the new blocks are to be left allocated or freed.
365  */
366 void
367 blist_resize(blist_t *pbl, daddr_t count, int freenew, int flags)
368 {
369 	blist_t newbl = blist_create(count, flags);
370 	blist_t save = *pbl;
371 
372 	*pbl = newbl;
373 	if (count > save->bl_blocks)
374 		count = save->bl_blocks;
375 	blst_copy(save->bl_root, 0, save->bl_radix, newbl, count);
376 
377 	/*
378 	 * If resizing upwards, should we free the new space or not?
379 	 */
380 	if (freenew && count < newbl->bl_blocks) {
381 		blist_free(newbl, count, newbl->bl_blocks - count);
382 	}
383 	blist_destroy(save);
384 }
385 
386 #ifdef BLIST_DEBUG
387 
388 /*
389  * blist_print()    - dump radix tree
390  */
391 void
392 blist_print(blist_t bl)
393 {
394 	printf("BLIST avail = %jd, cursor = %08jx {\n",
395 	    (uintmax_t)bl->bl_avail, (uintmax_t)bl->bl_cursor);
396 
397 	if (bl->bl_root->bm_bitmap != 0)
398 		blst_radix_print(bl->bl_root, 0, bl->bl_radix, 4);
399 	printf("}\n");
400 }
401 
402 #endif
403 
404 static const u_daddr_t fib[] = {
405 	1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584,
406 	4181, 6765, 10946, 17711, 28657, 46368, 75025, 121393, 196418, 317811,
407 	514229, 832040, 1346269, 2178309, 3524578,
408 };
409 
410 /*
411  * Use 'gap' to describe a maximal range of unallocated blocks/bits.
412  */
413 struct gap_stats {
414 	daddr_t	start;		/* current gap start, or SWAPBLK_NONE */
415 	daddr_t	num;		/* number of gaps observed */
416 	daddr_t	max;		/* largest gap size */
417 	daddr_t	avg;		/* average gap size */
418 	daddr_t	err;		/* sum - num * avg */
419 	daddr_t	histo[nitems(fib)]; /* # gaps in each size range */
420 	int	max_bucket;	/* last histo elt with nonzero val */
421 };
422 
423 /*
424  * gap_stats_counting()    - is the state 'counting 1 bits'?
425  *                           or 'skipping 0 bits'?
426  */
427 static inline bool
428 gap_stats_counting(const struct gap_stats *stats)
429 {
430 
431 	return (stats->start != SWAPBLK_NONE);
432 }
433 
434 /*
435  * init_gap_stats()    - initialize stats on gap sizes
436  */
437 static inline void
438 init_gap_stats(struct gap_stats *stats)
439 {
440 
441 	bzero(stats, sizeof(*stats));
442 	stats->start = SWAPBLK_NONE;
443 }
444 
445 /*
446  * update_gap_stats()    - update stats on gap sizes
447  */
448 static void
449 update_gap_stats(struct gap_stats *stats, daddr_t posn)
450 {
451 	daddr_t size;
452 	int hi, lo, mid;
453 
454 	if (!gap_stats_counting(stats)) {
455 		stats->start = posn;
456 		return;
457 	}
458 	size = posn - stats->start;
459 	stats->start = SWAPBLK_NONE;
460 	if (size > stats->max)
461 		stats->max = size;
462 
463 	/*
464 	 * Find the fibonacci range that contains size,
465 	 * expecting to find it in an early range.
466 	 */
467 	lo = 0;
468 	hi = 1;
469 	while (hi < nitems(fib) && fib[hi] <= size) {
470 		lo = hi;
471 		hi *= 2;
472 	}
473 	if (hi >= nitems(fib))
474 		hi = nitems(fib);
475 	while (lo + 1 != hi) {
476 		mid = (lo + hi) >> 1;
477 		if (fib[mid] <= size)
478 			lo = mid;
479 		else
480 			hi = mid;
481 	}
482 	stats->histo[lo]++;
483 	if (lo > stats->max_bucket)
484 		stats->max_bucket = lo;
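
	/*
	 * Maintain a running average without storing the sum: the
	 * division remainder is carried in err, preserving the invariant
	 * err == sum_of_gap_sizes - num * avg.  For example, after gaps
	 * of sizes 3 and 4: first num == 1, avg == 3, err == 0, then
	 * num == 2, avg == 3, err == 1, and indeed 3 + 4 == 2 * 3 + 1.
	 */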
485 	stats->err += size - stats->avg;
486 	stats->num++;
487 	stats->avg += stats->err / stats->num;
488 	stats->err %= stats->num;
489 }
490 
491 /*
492  * dump_gap_stats()    - print stats on gap sizes
493  */
494 static inline void
495 dump_gap_stats(const struct gap_stats *stats, struct sbuf *s)
496 {
497 	int i;
498 
499 	sbuf_printf(s, "number of maximal free ranges: %jd\n",
500 	    (intmax_t)stats->num);
501 	sbuf_printf(s, "largest free range: %jd\n", (intmax_t)stats->max);
502 	sbuf_printf(s, "average maximal free range size: %jd\n",
503 	    (intmax_t)stats->avg);
504 	sbuf_printf(s, "number of maximal free ranges of different sizes:\n");
505 	sbuf_printf(s, "               count  |  size range\n");
506 	sbuf_printf(s, "               -----  |  ----------\n");
507 	for (i = 0; i < stats->max_bucket; i++) {
508 		if (stats->histo[i] != 0) {
509 			sbuf_printf(s, "%20jd  |  ",
510 			    (intmax_t)stats->histo[i]);
511 			if (fib[i] != fib[i + 1] - 1)
512 				sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
513 				    (intmax_t)fib[i + 1] - 1);
514 			else
515 				sbuf_printf(s, "%jd\n", (intmax_t)fib[i]);
516 		}
517 	}
518 	sbuf_printf(s, "%20jd  |  ", (intmax_t)stats->histo[i]);
519 	if (stats->histo[i] > 1)
520 		sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
521 		    (intmax_t)stats->max);
522 	else
523 		sbuf_printf(s, "%jd\n", (intmax_t)stats->max);
524 }
525 
526 /*
527  * blist_stats()    - dump radix tree stats
528  */
529 void
530 blist_stats(blist_t bl, struct sbuf *s)
531 {
532 	struct gap_stats gstats;
533 	struct gap_stats *stats = &gstats;
534 	daddr_t i, nodes, radix;
535 	u_daddr_t bit, diff, mask;
536 
537 	init_gap_stats(stats);
538 	nodes = 0;
539 	i = bl->bl_radix;
540 	while (i < bl->bl_radix + bl->bl_blocks) {
541 		/*
542 		 * Find max size subtree starting at i.
543 		 */
544 		radix = BLIST_BMAP_RADIX;
545 		while (((i / radix) & BLIST_META_MASK) == 0)
546 			radix *= BLIST_META_RADIX;
547 
548 		/*
549 		 * Check for skippable subtrees starting at i.
550 		 */
551 		while (radix > BLIST_BMAP_RADIX) {
552 			if (bl->bl_root[nodes].bm_bitmap == 0) {
553 				if (gap_stats_counting(stats))
554 					update_gap_stats(stats, i);
555 				break;
556 			}
557 
558 			/*
559 			 * Skip subtree root.
560 			 */
561 			nodes++;
562 			radix /= BLIST_META_RADIX;
563 		}
564 		if (radix == BLIST_BMAP_RADIX) {
565 			/*
566 			 * Scan leaf.
567 			 */
568 			mask = bl->bl_root[nodes].bm_bitmap;
569 			diff = mask ^ (mask << 1);
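			/*
			 * diff has a bit set wherever the free/allocated
			 * state changes from the next lower position; for
			 * example, for mask 00111100 (8 bits for brevity),
			 * mask ^ (mask << 1) == 01000100, marking bit 2,
			 * where a free run starts, and bit 6, where the
			 * allocated run above it starts.
			 */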
570 			if (gap_stats_counting(stats))
571 				diff ^= 1;
572 			while (diff != 0) {
573 				bit = diff & -diff;
574 				update_gap_stats(stats, i + bitpos(bit));
575 				diff ^= bit;
576 			}
577 		}
578 		nodes += radix_to_skip(radix);
579 		i += radix;
580 	}
581 	update_gap_stats(stats, i);
582 	dump_gap_stats(stats, s);
583 }
584 
585 /************************************************************************
586  *			  ALLOCATION SUPPORT FUNCTIONS			*
587  ************************************************************************
588  *
589  *	These support functions do all the actual work.  They may seem
590  *	rather longish, but that's because I've commented them up.  The
591  *	actual code is straightforward.
592  *
593  */
594 
595 /*
596  * BLST_NEXT_LEAF_ALLOC() - allocate the first few blocks in the next leaf.
597  *
598  *	'scan' is a leaf node, associated with a block containing 'blk'.
599  *	The next leaf node could be adjacent, or several nodes away if the
600  *	least common ancestor of 'scan' and its neighbor is several levels
601  *	up.  Use 'blk' to determine how many meta-nodes lie between the
602  *	leaves.  If the next leaf has enough initial bits set, clear them
603  *	and clear the bits in the meta nodes on the path up to the least
604  *	common ancestor to mark any subtrees made completely empty.
605  */
606 static int
607 blst_next_leaf_alloc(blmeta_t *scan, daddr_t blk, int count)
608 {
609 	blmeta_t *next;
610 	u_daddr_t radix;
611 	int digit;
612 
613 	next = scan + 1;
614 	blk += BLIST_BMAP_RADIX;
615 	radix = BLIST_BMAP_RADIX;
616 	while ((digit = ((blk / radix) & BLIST_META_MASK)) == 0 &&
617 	    (next->bm_bitmap & 1) == 1) {
618 		next++;
619 		radix *= BLIST_META_RADIX;
620 	}
621 	if (((next->bm_bitmap + 1) & ~((u_daddr_t)-1 << count)) != 0) {
622 		/*
623 		 * The next leaf doesn't have enough free blocks at the
624 		 * beginning to complete the spanning allocation.
625 		 */
626 		return (ENOMEM);
627 	}
628 	/* Clear the first 'count' bits in the next leaf to allocate. */
629 	next->bm_bitmap &= (u_daddr_t)-1 << count;
630 
631 	/*
632 	 * Update bitmaps of next-ancestors, up to least common ancestor.
633 	 */
634 	while (next->bm_bitmap == 0) {
635 		if (--next == scan) {
636 			scan[-digit * radix_to_skip(radix)].bm_bitmap ^=
637 			    (u_daddr_t)1 << digit;
638 			break;
639 		}
640 		next->bm_bitmap ^= 1;
641 	}
642 	return (0);
643 }
644 
645 /*
646  * Given a bitmask, flip all the bits from the least-significant 1-bit to the
647  * most significant bit.  If the result is non-zero, then the least-significant
648  * 1-bit of the result is in the same position as the least-signification 0-bit
649  * 1-bit of the result is in the same position as the least-significant 0-bit
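 *
 * For example (using 8 bits for brevity): flip_hibits(00010110) ==
 * 11101000, whose least-significant 1-bit marks position 3, the lowest
 * 0-bit sitting above a 1-bit in the argument.  The result is zero iff
 * the argument is zero or its 1-bits form a single run ending at the
 * most significant bit, which lets the loop in blst_leaf_alloc() stop
 * early.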
650  */
651 static inline u_daddr_t
652 flip_hibits(u_daddr_t mask)
653 {
654 
655 	return (-mask & ~mask);
656 }
657 
658 /*
659  * BLST_LEAF_ALLOC() -	allocate at a leaf in the radix tree (a bitmap).
660  *
661  *	This function is the core of the allocator.  Its execution time is
662  *	proportional to log(count), plus height of the tree if the allocation
663  *	crosses a leaf boundary.
664  */
665 static daddr_t
666 blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count)
667 {
668 	u_daddr_t cursor_mask, mask;
669 	int count1, hi, lo, num_shifts, range1, range_ext;
670 
671 	range1 = 0;
672 	count1 = count - 1;
673 	num_shifts = fls(count1);
674 	mask = scan->bm_bitmap;
675 	while (flip_hibits(mask) != 0 && num_shifts > 0) {
676 		/*
677 		 * If bit i is set in mask, then bits in [i, i+range1] are set
678 		 * in scan->bm_bitmap.  The value of range1 is equal to count1
679 		 * >> num_shifts.  Grow range1 and reduce num_shifts to 0,
680 		 * while preserving these invariants.  The updates to mask
681 		 * leave fewer bits set, but each bit that remains set
682 		 * represents a longer string of consecutive bits set in
683 		 * scan->bm_bitmap.  If more updates to mask cannot clear more
684 		 * bits, because mask is partitioned with all 0 bits preceding
685 		 * all 1 bits, the loop terminates immediately.
686 		 */
687 		num_shifts--;
688 		range_ext = range1 + ((count1 >> num_shifts) & 1);
689 		/*
690 		 * mask is a signed quantity for the shift because when it is
691 		 * shifted right, the sign bit should be copied; when the last
692 		 * block of the leaf is free, pretend, for a while, that all the
693 		 * blocks that follow it are also free.
694 		 */
695 		mask &= (daddr_t)mask >> range_ext;
696 		range1 += range_ext;
697 	}
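	/*
	 * For example, for count == 3: count1 == 2 and num_shifts == 2,
	 * so the loop body runs twice (barring early termination), each
	 * time with range_ext == 1.  After "mask &= mask >> 1" is
	 * applied twice, bit i remains set in mask only if bits i
	 * through i + 2 were all set in scan->bm_bitmap, i.e. exactly
	 * where a run of 3 free blocks can start (with the sign
	 * extension standing in for blocks past the end of the leaf).
	 */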
698 	if (mask == 0) {
699 		/*
700 		 * Update bighint.  There is no allocation bigger than range1
701 		 * starting in this leaf.
702 		 */
703 		scan->bm_bighint = range1;
704 		return (SWAPBLK_NONE);
705 	}
706 
707 	/* Discard any candidates that appear before blk. */
708 	if ((blk & BLIST_BMAP_MASK) != 0) {
709 		cursor_mask = mask & bitrange(0, blk & BLIST_BMAP_MASK);
710 		if (cursor_mask != 0) {
711 			mask ^= cursor_mask;
712 			if (mask == 0)
713 				return (SWAPBLK_NONE);
714 
715 			/*
716 			 * Bighint change for last block allocation cannot
717 			 * assume that any other blocks are allocated, so the
718 			 * bighint cannot be reduced much.
719 			 */
720 			range1 = BLIST_MAX_ALLOC - 1;
721 		}
722 		blk &= ~BLIST_BMAP_MASK;
723 	}
724 
725 	/*
726 	 * The least significant set bit in mask marks the start of the first
727 	 * available range of sufficient size.  Clear all the bits but that one,
728 	 * and then find its position.
729 	 */
730 	mask &= -mask;
731 	lo = bitpos(mask);
732 
733 	hi = lo + count;
734 	if (hi > BLIST_BMAP_RADIX) {
735 		/*
736 		 * An allocation within this leaf is impossible, so a successful
737 		 * allocation depends on the next leaf providing some of the blocks.
738 		 */
739 		if (blst_next_leaf_alloc(scan, blk, hi - BLIST_BMAP_RADIX) != 0)
740 			/*
741 			 * The hint cannot be updated, because the same
742 			 * allocation request could be satisfied later, by this
743 			 * leaf, if the state of the next leaf changes, and
744 			 * without any changes to this leaf.
745 			 */
746 			return (SWAPBLK_NONE);
747 		hi = BLIST_BMAP_RADIX;
748 	}
749 
750 	/* Set the bits of mask at position 'lo' and higher. */
751 	mask = -mask;
752 	if (hi == BLIST_BMAP_RADIX) {
753 		/*
754 		 * Update bighint.  There is no allocation bigger than range1
755 		 * available in this leaf after this allocation completes.
756 		 */
757 		scan->bm_bighint = range1;
758 	} else {
759 		/* Clear the bits of mask at position 'hi' and higher. */
760 		mask &= (u_daddr_t)-1 >> (BLIST_BMAP_RADIX - hi);
761 	}
762 	/* Clear the allocated bits from this leaf. */
763 	scan->bm_bitmap &= ~mask;
764 	return (blk + lo);
765 }
766 
767 /*
768  * blist_meta_alloc() -	allocate at a meta node in the radix tree.
769  *
770  *	Attempt to allocate at a meta node.  If we can't, we update
771  *	bighint and return a failure.  Updating bighint optimizes future
772  *	calls that hit this node.  We have to check for our collapse cases
773  *	and we have a few optimizations strewn in as well.
774  */
775 static daddr_t
776 blst_meta_alloc(blmeta_t *scan, daddr_t cursor, daddr_t count, u_daddr_t radix)
777 {
778 	daddr_t blk, i, r, skip;
779 	u_daddr_t bit, mask;
780 	bool scan_from_start;
781 	int digit;
782 
783 	if (radix == BLIST_BMAP_RADIX)
784 		return (blst_leaf_alloc(scan, cursor, count));
785 	blk = cursor & -radix;
786 	scan_from_start = (cursor == blk);
787 	radix /= BLIST_META_RADIX;
788 	skip = radix_to_skip(radix);
789 	mask = scan->bm_bitmap;
790 
791 	/* Discard any candidates that appear before cursor. */
792 	digit = (cursor / radix) & BLIST_META_MASK;
793 	mask &= (u_daddr_t)-1 << digit;
794 	if (mask == 0)
795 		return (SWAPBLK_NONE);
796 
797 	/*
798 	 * If the first try is for a block that includes the cursor, pre-undo
799 	 * the digit * radix offset in the first call; otherwise, ignore the
800 	 * cursor entirely.
801 	 */
802 	if (((mask >> digit) & 1) == 1)
803 		cursor -= digit * radix;
804 	else
805 		cursor = blk;
806 
807 	/*
808 	 * Examine the nonempty subtree associated with each bit set in mask.
809 	 */
810 	do {
811 		bit = mask & -mask;
812 		digit = bitpos(bit);
813 		i = 1 + digit * skip;
814 		if (count <= scan[i].bm_bighint) {
815 			/*
816 			 * The allocation might fit beginning in the i'th subtree.
817 			 */
818 			r = blst_meta_alloc(&scan[i], cursor + digit * radix,
819 			    count, radix);
820 			if (r != SWAPBLK_NONE) {
821 				if (scan[i].bm_bitmap == 0)
822 					scan->bm_bitmap ^= bit;
823 				return (r);
824 			}
825 		}
826 		cursor = blk;
827 	} while ((mask ^= bit) != 0);
828 
829 	/*
830 	 * We couldn't allocate count in this subtree.  If the whole tree was
831 	 * scanned, and the last tree node is allocated, update bighint.
832 	 */
833 	if (scan_from_start && !(digit == BLIST_META_RADIX - 1 &&
834 	    scan[i].bm_bighint == BLIST_MAX_ALLOC))
835 		scan->bm_bighint = count - 1;
836 
837 	return (SWAPBLK_NONE);
838 }
839 
840 /*
841  * BLST_LEAF_FREE() -	free allocated block from leaf bitmap
842  *
843  */
844 static void
845 blst_leaf_free(blmeta_t *scan, daddr_t blk, int count)
846 {
847 	u_daddr_t mask;
848 
849 	/*
850 	 * free some data in this bitmap
851 	 * mask=0000111111111110000
852 	 *          \_________/\__/
853 	 *		count   n
854 	 */
855 	mask = bitrange(blk & BLIST_BMAP_MASK, count);
856 	KASSERT((scan->bm_bitmap & mask) == 0,
857 	    ("freeing free block: %jx, size %d, mask %jx",
858 	    (uintmax_t)blk, count, (uintmax_t)scan->bm_bitmap & mask));
859 	scan->bm_bitmap |= mask;
860 }
861 
862 /*
863  * BLST_META_FREE() - free allocated blocks from radix tree meta info
864  *
865  *	This support routine frees a range of blocks from the bitmap.
866  *	The range must be entirely enclosed by this radix node.  If a
867  *	meta node, we break the range down recursively to free blocks
868  *	in subnodes (which means that this code can free an arbitrary
869  *	range whereas the allocation code cannot allocate an arbitrary
870  *	range).
871  */
872 static void
873 blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, u_daddr_t radix)
874 {
875 	daddr_t blk, endBlk, i, skip;
876 	int digit, endDigit;
877 
878 	/*
879 	 * We could probably do a better job here.  We are required to make
880 	 * bighint at least as large as the biggest allocable block of data.
881 	 * If we just shoehorn it, a little extra overhead will be incurred
882 	 * on the next allocation (but only that one typically).
883 	 */
884 	scan->bm_bighint = BLIST_MAX_ALLOC;
885 
886 	if (radix == BLIST_BMAP_RADIX)
887 		return (blst_leaf_free(scan, freeBlk, count));
888 
889 	endBlk = ummin(freeBlk + count, (freeBlk + radix) & -radix);
890 	radix /= BLIST_META_RADIX;
891 	skip = radix_to_skip(radix);
892 	blk = freeBlk & -radix;
893 	digit = (blk / radix) & BLIST_META_MASK;
894 	endDigit = 1 + (((endBlk - 1) / radix) & BLIST_META_MASK);
895 	scan->bm_bitmap |= bitrange(digit, endDigit - digit);
896 	for (i = 1 + digit * skip; blk < endBlk; i += skip) {
897 		blk += radix;
898 		count = ummin(blk, endBlk) - freeBlk;
899 		blst_meta_free(&scan[i], freeBlk, count, radix);
900 		freeBlk = blk;
901 	}
902 }
903 
904 /*
905  * BLST_COPY() - copy one radix tree to another
906  *
907  *	Locates free space in the source tree and frees it in the destination
908  *	tree.  The space may not already be free in the destination.
909  */
910 static void
911 blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, blist_t dest,
912     daddr_t count)
913 {
914 	daddr_t endBlk, i, skip;
915 
916 	/*
917 	 * Leaf node
918 	 */
919 
920 	if (radix == BLIST_BMAP_RADIX) {
921 		u_daddr_t v = scan->bm_bitmap;
922 
923 		if (v == (u_daddr_t)-1) {
924 			blist_free(dest, blk, count);
925 		} else if (v != 0) {
926 			int i;
927 
928 			for (i = 0; i < count; ++i) {
929 				if (v & ((u_daddr_t)1 << i))
930 					blist_free(dest, blk + i, 1);
931 			}
932 		}
933 		return;
934 	}
935 
936 	/*
937 	 * Meta node
938 	 */
939 
940 	if (scan->bm_bitmap == 0) {
941 		/*
942 		 * Source all allocated, leave dest allocated
943 		 */
944 		return;
945 	}
946 
947 	endBlk = blk + count;
948 	radix /= BLIST_META_RADIX;
949 	skip = radix_to_skip(radix);
950 	for (i = 1; blk < endBlk; i += skip) {
951 		blk += radix;
952 		count = radix;
953 		if (blk >= endBlk)
954 			count -= blk - endBlk;
955 		blst_copy(&scan[i], blk - radix, radix, dest, count);
956 	}
957 }
958 
959 /*
960  * BLST_LEAF_FILL() -	allocate specific blocks in leaf bitmap
961  *
962  *	This routine allocates all blocks in the specified range
963  *	regardless of any existing allocations in that range.  Returns
964  *	the number of blocks allocated by the call.
965  */
966 static daddr_t
967 blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count)
968 {
969 	daddr_t nblks;
970 	u_daddr_t mask;
971 
972 	mask = bitrange(blk & BLIST_BMAP_MASK, count);
973 
974 	/* Count the number of blocks that we are allocating. */
975 	nblks = bitcount64(scan->bm_bitmap & mask);
976 
977 	scan->bm_bitmap &= ~mask;
978 	return (nblks);
979 }
980 
981 /*
982  * BLIST_META_FILL() -	allocate specific blocks at a meta node
983  *
984  *	This routine allocates the specified range of blocks,
985  *	regardless of any existing allocations in the range.  The
986  *	range must be within the extent of this node.  Returns the
987  *	number of blocks allocated by the call.
988  */
989 static daddr_t
990 blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count, u_daddr_t radix)
991 {
992 	daddr_t blk, endBlk, i, nblks, skip;
993 	int digit;
994 
995 	if (radix == BLIST_BMAP_RADIX)
996 		return (blst_leaf_fill(scan, allocBlk, count));
997 
998 	endBlk = ummin(allocBlk + count, (allocBlk + radix) & -radix);
999 	radix /= BLIST_META_RADIX;
1000 	skip = radix_to_skip(radix);
1001 	blk = allocBlk & -radix;
1002 	nblks = 0;
1003 	while (blk < endBlk) {
1004 		digit = (blk / radix) & BLIST_META_MASK;
1005 		i = 1 + digit * skip;
1006 		blk += radix;
1007 		count = ummin(blk, endBlk) - allocBlk;
1008 		nblks += blst_meta_fill(&scan[i], allocBlk, count, radix);
1009 		if (scan[i].bm_bitmap == 0)
1010 			scan->bm_bitmap &= ~((u_daddr_t)1 << digit);
1011 		allocBlk = blk;
1012 	}
1013 	return (nblks);
1014 }
1015 
1016 #ifdef BLIST_DEBUG
1017 
1018 static void
1019 blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int tab)
1020 {
1021 	daddr_t skip;
1022 	u_daddr_t bit, mask;
1023 	int digit;
1024 
1025 	if (radix == BLIST_BMAP_RADIX) {
1026 		printf(
1027 		    "%*.*s(%08llx,%lld): bitmap %0*llx big=%lld\n",
1028 		    tab, tab, "",
1029 		    (long long)blk, (long long)radix,
1030 		    1 + (BLIST_BMAP_RADIX - 1) / 4,
1031 		    (long long)scan->bm_bitmap,
1032 		    (long long)scan->bm_bighint
1033 		);
1034 		return;
1035 	}
1036 
1037 	printf(
1038 	    "%*.*s(%08llx): subtree (%lld/%lld) bitmap %0*llx big=%lld {\n",
1039 	    tab, tab, "",
1040 	    (long long)blk, (long long)radix,
1041 	    (long long)radix,
1042 	    1 + (BLIST_META_RADIX - 1) / 4,
1043 	    (long long)scan->bm_bitmap,
1044 	    (long long)scan->bm_bighint
1045 	);
1046 
1047 	radix /= BLIST_META_RADIX;
1048 	skip = radix_to_skip(radix);
1049 	tab += 4;
1050 
1051 	mask = scan->bm_bitmap;
1052 	/* Examine the nonempty subtree associated with each bit set in mask */
1053 	do {
1054 		bit = mask & -mask;
1055 		digit = bitpos(bit);
1056 		blst_radix_print(&scan[1 + digit * skip], blk + digit * radix,
1057 		    radix, tab);
1058 	} while ((mask ^= bit) != 0);
1059 	tab -= 4;
1060 
1061 	printf(
1062 	    "%*.*s}\n",
1063 	    tab, tab, ""
1064 	);
1065 }
1066 
1067 #endif
1068 
1069 #ifdef BLIST_DEBUG
1070 
1071 int
1072 main(int ac, char **av)
1073 {
1074 	int size = BLIST_META_RADIX * BLIST_BMAP_RADIX;
1075 	int i;
1076 	blist_t bl;
1077 	struct sbuf *s;
1078 
1079 	for (i = 1; i < ac; ++i) {
1080 		const char *ptr = av[i];
1081 		if (*ptr != '-') {
1082 			size = strtol(ptr, NULL, 0);
1083 			continue;
1084 		}
1085 		ptr += 2;
1086 		fprintf(stderr, "Bad option: %s\n", ptr - 2);
1087 		exit(1);
1088 	}
1089 	bl = blist_create(size, M_WAITOK);
1090 	blist_free(bl, 0, size);
1091 
1092 	for (;;) {
1093 		char buf[1024];
1094 		long long da = 0;
1095 		long long count = 0;
1096 
1097 		printf("%lld/%lld/%lld> ", (long long)blist_avail(bl),
1098 		    (long long)size, (long long)bl->bl_radix);
1099 		fflush(stdout);
1100 		if (fgets(buf, sizeof(buf), stdin) == NULL)
1101 			break;
1102 		switch (buf[0]) {
1103 		case 'r':
1104 			if (sscanf(buf + 1, "%lld", &count) == 1) {
1105 				blist_resize(&bl, count, 1, M_WAITOK);
1106 			} else {
1107 				printf("?\n");
1108 			}
			/* FALLTHROUGH: print the tree after a resize. */
1109 		case 'p':
1110 			blist_print(bl);
1111 			break;
1112 		case 's':
1113 			s = sbuf_new_auto();
1114 			blist_stats(bl, s);
1115 			sbuf_finish(s);
1116 			printf("%s", sbuf_data(s));
1117 			sbuf_delete(s);
1118 			break;
1119 		case 'a':
1120 			if (sscanf(buf + 1, "%lld", &count) == 1) {
1121 				daddr_t blk = blist_alloc(bl, count);
1122 				printf("    R=%08llx\n", (long long)blk);
1123 			} else {
1124 				printf("?\n");
1125 			}
1126 			break;
1127 		case 'f':
1128 			if (sscanf(buf + 1, "%llx %lld", &da, &count) == 2) {
1129 				blist_free(bl, da, count);
1130 			} else {
1131 				printf("?\n");
1132 			}
1133 			break;
1134 		case 'l':
1135 			if (sscanf(buf + 1, "%llx %lld", &da, &count) == 2) {
1136 				printf("    n=%jd\n",
1137 				    (intmax_t)blist_fill(bl, da, count));
1138 			} else {
1139 				printf("?\n");
1140 			}
1141 			break;
1142 		case '?':
1143 		case 'h':
1144 			puts(
1145 			    "p          -print\n"
1146 			    "s          -stats\n"
1147 			    "a %d       -allocate\n"
1148 			    "f %x %d    -free\n"
1149 			    "l %x %d    -fill\n"
1150 			    "r %d       -resize\n"
1151 			    "h/?        -help"
1152 			);
1153 			break;
1154 		default:
1155 			printf("?\n");
1156 			break;
1157 		}
1158 	}
1159 	return (0);
1160 }
1161 
1162 #endif
1163