xref: /freebsd/sys/kern/subr_blist.c (revision 27d172bb)
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1998 Matthew Dillon.  All Rights Reserved.
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. Neither the name of the University nor the names of its contributors
14  *    may be used to endorse or promote products derived from this software
15  *    without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
18  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
21  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
23  * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
25  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
26  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28  */
29 /*
30  * BLIST.C -	Bitmap allocator/deallocator, using a radix tree with hinting
31  *
32  *	This module implements a general bitmap allocator/deallocator.  The
33  *	allocator eats around 2 bits per 'block'.  The module does not
34  *	try to interpret the meaning of a 'block' other than to return
35  *	SWAPBLK_NONE on an allocation failure.
36  *
37  *	A radix tree controls access to pieces of the bitmap, and includes
38  *	auxiliary information at each interior node about the availabilty of
39  *	contiguous free blocks in the subtree rooted at that node.  Two radix
40  *	constants are involved: one for the size of the bitmaps contained in the
41  *	leaf nodes (BLIST_BMAP_RADIX), and one for the number of descendents of
42  *	each of the meta (interior) nodes (BLIST_META_RADIX).  Each subtree is
43  *	associated with a range of blocks.  The root of any subtree stores a
44  *	hint field that defines an upper bound on the size of the largest
45  *	allocation that can begin in the associated block range.  A hint is an
46  *	upper bound on a potential allocation, but not necessarily a tight upper
47  *	bound.
48  *
49  *	The bitmap field in each node directs the search for available blocks.
50  *	For a leaf node, a bit is set if the corresponding block is free.  For a
51  *	meta node, a bit is set if the corresponding subtree contains a free
52  *	block somewhere within it.  The search at a meta node considers only
53  *	children of that node that represent a range that includes a free block.
54  *
55  * 	The hinting greatly increases code efficiency for allocations while
56  *	the general radix structure optimizes both allocations and frees.  The
57  *	radix tree should be able to operate well no matter how much
58  *	fragmentation there is and no matter how large a bitmap is used.
59  *
60  *	The blist code wires all necessary memory at creation time.  Neither
61  *	allocations nor frees require interaction with the memory subsystem.
62  *	The non-blocking nature of allocations and frees is required by swap
63  *	code (vm/swap_pager.c).
64  *
65  *	LAYOUT: The radix tree is laid out recursively using a linear array.
66  *	Each meta node is immediately followed (laid out sequentially in
67  *	memory) by BLIST_META_RADIX lower level nodes.  This is a recursive
68  *	structure but one that can be easily scanned through a very simple
69  *	'skip' calculation.  The memory allocation is only large enough to
70  *	cover the number of blocks requested at creation time.  Nodes that
71  *	represent blocks beyond that limit, nodes that would never be read
72  *	or written, are not allocated, so that the last of the
73  *	BLIST_META_RADIX lower level nodes of a some nodes may not be
74  *	allocated.
75  *
76  *	NOTE: the allocator cannot currently allocate more than
77  *	BLIST_BMAP_RADIX blocks per call.  It will panic with 'allocation too
78  *	large' if you try.  This is an area that could use improvement.  The
79  *	radix is large enough that this restriction does not effect the swap
80  *	system, though.  Currently only the allocation code is affected by
81  *	this algorithmic unfeature.  The freeing code can handle arbitrary
82  *	ranges.
83  *
84  *	This code can be compiled stand-alone for debugging.
85  */
86 
87 #include <sys/cdefs.h>
88 __FBSDID("$FreeBSD$");
89 
90 #ifdef _KERNEL
91 
92 #include <sys/param.h>
93 #include <sys/systm.h>
94 #include <sys/lock.h>
95 #include <sys/kernel.h>
96 #include <sys/blist.h>
97 #include <sys/malloc.h>
98 #include <sys/sbuf.h>
99 #include <sys/proc.h>
100 #include <sys/mutex.h>
101 
102 #else
103 
104 #ifndef BLIST_NO_DEBUG
105 #define BLIST_DEBUG
106 #endif
107 
108 #include <sys/errno.h>
109 #include <sys/types.h>
110 #include <sys/malloc.h>
111 #include <sys/sbuf.h>
112 #include <stdio.h>
113 #include <string.h>
114 #include <stddef.h>
115 #include <stdlib.h>
116 #include <stdarg.h>
117 #include <stdbool.h>
118 
119 #define	bitcount64(x)	__bitcount64((uint64_t)(x))
120 #define malloc(a,b,c)	calloc(a, 1)
121 #define free(a,b)	free(a)
122 #define ummin(a,b)	((a) < (b) ? (a) : (b))
123 
124 #include <sys/blist.h>
125 
126 void panic(const char *ctl, ...);
127 
128 #endif
129 
130 /*
131  * static support functions
132  */
133 static daddr_t	blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count);
134 static daddr_t	blst_meta_alloc(blmeta_t *scan, daddr_t cursor, daddr_t count,
135 		    u_daddr_t radix);
136 static void blst_leaf_free(blmeta_t *scan, daddr_t relblk, int count);
137 static void blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count,
138 		    u_daddr_t radix);
139 static void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix,
140 		    blist_t dest, daddr_t count);
141 static daddr_t blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count);
142 static daddr_t blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count,
143 		    u_daddr_t radix);
144 #ifndef _KERNEL
145 static void	blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix,
146 		    int tab);
147 #endif
148 
149 #ifdef _KERNEL
150 static MALLOC_DEFINE(M_SWAP, "SWAP", "Swap space");
151 #endif
152 
153 _Static_assert(BLIST_BMAP_RADIX % BLIST_META_RADIX == 0,
154     "radix divisibility error");
155 #define	BLIST_BMAP_MASK	(BLIST_BMAP_RADIX - 1)
156 #define	BLIST_META_MASK	(BLIST_META_RADIX - 1)
157 
158 /*
159  * For a subtree that can represent the state of up to 'radix' blocks, the
160  * number of leaf nodes of the subtree is L=radix/BLIST_BMAP_RADIX.  If 'm'
161  * is short for BLIST_META_RADIX, then for a tree of height h with L=m**h
162  * leaf nodes, the total number of tree nodes is 1 + m + m**2 + ... + m**h,
163  * or, equivalently, (m**(h+1)-1)/(m-1).  This quantity is called 'skip'
164  * in the 'meta' functions that process subtrees.  Since integer division
165  * discards remainders, we can express this computation as
166  * skip = (m * m**h) / (m - 1)
167  * skip = (m * (radix / BLIST_BMAP_RADIX)) / (m - 1)
168  * and since m divides BLIST_BMAP_RADIX, we can simplify further to
169  * skip = (radix / (BLIST_BMAP_RADIX / m)) / (m - 1)
170  * skip = radix / ((BLIST_BMAP_RADIX / m) * (m - 1))
171  * so that simple integer division by a constant can safely be used for the
172  * calculation.
173  */
174 static inline daddr_t
175 radix_to_skip(daddr_t radix)
176 {
177 
178 	return (radix /
179 	    ((BLIST_BMAP_RADIX / BLIST_META_RADIX) * BLIST_META_MASK));
180 }
181 
182 /*
183  * Provide a mask with count bits set, starting as position n.
184  */
185 static inline u_daddr_t
186 bitrange(int n, int count)
187 {
188 
189 	return (((u_daddr_t)-1 << n) &
190 	    ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - (n + count))));
191 }
192 
193 
194 /*
195  * Use binary search, or a faster method, to find the 1 bit in a u_daddr_t.
196  * Assumes that the argument has only one bit set.
197  */
198 static inline int
199 bitpos(u_daddr_t mask)
200 {
201 	int hi, lo, mid;
202 
203 	switch (sizeof(mask)) {
204 #ifdef HAVE_INLINE_FFSLL
205 	case sizeof(long long):
206 		return (ffsll(mask) - 1);
207 #endif
208 	default:
209 		lo = 0;
210 		hi = BLIST_BMAP_RADIX;
211 		while (lo + 1 < hi) {
212 			mid = (lo + hi) >> 1;
213 			if ((mask >> mid) != 0)
214 				lo = mid;
215 			else
216 				hi = mid;
217 		}
218 		return (lo);
219 	}
220 }
221 
222 /*
223  * blist_create() - create a blist capable of handling up to the specified
224  *		    number of blocks
225  *
226  *	blocks - must be greater than 0
227  * 	flags  - malloc flags
228  *
229  *	The smallest blist consists of a single leaf node capable of
230  *	managing BLIST_BMAP_RADIX blocks.
231  */
232 blist_t
233 blist_create(daddr_t blocks, int flags)
234 {
235 	blist_t bl;
236 	u_daddr_t nodes, radix;
237 
238 	if (blocks == 0)
239 		panic("invalid block count");
240 
241 	/*
242 	 * Calculate the radix and node count used for scanning.
243 	 */
244 	nodes = 1;
245 	radix = BLIST_BMAP_RADIX;
246 	while (radix <= blocks) {
247 		nodes += 1 + (blocks - 1) / radix;
248 		radix *= BLIST_META_RADIX;
249 	}
250 
251 	bl = malloc(offsetof(struct blist, bl_root[nodes]), M_SWAP, flags |
252 	    M_ZERO);
253 	if (bl == NULL)
254 		return (NULL);
255 
256 	bl->bl_blocks = blocks;
257 	bl->bl_radix = radix;
258 
259 #if defined(BLIST_DEBUG)
260 	printf(
261 		"BLIST representing %lld blocks (%lld MB of swap)"
262 		", requiring %lldK of ram\n",
263 		(long long)bl->bl_blocks,
264 		(long long)bl->bl_blocks * 4 / 1024,
265 		(long long)(nodes * sizeof(blmeta_t) + 1023) / 1024
266 	);
267 	printf("BLIST raw radix tree contains %lld records\n",
268 	    (long long)nodes);
269 #endif
270 
271 	return (bl);
272 }
273 
274 void
275 blist_destroy(blist_t bl)
276 {
277 
278 	free(bl, M_SWAP);
279 }
280 
281 /*
282  * blist_alloc() -   reserve space in the block bitmap.  Return the base
283  *		     of a contiguous region or SWAPBLK_NONE if space could
284  *		     not be allocated.
285  */
286 daddr_t
287 blist_alloc(blist_t bl, daddr_t count)
288 {
289 	daddr_t blk, cursor;
290 
291 	if (count > BLIST_MAX_ALLOC)
292 		panic("allocation too large");
293 
294 	/*
295 	 * This loop iterates at most twice.  An allocation failure in the
296 	 * first iteration leads to a second iteration only if the cursor was
297 	 * non-zero.  When the cursor is zero, an allocation failure will
298 	 * stop further iterations.
299 	 */
300 	cursor = bl->bl_cursor;
301 	for (;;) {
302 		blk = blst_meta_alloc(bl->bl_root, cursor, count,
303 		    bl->bl_radix);
304 		if (blk != SWAPBLK_NONE) {
305 			bl->bl_avail -= count;
306 			bl->bl_cursor = blk + count;
307 			if (bl->bl_cursor == bl->bl_blocks)
308 				bl->bl_cursor = 0;
309 			return (blk);
310 		} else if (cursor == 0)
311 			return (SWAPBLK_NONE);
312 		cursor = 0;
313 	}
314 }
315 
316 /*
317  * blist_avail() -	return the number of free blocks.
318  */
319 daddr_t
320 blist_avail(blist_t bl)
321 {
322 
323 	return (bl->bl_avail);
324 }
325 
326 /*
327  * blist_free() -	free up space in the block bitmap.  Return the base
328  *		     	of a contiguous region.  Panic if an inconsistancy is
329  *			found.
330  */
331 void
332 blist_free(blist_t bl, daddr_t blkno, daddr_t count)
333 {
334 
335 	if (blkno < 0 || blkno + count > bl->bl_blocks)
336 		panic("freeing invalid range");
337 	blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix);
338 	bl->bl_avail += count;
339 }
340 
341 /*
342  * blist_fill() -	mark a region in the block bitmap as off-limits
343  *			to the allocator (i.e. allocate it), ignoring any
344  *			existing allocations.  Return the number of blocks
345  *			actually filled that were free before the call.
346  */
347 daddr_t
348 blist_fill(blist_t bl, daddr_t blkno, daddr_t count)
349 {
350 	daddr_t filled;
351 
352 	if (blkno < 0 || blkno + count > bl->bl_blocks)
353 		panic("filling invalid range");
354 	filled = blst_meta_fill(bl->bl_root, blkno, count, bl->bl_radix);
355 	bl->bl_avail -= filled;
356 	return (filled);
357 }
358 
359 /*
360  * blist_resize() -	resize an existing radix tree to handle the
361  *			specified number of blocks.  This will reallocate
362  *			the tree and transfer the previous bitmap to the new
363  *			one.  When extending the tree you can specify whether
364  *			the new blocks are to left allocated or freed.
365  */
366 void
367 blist_resize(blist_t *pbl, daddr_t count, int freenew, int flags)
368 {
369     blist_t newbl = blist_create(count, flags);
370     blist_t save = *pbl;
371 
372     *pbl = newbl;
373     if (count > save->bl_blocks)
374 	    count = save->bl_blocks;
375     blst_copy(save->bl_root, 0, save->bl_radix, newbl, count);
376 
377     /*
378      * If resizing upwards, should we free the new space or not?
379      */
380     if (freenew && count < newbl->bl_blocks) {
381 	    blist_free(newbl, count, newbl->bl_blocks - count);
382     }
383     blist_destroy(save);
384 }
385 
386 #ifdef BLIST_DEBUG
387 
388 /*
389  * blist_print()    - dump radix tree
390  */
391 void
392 blist_print(blist_t bl)
393 {
394 	printf("BLIST avail = %jd, cursor = %08jx {\n",
395 	    (uintmax_t)bl->bl_avail, (uintmax_t)bl->bl_cursor);
396 
397 	if (bl->bl_root->bm_bitmap != 0)
398 		blst_radix_print(bl->bl_root, 0, bl->bl_radix, 4);
399 	printf("}\n");
400 }
401 
402 #endif
403 
404 static const u_daddr_t fib[] = {
405 	1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584,
406 	4181, 6765, 10946, 17711, 28657, 46368, 75025, 121393, 196418, 317811,
407 	514229, 832040, 1346269, 2178309, 3524578,
408 };
409 
410 /*
411  * Use 'gap' to describe a maximal range of unallocated blocks/bits.
412  */
413 struct gap_stats {
414 	daddr_t	start;		/* current gap start, or SWAPBLK_NONE */
415 	daddr_t	num;		/* number of gaps observed */
416 	daddr_t	max;		/* largest gap size */
417 	daddr_t	avg;		/* average gap size */
418 	daddr_t	err;		/* sum - num * avg */
419 	daddr_t	histo[nitems(fib)]; /* # gaps in each size range */
420 	int	max_bucket;	/* last histo elt with nonzero val */
421 };
422 
423 /*
424  * gap_stats_counting()    - is the state 'counting 1 bits'?
425  *                           or 'skipping 0 bits'?
426  */
427 static inline bool
428 gap_stats_counting(const struct gap_stats *stats)
429 {
430 
431 	return (stats->start != SWAPBLK_NONE);
432 }
433 
434 /*
435  * init_gap_stats()    - initialize stats on gap sizes
436  */
437 static inline void
438 init_gap_stats(struct gap_stats *stats)
439 {
440 
441 	bzero(stats, sizeof(*stats));
442 	stats->start = SWAPBLK_NONE;
443 }
444 
445 /*
446  * update_gap_stats()    - update stats on gap sizes
447  */
448 static void
449 update_gap_stats(struct gap_stats *stats, daddr_t posn)
450 {
451 	daddr_t size;
452 	int hi, lo, mid;
453 
454 	if (!gap_stats_counting(stats)) {
455 		stats->start = posn;
456 		return;
457 	}
458 	size = posn - stats->start;
459 	stats->start = SWAPBLK_NONE;
460 	if (size > stats->max)
461 		stats->max = size;
462 
463 	/*
464 	 * Find the fibonacci range that contains size,
465 	 * expecting to find it in an early range.
466 	 */
467 	lo = 0;
468 	hi = 1;
469 	while (hi < nitems(fib) && fib[hi] <= size) {
470 		lo = hi;
471 		hi *= 2;
472 	}
473 	if (hi >= nitems(fib))
474 		hi = nitems(fib);
475 	while (lo + 1 != hi) {
476 		mid = (lo + hi) >> 1;
477 		if (fib[mid] <= size)
478 			lo = mid;
479 		else
480 			hi = mid;
481 	}
482 	stats->histo[lo]++;
483 	if (lo > stats->max_bucket)
484 		stats->max_bucket = lo;
485 	stats->err += size - stats->avg;
486 	stats->num++;
487 	stats->avg += stats->err / stats->num;
488 	stats->err %= stats->num;
489 }
490 
491 /*
492  * dump_gap_stats()    - print stats on gap sizes
493  */
494 static inline void
495 dump_gap_stats(const struct gap_stats *stats, struct sbuf *s)
496 {
497 	int i;
498 
499 	sbuf_printf(s, "number of maximal free ranges: %jd\n",
500 	    (intmax_t)stats->num);
501 	sbuf_printf(s, "largest free range: %jd\n", (intmax_t)stats->max);
502 	sbuf_printf(s, "average maximal free range size: %jd\n",
503 	    (intmax_t)stats->avg);
504 	sbuf_printf(s, "number of maximal free ranges of different sizes:\n");
505 	sbuf_printf(s, "               count  |  size range\n");
506 	sbuf_printf(s, "               -----  |  ----------\n");
507 	for (i = 0; i < stats->max_bucket; i++) {
508 		if (stats->histo[i] != 0) {
509 			sbuf_printf(s, "%20jd  |  ",
510 			    (intmax_t)stats->histo[i]);
511 			if (fib[i] != fib[i + 1] - 1)
512 				sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
513 				    (intmax_t)fib[i + 1] - 1);
514 			else
515 				sbuf_printf(s, "%jd\n", (intmax_t)fib[i]);
516 		}
517 	}
518 	sbuf_printf(s, "%20jd  |  ", (intmax_t)stats->histo[i]);
519 	if (stats->histo[i] > 1)
520 		sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
521 		    (intmax_t)stats->max);
522 	else
523 		sbuf_printf(s, "%jd\n", (intmax_t)stats->max);
524 }
525 
526 /*
527  * blist_stats()    - dump radix tree stats
528  */
529 void
530 blist_stats(blist_t bl, struct sbuf *s)
531 {
532 	struct gap_stats gstats;
533 	struct gap_stats *stats = &gstats;
534 	daddr_t i, nodes, radix;
535 	u_daddr_t bit, diff, mask;
536 
537 	init_gap_stats(stats);
538 	nodes = 0;
539 	i = bl->bl_radix;
540 	while (i < bl->bl_radix + bl->bl_blocks) {
541 		/*
542 		 * Find max size subtree starting at i.
543 		 */
544 		radix = BLIST_BMAP_RADIX;
545 		while (((i / radix) & BLIST_META_MASK) == 0)
546 			radix *= BLIST_META_RADIX;
547 
548 		/*
549 		 * Check for skippable subtrees starting at i.
550 		 */
551 		while (radix > BLIST_BMAP_RADIX) {
552 			if (bl->bl_root[nodes].bm_bitmap == 0) {
553 				if (gap_stats_counting(stats))
554 					update_gap_stats(stats, i);
555 				break;
556 			}
557 
558 			/*
559 			 * Skip subtree root.
560 			 */
561 			nodes++;
562 			radix /= BLIST_META_RADIX;
563 		}
564 		if (radix == BLIST_BMAP_RADIX) {
565 			/*
566 			 * Scan leaf.
567 			 */
568 			mask = bl->bl_root[nodes].bm_bitmap;
569 			diff = mask ^ (mask << 1);
570 			if (gap_stats_counting(stats))
571 				diff ^= 1;
572 			while (diff != 0) {
573 				bit = diff & -diff;
574 				update_gap_stats(stats, i + bitpos(bit));
575 				diff ^= bit;
576 			}
577 		}
578 		nodes += radix_to_skip(radix);
579 		i += radix;
580 	}
581 	update_gap_stats(stats, i);
582 	dump_gap_stats(stats, s);
583 }
584 
585 /************************************************************************
586  *			  ALLOCATION SUPPORT FUNCTIONS			*
587  ************************************************************************
588  *
589  *	These support functions do all the actual work.  They may seem
590  *	rather longish, but that's because I've commented them up.  The
591  *	actual code is straight forward.
592  *
593  */
594 
595 /*
596  * BLST_NEXT_LEAF_ALLOC() - allocate the first few blocks in the next leaf.
597  *
598  *	'scan' is a leaf node, associated with a block containing 'blk'.
599  *	The next leaf node could be adjacent, or several nodes away if the
600  *	least common ancestor of 'scan' and its neighbor is several levels
601  *	up.  Use 'blk' to determine how many meta-nodes lie between the
602  *	leaves.  If the next leaf has enough initial bits set, clear them
603  *	and clear the bits in the meta nodes on the path up to the least
604  *	common ancestor to mark any subtrees made completely empty.
605  */
606 static int
607 blst_next_leaf_alloc(blmeta_t *scan, daddr_t blk, int count)
608 {
609 	blmeta_t *next;
610 	daddr_t skip;
611 	u_daddr_t radix;
612 	int digit;
613 
614 	next = scan + 1;
615 	blk += BLIST_BMAP_RADIX;
616 	radix = BLIST_BMAP_RADIX;
617 	while ((digit = ((blk / radix) & BLIST_META_MASK)) == 0 &&
618 	    (next->bm_bitmap & 1) == 1) {
619 		next++;
620 		radix *= BLIST_META_RADIX;
621 	}
622 	if (((next->bm_bitmap + 1) & ~((u_daddr_t)-1 << count)) != 0) {
623 		/*
624 		 * The next leaf doesn't have enough free blocks at the
625 		 * beginning to complete the spanning allocation.
626 		 */
627 		return (ENOMEM);
628 	}
629 	/* Clear the first 'count' bits in the next leaf to allocate. */
630 	next->bm_bitmap &= (u_daddr_t)-1 << count;
631 
632 	/*
633 	 * Update bitmaps of next-ancestors, up to least common ancestor.
634 	 */
635 	skip = radix_to_skip(radix);
636 	while (radix != BLIST_BMAP_RADIX && next->bm_bitmap == 0) {
637 		(--next)->bm_bitmap ^= 1;
638 		radix /= BLIST_META_RADIX;
639 	}
640 	if (next->bm_bitmap == 0)
641 		scan[-digit * skip].bm_bitmap ^= (u_daddr_t)1 << digit;
642 	return (0);
643 }
644 
645 /*
646  * BLST_LEAF_ALLOC() -	allocate at a leaf in the radix tree (a bitmap).
647  *
648  *	This function is the core of the allocator.  Its execution time is
649  *	proportional to log(count), plus height of the tree if the allocation
650  *	crosses a leaf boundary.
651  */
652 static daddr_t
653 blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count)
654 {
655 	u_daddr_t cursor_mask, mask;
656 	int count1, hi, lo, num_shifts, range1, range_ext;
657 
658 	range1 = 0;
659 	count1 = count - 1;
660 	num_shifts = fls(count1);
661 	mask = scan->bm_bitmap;
662 	while ((-mask & ~mask) != 0 && num_shifts > 0) {
663 		/*
664 		 * If bit i is set in mask, then bits in [i, i+range1] are set
665 		 * in scan->bm_bitmap.  The value of range1 is equal to count1
666 		 * >> num_shifts.  Grow range1 and reduce num_shifts to 0,
667 		 * while preserving these invariants.  The updates to mask
668 		 * leave fewer bits set, but each bit that remains set
669 		 * represents a longer string of consecutive bits set in
670 		 * scan->bm_bitmap.  If more updates to mask cannot clear more
671 		 * bits, because mask is partitioned with all 0 bits preceding
672 		 * all 1 bits, the loop terminates immediately.
673 		 */
674 		num_shifts--;
675 		range_ext = range1 + ((count1 >> num_shifts) & 1);
676 		/*
677 		 * mask is a signed quantity for the shift because when it is
678 		 * shifted right, the sign bit should copied; when the last
679 		 * block of the leaf is free, pretend, for a while, that all the
680 		 * blocks that follow it are also free.
681 		 */
682 		mask &= (daddr_t)mask >> range_ext;
683 		range1 += range_ext;
684 	}
685 	if (mask == 0) {
686 		/*
687 		 * Update bighint.  There is no allocation bigger than range1
688 		 * starting in this leaf.
689 		 */
690 		scan->bm_bighint = range1;
691 		return (SWAPBLK_NONE);
692 	}
693 
694 	/* Discard any candidates that appear before blk. */
695 	if ((blk & BLIST_BMAP_MASK) != 0) {
696 		cursor_mask = mask & bitrange(0, blk & BLIST_BMAP_MASK);
697 		if (cursor_mask != 0) {
698 			mask ^= cursor_mask;
699 			if (mask == 0)
700 				return (SWAPBLK_NONE);
701 
702 			/*
703 			 * Bighint change for last block allocation cannot
704 			 * assume that any other blocks are allocated, so the
705 			 * bighint cannot be reduced much.
706 			 */
707 			range1 = BLIST_MAX_ALLOC - 1;
708 		}
709 		blk &= ~BLIST_BMAP_MASK;
710 	}
711 
712 	/*
713 	 * The least significant set bit in mask marks the start of the first
714 	 * available range of sufficient size.  Clear all the bits but that one,
715 	 * and then find its position.
716 	 */
717 	mask &= -mask;
718 	lo = bitpos(mask);
719 
720 	hi = lo + count;
721 	if (hi > BLIST_BMAP_RADIX) {
722 		/*
723 		 * An allocation within this leaf is impossible, so a successful
724 		 * allocation depends on the next leaf providing some of the blocks.
725 		 */
726 		if (blst_next_leaf_alloc(scan, blk, hi - BLIST_BMAP_RADIX) != 0)
727 			/*
728 			 * The hint cannot be updated, because the same
729 			 * allocation request could be satisfied later, by this
730 			 * leaf, if the state of the next leaf changes, and
731 			 * without any changes to this leaf.
732 			 */
733 			return (SWAPBLK_NONE);
734 		hi = BLIST_BMAP_RADIX;
735 	}
736 
737 	/* Set the bits of mask at position 'lo' and higher. */
738 	mask = -mask;
739 	if (hi == BLIST_BMAP_RADIX) {
740 		/*
741 		 * Update bighint.  There is no allocation bigger than range1
742 		 * available in this leaf after this allocation completes.
743 		 */
744 		scan->bm_bighint = range1;
745 	} else {
746 		/* Clear the bits of mask at position 'hi' and higher. */
747 		mask &= (u_daddr_t)-1 >> (BLIST_BMAP_RADIX - hi);
748 	}
749 	/* Clear the allocated bits from this leaf. */
750 	scan->bm_bitmap &= ~mask;
751 	return (blk + lo);
752 }
753 
754 /*
755  * blist_meta_alloc() -	allocate at a meta in the radix tree.
756  *
757  *	Attempt to allocate at a meta node.  If we can't, we update
758  *	bighint and return a failure.  Updating bighint optimize future
759  *	calls that hit this node.  We have to check for our collapse cases
760  *	and we have a few optimizations strewn in as well.
761  */
762 static daddr_t
763 blst_meta_alloc(blmeta_t *scan, daddr_t cursor, daddr_t count, u_daddr_t radix)
764 {
765 	daddr_t blk, i, r, skip;
766 	u_daddr_t bit, mask;
767 	bool scan_from_start;
768 	int digit;
769 
770 	if (radix == BLIST_BMAP_RADIX)
771 		return (blst_leaf_alloc(scan, cursor, count));
772 	blk = cursor & -radix;
773 	scan_from_start = (cursor == blk);
774 	radix /= BLIST_META_RADIX;
775 	skip = radix_to_skip(radix);
776 	mask = scan->bm_bitmap;
777 
778 	/* Discard any candidates that appear before cursor. */
779 	digit = (cursor / radix) & BLIST_META_MASK;
780 	mask &= (u_daddr_t)-1 << digit;
781 	if (mask == 0)
782 		return (SWAPBLK_NONE);
783 
784 	/*
785 	 * If the first try is for a block that includes the cursor, pre-undo
786 	 * the digit * radix offset in the first call; otherwise, ignore the
787 	 * cursor entirely.
788 	 */
789 	if (((mask >> digit) & 1) == 1)
790 		cursor -= digit * radix;
791 	else
792 		cursor = blk;
793 
794 	/*
795 	 * Examine the nonempty subtree associated with each bit set in mask.
796 	 */
797 	do {
798 		bit = mask & -mask;
799 		digit = bitpos(bit);
800 		i = 1 + digit * skip;
801 		if (count <= scan[i].bm_bighint) {
802 			/*
803 			 * The allocation might fit beginning in the i'th subtree.
804 			 */
805 			r = blst_meta_alloc(&scan[i], cursor + digit * radix,
806 			    count, radix);
807 			if (r != SWAPBLK_NONE) {
808 				if (scan[i].bm_bitmap == 0)
809 					scan->bm_bitmap ^= bit;
810 				return (r);
811 			}
812 		}
813 		cursor = blk;
814 	} while ((mask ^= bit) != 0);
815 
816 	/*
817 	 * We couldn't allocate count in this subtree.  If the whole tree was
818 	 * scanned, and the last tree node is allocated, update bighint.
819 	 */
820 	if (scan_from_start && !(digit == BLIST_META_RADIX - 1 &&
821 	    scan[i].bm_bighint == BLIST_MAX_ALLOC))
822 		scan->bm_bighint = count - 1;
823 
824 	return (SWAPBLK_NONE);
825 }
826 
827 /*
828  * BLST_LEAF_FREE() -	free allocated block from leaf bitmap
829  *
830  */
831 static void
832 blst_leaf_free(blmeta_t *scan, daddr_t blk, int count)
833 {
834 	u_daddr_t mask;
835 
836 	/*
837 	 * free some data in this bitmap
838 	 * mask=0000111111111110000
839 	 *          \_________/\__/
840 	 *		count   n
841 	 */
842 	mask = bitrange(blk & BLIST_BMAP_MASK, count);
843 	if (scan->bm_bitmap & mask)
844 		panic("freeing free block");
845 	scan->bm_bitmap |= mask;
846 }
847 
848 /*
849  * BLST_META_FREE() - free allocated blocks from radix tree meta info
850  *
851  *	This support routine frees a range of blocks from the bitmap.
852  *	The range must be entirely enclosed by this radix node.  If a
853  *	meta node, we break the range down recursively to free blocks
854  *	in subnodes (which means that this code can free an arbitrary
855  *	range whereas the allocation code cannot allocate an arbitrary
856  *	range).
857  */
858 static void
859 blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, u_daddr_t radix)
860 {
861 	daddr_t blk, endBlk, i, skip;
862 	int digit, endDigit;
863 
864 	/*
865 	 * We could probably do a better job here.  We are required to make
866 	 * bighint at least as large as the biggest allocable block of data.
867 	 * If we just shoehorn it, a little extra overhead will be incurred
868 	 * on the next allocation (but only that one typically).
869 	 */
870 	scan->bm_bighint = BLIST_MAX_ALLOC;
871 
872 	if (radix == BLIST_BMAP_RADIX)
873 		return (blst_leaf_free(scan, freeBlk, count));
874 
875 	endBlk = ummin(freeBlk + count, (freeBlk + radix) & -radix);
876 	radix /= BLIST_META_RADIX;
877 	skip = radix_to_skip(radix);
878 	blk = freeBlk & -radix;
879 	digit = (blk / radix) & BLIST_META_MASK;
880 	endDigit = 1 + (((endBlk - 1) / radix) & BLIST_META_MASK);
881 	scan->bm_bitmap |= bitrange(digit, endDigit - digit);
882 	for (i = 1 + digit * skip; blk < endBlk; i += skip) {
883 		blk += radix;
884 		count = ummin(blk, endBlk) - freeBlk;
885 		blst_meta_free(&scan[i], freeBlk, count, radix);
886 		freeBlk = blk;
887 	}
888 }
889 
890 /*
891  * BLST_COPY() - copy one radix tree to another
892  *
893  *	Locates free space in the source tree and frees it in the destination
894  *	tree.  The space may not already be free in the destination.
895  */
896 static void
897 blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, blist_t dest,
898     daddr_t count)
899 {
900 	daddr_t endBlk, i, skip;
901 
902 	/*
903 	 * Leaf node
904 	 */
905 
906 	if (radix == BLIST_BMAP_RADIX) {
907 		u_daddr_t v = scan->bm_bitmap;
908 
909 		if (v == (u_daddr_t)-1) {
910 			blist_free(dest, blk, count);
911 		} else if (v != 0) {
912 			int i;
913 
914 			for (i = 0; i < count; ++i) {
915 				if (v & ((u_daddr_t)1 << i))
916 					blist_free(dest, blk + i, 1);
917 			}
918 		}
919 		return;
920 	}
921 
922 	/*
923 	 * Meta node
924 	 */
925 
926 	if (scan->bm_bitmap == 0) {
927 		/*
928 		 * Source all allocated, leave dest allocated
929 		 */
930 		return;
931 	}
932 
933 	endBlk = blk + count;
934 	radix /= BLIST_META_RADIX;
935 	skip = radix_to_skip(radix);
936 	for (i = 1; blk < endBlk; i += skip) {
937 		blk += radix;
938 		count = radix;
939 		if (blk >= endBlk)
940 			count -= blk - endBlk;
941 		blst_copy(&scan[i], blk - radix, radix, dest, count);
942 	}
943 }
944 
945 /*
946  * BLST_LEAF_FILL() -	allocate specific blocks in leaf bitmap
947  *
948  *	This routine allocates all blocks in the specified range
949  *	regardless of any existing allocations in that range.  Returns
950  *	the number of blocks allocated by the call.
951  */
952 static daddr_t
953 blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count)
954 {
955 	daddr_t nblks;
956 	u_daddr_t mask;
957 
958 	mask = bitrange(blk & BLIST_BMAP_MASK, count);
959 
960 	/* Count the number of blocks that we are allocating. */
961 	nblks = bitcount64(scan->bm_bitmap & mask);
962 
963 	scan->bm_bitmap &= ~mask;
964 	return (nblks);
965 }
966 
967 /*
968  * BLIST_META_FILL() -	allocate specific blocks at a meta node
969  *
970  *	This routine allocates the specified range of blocks,
971  *	regardless of any existing allocations in the range.  The
972  *	range must be within the extent of this node.  Returns the
973  *	number of blocks allocated by the call.
974  */
975 static daddr_t
976 blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count, u_daddr_t radix)
977 {
978 	daddr_t blk, endBlk, i, nblks, skip;
979 	int digit;
980 
981 	if (radix == BLIST_BMAP_RADIX)
982 		return (blst_leaf_fill(scan, allocBlk, count));
983 
984 	endBlk = ummin(allocBlk + count, (allocBlk + radix) & -radix);
985 	radix /= BLIST_META_RADIX;
986 	skip = radix_to_skip(radix);
987 	blk = allocBlk & -radix;
988 	nblks = 0;
989 	while (blk < endBlk) {
990 		digit = (blk / radix) & BLIST_META_MASK;
991 		i = 1 + digit * skip;
992 		blk += radix;
993 		count = ummin(blk, endBlk) - allocBlk;
994 		nblks += blst_meta_fill(&scan[i], allocBlk, count, radix);
995 		if (scan[i].bm_bitmap == 0)
996 			scan->bm_bitmap &= ~((u_daddr_t)1 << digit);
997 		allocBlk = blk;
998 	}
999 	return (nblks);
1000 }
1001 
1002 #ifdef BLIST_DEBUG
1003 
1004 static void
1005 blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int tab)
1006 {
1007 	daddr_t skip;
1008 	u_daddr_t bit, mask;
1009 	int digit;
1010 
1011 	if (radix == BLIST_BMAP_RADIX) {
1012 		printf(
1013 		    "%*.*s(%08llx,%lld): bitmap %0*llx big=%lld\n",
1014 		    tab, tab, "",
1015 		    (long long)blk, (long long)radix,
1016 		    1 + (BLIST_BMAP_RADIX - 1) / 4,
1017 		    (long long)scan->bm_bitmap,
1018 		    (long long)scan->bm_bighint
1019 		);
1020 		return;
1021 	}
1022 
1023 	printf(
1024 	    "%*.*s(%08llx): subtree (%lld/%lld) bitmap %0*llx big=%lld {\n",
1025 	    tab, tab, "",
1026 	    (long long)blk, (long long)radix,
1027 	    (long long)radix,
1028 	    1 + (BLIST_META_RADIX - 1) / 4,
1029 	    (long long)scan->bm_bitmap,
1030 	    (long long)scan->bm_bighint
1031 	);
1032 
1033 	radix /= BLIST_META_RADIX;
1034 	skip = radix_to_skip(radix);
1035 	tab += 4;
1036 
1037 	mask = scan->bm_bitmap;
1038 	/* Examine the nonempty subtree associated with each bit set in mask */
1039 	do {
1040 		bit = mask & -mask;
1041 		digit = bitpos(bit);
1042 		blst_radix_print(&scan[1 + digit * skip], blk + digit * radix,
1043 		    radix, tab);
1044 	} while ((mask ^= bit) != 0);
1045 	tab -= 4;
1046 
1047 	printf(
1048 	    "%*.*s}\n",
1049 	    tab, tab, ""
1050 	);
1051 }
1052 
1053 #endif
1054 
1055 #ifdef BLIST_DEBUG
1056 
1057 int
1058 main(int ac, char **av)
1059 {
1060 	int size = BLIST_META_RADIX * BLIST_BMAP_RADIX;
1061 	int i;
1062 	blist_t bl;
1063 	struct sbuf *s;
1064 
1065 	for (i = 1; i < ac; ++i) {
1066 		const char *ptr = av[i];
1067 		if (*ptr != '-') {
1068 			size = strtol(ptr, NULL, 0);
1069 			continue;
1070 		}
1071 		ptr += 2;
1072 		fprintf(stderr, "Bad option: %s\n", ptr - 2);
1073 		exit(1);
1074 	}
1075 	bl = blist_create(size, M_WAITOK);
1076 	blist_free(bl, 0, size);
1077 
1078 	for (;;) {
1079 		char buf[1024];
1080 		long long da = 0;
1081 		long long count = 0;
1082 
1083 		printf("%lld/%lld/%lld> ", (long long)blist_avail(bl),
1084 		    (long long)size, (long long)bl->bl_radix);
1085 		fflush(stdout);
1086 		if (fgets(buf, sizeof(buf), stdin) == NULL)
1087 			break;
1088 		switch(buf[0]) {
1089 		case 'r':
1090 			if (sscanf(buf + 1, "%lld", &count) == 1) {
1091 				blist_resize(&bl, count, 1, M_WAITOK);
1092 			} else {
1093 				printf("?\n");
1094 			}
1095 		case 'p':
1096 			blist_print(bl);
1097 			break;
1098 		case 's':
1099 			s = sbuf_new_auto();
1100 			blist_stats(bl, s);
1101 			sbuf_finish(s);
1102 			printf("%s", sbuf_data(s));
1103 			sbuf_delete(s);
1104 			break;
1105 		case 'a':
1106 			if (sscanf(buf + 1, "%lld", &count) == 1) {
1107 				daddr_t blk = blist_alloc(bl, count);
1108 				printf("    R=%08llx\n", (long long)blk);
1109 			} else {
1110 				printf("?\n");
1111 			}
1112 			break;
1113 		case 'f':
1114 			if (sscanf(buf + 1, "%llx %lld", &da, &count) == 2) {
1115 				blist_free(bl, da, count);
1116 			} else {
1117 				printf("?\n");
1118 			}
1119 			break;
1120 		case 'l':
1121 			if (sscanf(buf + 1, "%llx %lld", &da, &count) == 2) {
1122 				printf("    n=%jd\n",
1123 				    (intmax_t)blist_fill(bl, da, count));
1124 			} else {
1125 				printf("?\n");
1126 			}
1127 			break;
1128 		case '?':
1129 		case 'h':
1130 			puts(
1131 			    "p          -print\n"
1132 			    "s          -stats\n"
1133 			    "a %d       -allocate\n"
1134 			    "f %x %d    -free\n"
1135 			    "l %x %d    -fill\n"
1136 			    "r %d       -resize\n"
1137 			    "h/?        -help"
1138 			);
1139 			break;
1140 		default:
1141 			printf("?\n");
1142 			break;
1143 		}
1144 	}
1145 	return(0);
1146 }
1147 
1148 void
1149 panic(const char *ctl, ...)
1150 {
1151 	va_list va;
1152 
1153 	va_start(va, ctl);
1154 	vfprintf(stderr, ctl, va);
1155 	fprintf(stderr, "\n");
1156 	va_end(va);
1157 	exit(1);
1158 }
1159 
1160 #endif
1161