/*-
 * Copyright (c) 1998 Matthew Dillon.  All Rights Reserved.
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
 * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
/*
 * BLIST.C -	Bitmap allocator/deallocator, using a radix tree with hinting
 *
 *	This module implements a general bitmap allocator/deallocator.  The
 *	allocator eats around 2 bits per 'block'.  The module does not
 *	try to interpret the meaning of a 'block' other than to return
 *	SWAPBLK_NONE on an allocation failure.
 *
 *	A radix tree controls access to pieces of the bitmap, and includes
 *	auxiliary information at each interior node about the availability of
 *	contiguous free blocks in the subtree rooted at that node.  Two radix
 *	constants are involved: one for the size of the bitmaps contained in the
 *	leaf nodes (BLIST_BMAP_RADIX), and one for the number of descendants of
 *	each of the meta (interior) nodes (BLIST_META_RADIX).  Each subtree is
 *	associated with a range of blocks.  The root of any subtree stores a
 *	hint field that defines an upper bound on the size of the largest
 *	allocation that can begin in the associated block range.  A hint is an
 *	upper bound on a potential allocation, but not necessarily a tight upper
 *	bound.
 *
 *	The radix tree also implements two collapsed states for meta nodes:
 *	the ALL-ALLOCATED state and the ALL-FREE state.  If a meta node is
 *	in either of these two states, all information contained underneath
 *	the node is considered stale.  These states are used to optimize
 *	allocation and freeing operations.
 *
 *	The hinting greatly increases code efficiency for allocations while
 *	the general radix structure optimizes both allocations and frees.  The
 *	radix tree should be able to operate well no matter how much
 *	fragmentation there is and no matter how large a bitmap is used.
 *
 *	The blist code wires all necessary memory at creation time.  Neither
 *	allocations nor frees require interaction with the memory subsystem.
 *	The non-blocking features of the blist code are used in the swap code
 *	(vm/swap_pager.c).
 *
 *	LAYOUT: The radix tree is laid out recursively using a
 *	linear array.  Each meta node is immediately followed (laid out
 *	sequentially in memory) by BLIST_META_RADIX lower-level nodes.  This
 *	is a recursive structure but one that can be easily scanned through
 *	a very simple 'skip' calculation.  In order to support large radixes,
 *	portions of the tree may reside outside our memory allocation.  We
 *	handle this with an early-termination optimization (when bighint is
 *	set to -1) on the scan.  The memory allocation is only large enough
 *	to cover the number of blocks requested at creation time, even if it
 *	must be encompassed in a larger root-node radix.
 *
 *	NOTE: the allocator cannot currently allocate more than
 *	BLIST_BMAP_RADIX blocks per call.  It will panic with 'allocation too
 *	large' if you try.  This is an area that could use improvement.  The
 *	radix is large enough that this restriction does not affect the swap
 *	system, though.  Currently only the allocation code is affected by
 *	this algorithmic unfeature.  The freeing code can handle arbitrary
 *	ranges.
 *
 *	This code can be compiled stand-alone for debugging.
 */
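
/*
 * Layout sketch (an illustration, assuming BLIST_META_RADIX == 4 for
 * brevity): if each child subtree consists of one meta node plus 4
 * leaves, then skip(child) == 5, and a meta node at index i is followed
 * by its children at indices i + 1, i + 6, i + 11, and i + 16; in
 * general, child c of the node at index i lives at i + 1 + c * skip(child).
 */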

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#ifdef _KERNEL

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/blist.h>
#include <sys/malloc.h>
#include <sys/sbuf.h>
#include <sys/proc.h>
#include <sys/mutex.h>

#else

#ifndef BLIST_NO_DEBUG
#define BLIST_DEBUG
#endif

#include <sys/types.h>
#include <sys/malloc.h>
#include <sys/sbuf.h>
#include <stdio.h>
#include <string.h>
#include <stddef.h>
#include <stdlib.h>
#include <stdarg.h>
#include <stdbool.h>

#define	bitcount64(x)	__bitcount64((uint64_t)(x))
#define malloc(a,b,c)	calloc(a, 1)
#define free(a,b)	free(a)
static __inline int imax(int a, int b) { return (a > b ? a : b); }

#include <sys/blist.h>

void panic(const char *ctl, ...);

#endif

/*
 * static support functions
 */
static daddr_t	blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count);
static daddr_t	blst_meta_alloc(blmeta_t *scan, daddr_t cursor, daddr_t count,
		    u_daddr_t radix);
static void blst_leaf_free(blmeta_t *scan, daddr_t relblk, int count);
static void blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count,
		    u_daddr_t radix);
static void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix,
		    blist_t dest, daddr_t count);
static daddr_t blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count);
static daddr_t blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count,
		    u_daddr_t radix);
#ifndef _KERNEL
static void	blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix,
		    int tab);
#endif

#ifdef _KERNEL
static MALLOC_DEFINE(M_SWAP, "SWAP", "Swap space");
#endif

_Static_assert(BLIST_BMAP_RADIX % BLIST_META_RADIX == 0,
    "radix divisibility error");
#define	BLIST_BMAP_MASK	(BLIST_BMAP_RADIX - 1)
#define	BLIST_META_MASK	(BLIST_META_RADIX - 1)

/*
 * For a subtree that can represent the state of up to 'radix' blocks, the
 * number of leaf nodes of the subtree is L=radix/BLIST_BMAP_RADIX.  If 'm'
 * is short for BLIST_META_RADIX, then for a tree of height h with L=m**h
 * leaf nodes, the total number of tree nodes is 1 + m + m**2 + ... + m**h,
 * or, equivalently, (m**(h+1)-1)/(m-1).  This quantity is called 'skip'
 * in the 'meta' functions that process subtrees.  Since integer division
 * discards remainders, we can express this computation as
 * skip = (m * m**h) / (m - 1)
 * skip = (m * (radix / BLIST_BMAP_RADIX)) / (m - 1)
 * and since m divides BLIST_BMAP_RADIX, we can simplify further to
 * skip = (radix / (BLIST_BMAP_RADIX / m)) / (m - 1)
 * skip = radix / ((BLIST_BMAP_RADIX / m) * (m - 1))
 * so that simple integer division by a constant can safely be used for the
 * calculation.
 */
static inline daddr_t
radix_to_skip(daddr_t radix)
{

	return (radix /
	    ((BLIST_BMAP_RADIX / BLIST_META_RADIX) * BLIST_META_MASK));
}
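
/*
 * Worked example (assuming a 64-bit u_daddr_t, so BLIST_BMAP_RADIX == 64,
 * and BLIST_META_RADIX == 16): a subtree with radix == 1024 has one meta
 * node and 1024/64 == 16 leaves, and the formula gives
 * skip = 1024 / ((64 / 16) * 15) == 17 == 1 + 16, as expected.
 */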

/*
 * Use binary search, or a faster method, to find the 1 bit in a u_daddr_t.
 * Assumes that the argument has only one bit set.
 */
static inline int
bitpos(u_daddr_t mask)
{
	int hi, lo, mid;

	switch (sizeof(mask)) {
#ifdef HAVE_INLINE_FFSLL
	case sizeof(long long):
		return (ffsll(mask) - 1);
#endif
	default:
		lo = 0;
		hi = BLIST_BMAP_RADIX;
		while (lo + 1 < hi) {
			mid = (lo + hi) >> 1;
			if ((mask >> mid) != 0)
				lo = mid;
			else
				hi = mid;
		}
		return (lo);
	}
}
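
/*
 * For example, bitpos(0x10) == 4: the binary search maintains the
 * invariant that the single set bit lies in [lo, hi) and halves that
 * range until only the bit's position remains.
 */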

/*
 * blist_create() - create a blist capable of handling up to the specified
 *		    number of blocks
 *
 *	blocks - must be greater than 0
 *	flags  - malloc flags
 *
 *	The smallest blist consists of a single leaf node capable of
 *	managing BLIST_BMAP_RADIX blocks.
 */
blist_t
blist_create(daddr_t blocks, int flags)
{
	blist_t bl;
	daddr_t i, last_block;
	u_daddr_t nodes, radix, skip;
	int digit;

	/*
	 * Calculate the radix and node count used for scanning.  Find the last
	 * block that is followed by a terminator.
	 */
	last_block = blocks - 1;
	radix = BLIST_BMAP_RADIX;
	while (radix < blocks) {
		if (((last_block / radix + 1) & BLIST_META_MASK) != 0)
			/*
			 * A terminator will be added.  Update last_block to the
			 * position just before that terminator.
			 */
			last_block |= radix - 1;
		radix *= BLIST_META_RADIX;
	}

	/*
	 * Count the meta-nodes in the expanded tree, including the final
	 * terminator, from the bottom level up to the root.
	 */
	nodes = (last_block >= blocks) ? 2 : 1;
	last_block /= BLIST_BMAP_RADIX;
	while (last_block > 0) {
		nodes += last_block + 1;
		last_block /= BLIST_META_RADIX;
	}
	bl = malloc(offsetof(struct blist, bl_root[nodes]), M_SWAP, flags |
	    M_ZERO);
	if (bl == NULL)
		return (NULL);

	bl->bl_blocks = blocks;
	bl->bl_radix = radix;
	bl->bl_cursor = 0;

	/*
	 * Initialize the empty tree by filling in root values, then initialize
	 * just the terminators in the rest of the tree.
	 */
	bl->bl_root[0].bm_bighint = 0;
	if (radix == BLIST_BMAP_RADIX)
		bl->bl_root[0].u.bmu_bitmap = 0;
	else
		bl->bl_root[0].u.bmu_avail = 0;
	last_block = blocks - 1;
	i = 0;
	while (radix > BLIST_BMAP_RADIX) {
		radix /= BLIST_META_RADIX;
		skip = radix_to_skip(radix);
		digit = last_block / radix;
		i += 1 + digit * skip;
		if (digit != BLIST_META_MASK) {
			/*
			 * Add a terminator.
			 */
			bl->bl_root[i + skip].bm_bighint = (daddr_t)-1;
			bl->bl_root[i + skip].u.bmu_bitmap = 0;
		}
		last_block %= radix;
	}

#if defined(BLIST_DEBUG)
	printf(
		"BLIST representing %lld blocks (%lld MB of swap)"
		", requiring %lldK of ram\n",
		(long long)bl->bl_blocks,
		(long long)bl->bl_blocks * 4 / 1024,
		(long long)(nodes * sizeof(blmeta_t) + 1023) / 1024
	);
	printf("BLIST raw radix tree contains %lld records\n",
	    (long long)nodes);
#endif

	return (bl);
}

void
blist_destroy(blist_t bl)
{

	free(bl, M_SWAP);
}

/*
 * blist_alloc() -   reserve space in the block bitmap.  Return the base
 *		     of a contiguous region or SWAPBLK_NONE if space could
 *		     not be allocated.
 */
daddr_t
blist_alloc(blist_t bl, daddr_t count)
{
	daddr_t blk;

	/*
	 * This loop iterates at most twice.  An allocation failure in the
	 * first iteration leads to a second iteration only if the cursor was
	 * non-zero.  When the cursor is zero, an allocation failure will
	 * reduce the hint, stopping further iterations.
	 */
	while (count <= bl->bl_root->bm_bighint) {
		blk = blst_meta_alloc(bl->bl_root, bl->bl_cursor, count,
		    bl->bl_radix);
		if (blk != SWAPBLK_NONE) {
			bl->bl_cursor = blk + count;
			if (bl->bl_cursor == bl->bl_blocks)
				bl->bl_cursor = 0;
			return (blk);
		} else if (bl->bl_cursor != 0)
			bl->bl_cursor = 0;
	}
	return (SWAPBLK_NONE);
}
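
/*
 * Usage sketch: a newly created blist starts with every block allocated,
 * so the creator must free the managed range before the first allocation,
 * just as main() below does in the stand-alone build.
 *
 *	blist_t bl = blist_create(1024, M_WAITOK);
 *	blist_free(bl, 0, 1024);		(mark all blocks free)
 *	daddr_t blk = blist_alloc(bl, 16);
 *	if (blk != SWAPBLK_NONE)
 *		blist_free(bl, blk, 16);	(release the region)
 *	blist_destroy(bl);
 */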

/*
 * blist_avail() -	return the number of free blocks.
 */
daddr_t
blist_avail(blist_t bl)
{

	if (bl->bl_radix == BLIST_BMAP_RADIX)
		return (bitcount64(bl->bl_root->u.bmu_bitmap));
	else
		return (bl->bl_root->u.bmu_avail);
}

/*
 * blist_free() -	free up space in the block bitmap.  The specified
 *			range must be entirely allocated.  Panic if an
 *			inconsistency is found.
 */
void
blist_free(blist_t bl, daddr_t blkno, daddr_t count)
{

	blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix);
}

/*
 * blist_fill() -	mark a region in the block bitmap as off-limits
 *			to the allocator (i.e. allocate it), ignoring any
 *			existing allocations.  Return the number of blocks
 *			actually filled that were free before the call.
 */
daddr_t
blist_fill(blist_t bl, daddr_t blkno, daddr_t count)
{

	return (blst_meta_fill(bl->bl_root, blkno, count, bl->bl_radix));
}

/*
 * blist_resize() -	resize an existing radix tree to handle the
 *			specified number of blocks.  This will reallocate
 *			the tree and transfer the previous bitmap to the new
 *			one.  When extending the tree you can specify whether
 *			the new blocks are to be left allocated or freed.
 */
void
blist_resize(blist_t *pbl, daddr_t count, int freenew, int flags)
{
	blist_t newbl = blist_create(count, flags);
	blist_t save = *pbl;

	*pbl = newbl;
	if (count > save->bl_blocks)
		count = save->bl_blocks;
	blst_copy(save->bl_root, 0, save->bl_radix, newbl, count);

	/*
	 * If resizing upwards, should we free the new space or not?
	 */
	if (freenew && count < newbl->bl_blocks) {
		blist_free(newbl, count, newbl->bl_blocks - count);
	}
	blist_destroy(save);
}

#ifdef BLIST_DEBUG

/*
 * blist_print()    - dump radix tree
 */
void
blist_print(blist_t bl)
{
	printf("BLIST cursor = %08jx {\n", (uintmax_t)bl->bl_cursor);
	blst_radix_print(bl->bl_root, 0, bl->bl_radix, 4);
	printf("}\n");
}

#endif

static const u_daddr_t fib[] = {
	1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584,
	4181, 6765, 10946, 17711, 28657, 46368, 75025, 121393, 196418, 317811,
	514229, 832040, 1346269, 2178309, 3524578,
};

/*
 * Use 'gap' to describe a maximal range of unallocated blocks/bits.
 */
struct gap_stats {
	daddr_t	start;		/* current gap start, or SWAPBLK_NONE */
	daddr_t	num;		/* number of gaps observed */
	daddr_t	max;		/* largest gap size */
	daddr_t	avg;		/* average gap size */
	daddr_t	err;		/* sum - num * avg */
	daddr_t	histo[nitems(fib)]; /* # gaps in each size range */
	int	max_bucket;	/* last histo elt with nonzero val */
};

/*
 * gap_stats_counting()    - is the state 'counting 1 bits' or
 *                           'skipping 0 bits'?
 */
static inline bool
gap_stats_counting(const struct gap_stats *stats)
{

	return (stats->start != SWAPBLK_NONE);
}

/*
 * init_gap_stats()    - initialize stats on gap sizes
 */
static inline void
init_gap_stats(struct gap_stats *stats)
{

	bzero(stats, sizeof(*stats));
	stats->start = SWAPBLK_NONE;
}

/*
 * update_gap_stats()    - update stats on gap sizes
 */
static void
update_gap_stats(struct gap_stats *stats, daddr_t posn)
{
	daddr_t size;
	int hi, lo, mid;

	if (!gap_stats_counting(stats)) {
		stats->start = posn;
		return;
	}
	size = posn - stats->start;
	stats->start = SWAPBLK_NONE;
	if (size > stats->max)
		stats->max = size;

	/*
	 * Find the Fibonacci range that contains size,
	 * expecting to find it in an early range.
	 */
	lo = 0;
	hi = 1;
	while (hi < nitems(fib) && fib[hi] <= size) {
		lo = hi;
		hi *= 2;
	}
	if (hi >= nitems(fib))
		hi = nitems(fib);
	while (lo + 1 != hi) {
		mid = (lo + hi) >> 1;
		if (fib[mid] <= size)
			lo = mid;
		else
			hi = mid;
	}
	stats->histo[lo]++;
	if (lo > stats->max_bucket)
		stats->max_bucket = lo;
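
	/*
	 * Update the running average incrementally, maintaining the
	 * invariants avg == floor(total / num) and err == total - num * avg
	 * (cf. the 'err' field comment), without storing a running total.
	 */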
	stats->err += size - stats->avg;
	stats->num++;
	stats->avg += stats->err / stats->num;
	stats->err %= stats->num;
}

/*
 * dump_gap_stats()    - print stats on gap sizes
 */
static inline void
dump_gap_stats(const struct gap_stats *stats, struct sbuf *s)
{
	int i;

	sbuf_printf(s, "number of maximal free ranges: %jd\n",
	    (intmax_t)stats->num);
	sbuf_printf(s, "largest free range: %jd\n", (intmax_t)stats->max);
	sbuf_printf(s, "average maximal free range size: %jd\n",
	    (intmax_t)stats->avg);
	sbuf_printf(s, "number of maximal free ranges of different sizes:\n");
	sbuf_printf(s, "               count  |  size range\n");
	sbuf_printf(s, "               -----  |  ----------\n");
	for (i = 0; i < stats->max_bucket; i++) {
		if (stats->histo[i] != 0) {
			sbuf_printf(s, "%20jd  |  ",
			    (intmax_t)stats->histo[i]);
			if (fib[i] != fib[i + 1] - 1)
				sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
				    (intmax_t)fib[i + 1] - 1);
			else
				sbuf_printf(s, "%jd\n", (intmax_t)fib[i]);
		}
	}
	sbuf_printf(s, "%20jd  |  ", (intmax_t)stats->histo[i]);
	if (stats->histo[i] > 1)
		sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
		    (intmax_t)stats->max);
	else
		sbuf_printf(s, "%jd\n", (intmax_t)stats->max);
}

/*
 * blist_stats()    - dump radix tree stats
 */
void
blist_stats(blist_t bl, struct sbuf *s)
{
	struct gap_stats gstats;
	struct gap_stats *stats = &gstats;
	daddr_t i, nodes, radix;
	u_daddr_t bit, diff, mask;

	init_gap_stats(stats);
	nodes = 0;
	i = bl->bl_radix;
	while (i < bl->bl_radix + bl->bl_blocks) {
		/*
		 * Find max size subtree starting at i.
		 */
		radix = BLIST_BMAP_RADIX;
		while (((i / radix) & BLIST_META_MASK) == 0)
			radix *= BLIST_META_RADIX;

		/*
		 * Check for skippable subtrees starting at i.
		 */
		while (radix > BLIST_BMAP_RADIX) {
			if (bl->bl_root[nodes].u.bmu_avail == 0) {
				if (gap_stats_counting(stats))
					update_gap_stats(stats, i);
				break;
			}
			if (bl->bl_root[nodes].u.bmu_avail == radix) {
				if (!gap_stats_counting(stats))
					update_gap_stats(stats, i);
				break;
			}

			/*
			 * Skip subtree root.
			 */
			nodes++;
			radix /= BLIST_META_RADIX;
		}
		if (radix == BLIST_BMAP_RADIX) {
			/*
			 * Scan leaf.
			 */
			mask = bl->bl_root[nodes].u.bmu_bitmap;
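			/*
			 * mask ^ (mask << 1) sets a bit at each position
			 * where a run of free (1) or allocated (0) bits
			 * begins, i.e., at every gap boundary within the
			 * leaf.
			 */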
			diff = mask ^ (mask << 1);
			if (gap_stats_counting(stats))
				diff ^= 1;
			while (diff != 0) {
				bit = diff & -diff;
				update_gap_stats(stats, i + bitpos(bit));
				diff ^= bit;
			}
		}
		nodes += radix_to_skip(radix);
		i += radix;
	}
	update_gap_stats(stats, i);
	dump_gap_stats(stats, s);
}

/************************************************************************
 *			  ALLOCATION SUPPORT FUNCTIONS			*
 ************************************************************************
 *
 *	These support functions do all the actual work.  They may seem
 *	rather longish, but that's because I've commented them up.  The
 *	actual code is straightforward.
 *
 */

/*
 * blst_leaf_alloc() -	allocate at a leaf in the radix tree (a bitmap).
 *
 *	This is the core of the allocator and is optimized for the
 *	BLIST_BMAP_RADIX block allocation case.  Otherwise, execution
 *	time is proportional to log2(count) + bitpos time.
 */
static daddr_t
blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count)
{
	u_daddr_t mask;
	int count1, hi, lo, num_shifts, range1, range_ext;

	range1 = 0;
	count1 = count - 1;
	num_shifts = fls(count1);
	mask = scan->u.bmu_bitmap;
	while ((-mask & ~mask) != 0 && num_shifts > 0) {
		/*
		 * If bit i is set in mask, then bits in [i, i + range1] are
		 * set in scan->u.bmu_bitmap.  The value of range1 is equal to
		 * count1 >> num_shifts.  Grow range1 and reduce num_shifts to
		 * 0, while preserving these invariants.  The updates to mask
		 * leave fewer bits set, but each bit that remains set
		 * represents a longer string of consecutive bits set in
		 * scan->u.bmu_bitmap.  If more updates to mask cannot clear
		 * more bits, because mask is partitioned with all 0 bits
		 * preceding all 1 bits, the loop terminates immediately.
		 */
		num_shifts--;
		range_ext = range1 + ((count1 >> num_shifts) & 1);
		/*
		 * mask is a signed quantity for the shift because when it is
		 * shifted right, the sign bit should be copied; when the last
		 * block of the leaf is free, pretend, for a while, that all
		 * the blocks that follow it are also free.
		 */
		mask &= (daddr_t)mask >> range_ext;
		range1 += range_ext;
	}
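
	/*
	 * Worked example: for count == 3, count1 == 2 and num_shifts == 2,
	 * so the loop above folds mask twice, each time with range_ext == 1.
	 * A bit is then set at position i only if bits i..i+2 were all set
	 * in the original bitmap; a zero mask below means no run of 3
	 * consecutive free blocks begins in this leaf.
	 */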
	if (mask == 0) {
		/*
		 * Update bighint.  There is no allocation bigger than range1
		 * starting in this leaf.
		 */
		scan->bm_bighint = range1;
		return (SWAPBLK_NONE);
	}

	/* Discard any candidates that appear before blk. */
	mask &= (u_daddr_t)-1 << (blk & BLIST_BMAP_MASK);
	if (mask == 0)
		return (SWAPBLK_NONE);

	/*
	 * The least significant set bit in mask marks the start of the first
	 * available range of sufficient size.  Clear all the bits but that one,
	 * and then find its position.
	 */
	mask &= -mask;
	lo = bitpos(mask);

	hi = lo + count;
	if (hi > BLIST_BMAP_RADIX) {
		/*
		 * An allocation within this leaf is impossible, so a successful
		 * allocation depends on the next leaf providing some of the
		 * blocks.
		 */
		if (((blk / BLIST_BMAP_RADIX + 1) & BLIST_META_MASK) == 0) {
			/*
			 * The next leaf has a different meta-node parent, so it
			 * is not necessarily initialized.  Update bighint,
			 * comparing the range found at the end of mask to the
			 * largest earlier range that could have been made to
			 * vanish in the initial processing of mask.
			 */
			scan->bm_bighint = imax(BLIST_BMAP_RADIX - lo, range1);
			return (SWAPBLK_NONE);
		}
		hi -= BLIST_BMAP_RADIX;
		if (((scan[1].u.bmu_bitmap + 1) & ~((u_daddr_t)-1 << hi)) != 0) {
			/*
			 * The next leaf doesn't have enough free blocks at the
			 * beginning to complete the spanning allocation.  The
			 * hint cannot be updated, because the same allocation
			 * request could be satisfied later, by this leaf, if
			 * the state of the next leaf changes, and without any
			 * changes to this leaf.
			 */
			return (SWAPBLK_NONE);
		}
		/* Clear the first 'hi' bits in the next leaf, allocating them. */
		scan[1].u.bmu_bitmap &= (u_daddr_t)-1 << hi;
		hi = BLIST_BMAP_RADIX;
	}

	/* Set the bits of mask at position 'lo' and higher. */
	mask = -mask;
	if (hi == BLIST_BMAP_RADIX) {
		/*
		 * Update bighint.  There is no allocation bigger than range1
		 * available in this leaf after this allocation completes.
		 */
		scan->bm_bighint = range1;
	} else {
		/* Clear the bits of mask at position 'hi' and higher. */
		mask &= (u_daddr_t)-1 >> (BLIST_BMAP_RADIX - hi);
		/* If this allocation uses all the bits, clear the hint. */
		if (mask == scan->u.bmu_bitmap)
			scan->bm_bighint = 0;
	}
	/* Clear the allocated bits from this leaf. */
	scan->u.bmu_bitmap &= ~mask;
	return ((blk & ~BLIST_BMAP_MASK) + lo);
}

/*
 * blst_meta_alloc() -	allocate at a meta node in the radix tree.
 *
 *	Attempt to allocate at a meta node.  If we can't, we update
 *	bighint and return a failure.  Updating bighint optimizes future
 *	calls that hit this node.  We have to check for our collapse cases
 *	and we have a few optimizations strewn in as well.
 */
static daddr_t
blst_meta_alloc(blmeta_t *scan, daddr_t cursor, daddr_t count, u_daddr_t radix)
{
	daddr_t blk, i, next_skip, r, skip;
	int child;
	bool scan_from_start;

	if (radix == BLIST_BMAP_RADIX)
		return (blst_leaf_alloc(scan, cursor, count));
	if (scan->u.bmu_avail < count) {
		/*
		 * The meta node's hint must be too large if the allocation
		 * exceeds the number of free blocks.  Reduce the hint, and
		 * return failure.
		 */
		scan->bm_bighint = scan->u.bmu_avail;
		return (SWAPBLK_NONE);
	}
	blk = cursor & -radix;
	skip = radix_to_skip(radix);
	next_skip = skip / BLIST_META_RADIX;

	/*
	 * An ALL-FREE meta node requires special handling before allocating
	 * any of its blocks.
	 */
	if (scan->u.bmu_avail == radix) {
		radix /= BLIST_META_RADIX;

		/*
		 * Reinitialize each of the meta node's children.  An ALL-FREE
		 * meta node cannot have a terminator in any subtree.
		 */
		for (i = 1; i < skip; i += next_skip) {
			if (next_skip == 1)
				scan[i].u.bmu_bitmap = (u_daddr_t)-1;
			else
				scan[i].u.bmu_avail = radix;
			scan[i].bm_bighint = radix;
		}
	} else {
		radix /= BLIST_META_RADIX;
	}

	if (count > radix) {
		/*
		 * The allocation exceeds the number of blocks that are
		 * managed by a subtree of this meta node.
		 */
		panic("allocation too large");
	}
	scan_from_start = cursor == blk;
	child = (cursor - blk) / radix;
	blk += child * radix;
	for (i = 1 + child * next_skip; i < skip; i += next_skip) {
		if (count <= scan[i].bm_bighint) {
			/*
			 * The allocation might fit beginning in the i'th subtree.
			 */
			r = blst_meta_alloc(&scan[i],
			    cursor > blk ? cursor : blk, count, radix);
			if (r != SWAPBLK_NONE) {
				scan->u.bmu_avail -= count;
				return (r);
			}
		} else if (scan[i].bm_bighint == (daddr_t)-1) {
			/*
			 * Terminator
			 */
			break;
		}
		blk += radix;
	}

	/*
	 * We couldn't allocate count in this subtree; update bighint.
	 */
	if (scan_from_start && scan->bm_bighint >= count)
		scan->bm_bighint = count - 1;

	return (SWAPBLK_NONE);
}

/*
 * BLST_LEAF_FREE() -	free allocated block from leaf bitmap
 *
 */
static void
blst_leaf_free(blmeta_t *scan, daddr_t blk, int count)
{
	u_daddr_t mask;
	int n;

	/*
	 * free some data in this bitmap
	 * mask=0000111111111110000
	 *          \_________/\__/
	 *		count   n
	 */
	n = blk & BLIST_BMAP_MASK;
	mask = ((u_daddr_t)-1 << n) &
	    ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - count - n));
	if (scan->u.bmu_bitmap & mask)
		panic("freeing free block");
	scan->u.bmu_bitmap |= mask;

	/*
	 * We could probably do a better job here.  We are required to make
	 * bighint at least as large as the biggest contiguous block of
	 * data.  If we just shoehorn it, a little extra overhead will
	 * be incurred on the next allocation (but only that one typically).
	 */
	scan->bm_bighint = BLIST_BMAP_RADIX;
}

/*
 * BLST_META_FREE() - free allocated blocks from radix tree meta info
 *
 *	This support routine frees a range of blocks from the bitmap.
 *	The range must be entirely enclosed by this radix node.  If a
 *	meta node, we break the range down recursively to free blocks
 *	in subnodes (which means that this code can free an arbitrary
 *	range whereas the allocation code cannot allocate an arbitrary
 *	range).
 */
static void
blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, u_daddr_t radix)
{
	daddr_t blk, i, next_skip, skip, v;
	int child;

	if (scan->bm_bighint == (daddr_t)-1)
		panic("freeing invalid range");
	if (radix == BLIST_BMAP_RADIX)
		return (blst_leaf_free(scan, freeBlk, count));
	skip = radix_to_skip(radix);
	next_skip = skip / BLIST_META_RADIX;

	if (scan->u.bmu_avail == 0) {
		/*
		 * ALL-ALLOCATED special case, with possible
		 * shortcut to ALL-FREE special case.
		 */
		scan->u.bmu_avail = count;
		scan->bm_bighint = count;

		if (count != radix) {
			for (i = 1; i < skip; i += next_skip) {
				if (scan[i].bm_bighint == (daddr_t)-1)
					break;
				scan[i].bm_bighint = 0;
				if (next_skip == 1) {
					scan[i].u.bmu_bitmap = 0;
				} else {
					scan[i].u.bmu_avail = 0;
				}
			}
			/* fall through */
		}
	} else {
		scan->u.bmu_avail += count;
		/* scan->bm_bighint = radix; */
	}

	/*
	 * ALL-FREE special case.
	 */

	if (scan->u.bmu_avail == radix)
		return;
	if (scan->u.bmu_avail > radix)
		panic("blst_meta_free: freeing already free blocks (%lld) %lld/%lld",
		    (long long)count, (long long)scan->u.bmu_avail,
		    (long long)radix);

	/*
	 * Break the free down into its components.
	 */

	blk = freeBlk & -radix;
	radix /= BLIST_META_RADIX;

	child = (freeBlk - blk) / radix;
	blk += child * radix;
	i = 1 + child * next_skip;
	while (i < skip && blk < freeBlk + count) {
		v = blk + radix - freeBlk;
		if (v > count)
			v = count;
		blst_meta_free(&scan[i], freeBlk, v, radix);
		if (scan->bm_bighint < scan[i].bm_bighint)
			scan->bm_bighint = scan[i].bm_bighint;
		count -= v;
		freeBlk += v;
		blk += radix;
		i += next_skip;
	}
}

/*
 * BLST_COPY() - copy one radix tree to another
 *
 *	Locates free space in the source tree and frees it in the destination
 *	tree.  The space may not already be free in the destination.
 */
static void
blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, blist_t dest,
    daddr_t count)
{
	daddr_t i, next_skip, skip;

	/*
	 * Leaf node
	 */

	if (radix == BLIST_BMAP_RADIX) {
		u_daddr_t v = scan->u.bmu_bitmap;

		if (v == (u_daddr_t)-1) {
			blist_free(dest, blk, count);
		} else if (v != 0) {
			int i;

			for (i = 0; i < BLIST_BMAP_RADIX && i < count; ++i) {
				if (v & ((u_daddr_t)1 << i))
					blist_free(dest, blk + i, 1);
			}
		}
		return;
	}

	/*
	 * Meta node
	 */

	if (scan->u.bmu_avail == 0) {
		/*
		 * Source all allocated, leave dest allocated
		 */
		return;
	}
	if (scan->u.bmu_avail == radix) {
		/*
		 * Source all free, free entire dest
		 */
		if (count < radix)
			blist_free(dest, blk, count);
		else
			blist_free(dest, blk, radix);
		return;
	}

	skip = radix_to_skip(radix);
	next_skip = skip / BLIST_META_RADIX;
	radix /= BLIST_META_RADIX;

	for (i = 1; count && i < skip; i += next_skip) {
		if (scan[i].bm_bighint == (daddr_t)-1)
			break;

		if (count >= radix) {
			blst_copy(&scan[i], blk, radix, dest, radix);
			count -= radix;
		} else {
			if (count) {
				blst_copy(&scan[i], blk, radix, dest, count);
			}
			count = 0;
		}
		blk += radix;
	}
}

/*
 * BLST_LEAF_FILL() -	allocate specific blocks in leaf bitmap
 *
 *	This routine allocates all blocks in the specified range
 *	regardless of any existing allocations in that range.  Returns
 *	the number of blocks allocated by the call.
 */
static daddr_t
blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count)
{
	daddr_t nblks;
	u_daddr_t mask;
	int n;

	n = blk & BLIST_BMAP_MASK;
	mask = ((u_daddr_t)-1 << n) &
	    ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - count - n));

	/* Count the number of blocks that we are allocating. */
	nblks = bitcount64(scan->u.bmu_bitmap & mask);

	scan->u.bmu_bitmap &= ~mask;
	return (nblks);
}

/*
 * BLST_META_FILL() -	allocate specific blocks at a meta node
 *
 *	This routine allocates the specified range of blocks,
 *	regardless of any existing allocations in the range.  The
 *	range must be within the extent of this node.  Returns the
 *	number of blocks allocated by the call.
 */
static daddr_t
blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count, u_daddr_t radix)
{
	daddr_t blk, i, nblks, next_skip, skip, v;
	int child;

	if (scan->bm_bighint == (daddr_t)-1)
		panic("filling invalid range");
	if (count > radix) {
		/*
		 * The allocation exceeds the number of blocks that are
		 * managed by this node.
		 */
		panic("fill too large");
	}
	if (radix == BLIST_BMAP_RADIX)
		return (blst_leaf_fill(scan, allocBlk, count));
	if (count == radix || scan->u.bmu_avail == 0) {
		/*
		 * ALL-ALLOCATED special case
		 */
		nblks = scan->u.bmu_avail;
		scan->u.bmu_avail = 0;
		scan->bm_bighint = 0;
		return (nblks);
	}
	skip = radix_to_skip(radix);
	next_skip = skip / BLIST_META_RADIX;
	blk = allocBlk & -radix;

	/*
	 * An ALL-FREE meta node requires special handling before allocating
	 * any of its blocks.
	 */
	if (scan->u.bmu_avail == radix) {
		radix /= BLIST_META_RADIX;

		/*
		 * Reinitialize each of the meta node's children.  An ALL-FREE
		 * meta node cannot have a terminator in any subtree.
		 */
		for (i = 1; i < skip; i += next_skip) {
			if (next_skip == 1)
				scan[i].u.bmu_bitmap = (u_daddr_t)-1;
			else
				scan[i].u.bmu_avail = radix;
			scan[i].bm_bighint = radix;
		}
	} else {
		radix /= BLIST_META_RADIX;
	}

	nblks = 0;
	child = (allocBlk - blk) / radix;
	blk += child * radix;
	i = 1 + child * next_skip;
	while (i < skip && blk < allocBlk + count) {
		v = blk + radix - allocBlk;
		if (v > count)
			v = count;
		nblks += blst_meta_fill(&scan[i], allocBlk, v, radix);
		count -= v;
		allocBlk += v;
		blk += radix;
		i += next_skip;
	}
	scan->u.bmu_avail -= nblks;
	return (nblks);
}

#ifdef BLIST_DEBUG

static void
blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int tab)
{
	daddr_t i, next_skip, skip;

	if (radix == BLIST_BMAP_RADIX) {
		printf(
		    "%*.*s(%08llx,%lld): bitmap %016llx big=%lld\n",
		    tab, tab, "",
		    (long long)blk, (long long)radix,
		    (long long)scan->u.bmu_bitmap,
		    (long long)scan->bm_bighint
		);
		return;
	}

	if (scan->u.bmu_avail == 0) {
		printf(
		    "%*.*s(%08llx,%lld) ALL ALLOCATED\n",
		    tab, tab, "",
		    (long long)blk,
		    (long long)radix
		);
		return;
	}
	if (scan->u.bmu_avail == radix) {
		printf(
		    "%*.*s(%08llx,%lld) ALL FREE\n",
		    tab, tab, "",
		    (long long)blk,
		    (long long)radix
		);
		return;
	}

	printf(
	    "%*.*s(%08llx,%lld): subtree (%lld/%lld) big=%lld {\n",
	    tab, tab, "",
	    (long long)blk, (long long)radix,
	    (long long)scan->u.bmu_avail,
	    (long long)radix,
	    (long long)scan->bm_bighint
	);

	skip = radix_to_skip(radix);
	next_skip = skip / BLIST_META_RADIX;
	radix /= BLIST_META_RADIX;
	tab += 4;

	for (i = 1; i < skip; i += next_skip) {
		if (scan[i].bm_bighint == (daddr_t)-1) {
			printf(
			    "%*.*s(%08llx,%lld): Terminator\n",
			    tab, tab, "",
			    (long long)blk, (long long)radix
			);
			break;
		}
		blst_radix_print(&scan[i], blk, radix, tab);
		blk += radix;
	}
	tab -= 4;

	printf(
	    "%*.*s}\n",
	    tab, tab, ""
	);
}

#endif

#ifdef BLIST_DEBUG

int
main(int ac, char **av)
{
	int size = 1024;
	int i;
	blist_t bl;
	struct sbuf *s;

	for (i = 1; i < ac; ++i) {
		const char *ptr = av[i];
		if (*ptr != '-') {
			size = strtol(ptr, NULL, 0);
			continue;
		}
		ptr += 2;
		fprintf(stderr, "Bad option: %s\n", ptr - 2);
		exit(1);
	}
	bl = blist_create(size, M_WAITOK);
	blist_free(bl, 0, size);

	for (;;) {
		char buf[1024];
		long long da = 0;
		long long count = 0;

		printf("%lld/%lld/%lld> ", (long long)blist_avail(bl),
		    (long long)size, (long long)bl->bl_radix);
		fflush(stdout);
		if (fgets(buf, sizeof(buf), stdin) == NULL)
			break;
		switch (buf[0]) {
		case 'r':
			if (sscanf(buf + 1, "%lld", &count) == 1) {
				blist_resize(&bl, count, 1, M_WAITOK);
			} else {
				printf("?\n");
			}
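			/* Note: falls through to 'p', printing the resized tree. */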
		case 'p':
			blist_print(bl);
			break;
		case 's':
			s = sbuf_new_auto();
			blist_stats(bl, s);
			sbuf_finish(s);
			printf("%s", sbuf_data(s));
			sbuf_delete(s);
			break;
		case 'a':
			if (sscanf(buf + 1, "%lld", &count) == 1) {
				daddr_t blk = blist_alloc(bl, count);
				printf("    R=%08llx\n", (long long)blk);
			} else {
				printf("?\n");
			}
			break;
		case 'f':
			if (sscanf(buf + 1, "%llx %lld", &da, &count) == 2) {
				blist_free(bl, da, count);
			} else {
				printf("?\n");
			}
			break;
		case 'l':
			if (sscanf(buf + 1, "%llx %lld", &da, &count) == 2) {
				printf("    n=%jd\n",
				    (intmax_t)blist_fill(bl, da, count));
			} else {
				printf("?\n");
			}
			break;
		case '?':
		case 'h':
			puts(
			    "p          -print\n"
			    "s          -stats\n"
			    "a %d       -allocate\n"
			    "f %x %d    -free\n"
			    "l %x %d    -fill\n"
			    "r %d       -resize\n"
			    "h/?        -help"
			);
			break;
		default:
			printf("?\n");
			break;
		}
	}
	return (0);
}

void
panic(const char *ctl, ...)
{
	va_list va;

	va_start(va, ctl);
	vfprintf(stderr, ctl, va);
	fprintf(stderr, "\n");
	va_end(va);
	exit(1);
}

#endif