/*
 * BLIST.C -	Bitmap allocator/deallocator, using a radix tree with hinting
 *
 * Copyright (c) 1998,2004 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *
 *	This module implements a general bitmap allocator/deallocator.  The
 *	allocator eats around 2 bits per 'block'.  The module does not
 *	try to interpret the meaning of a 'block' other than to return
 *	SWAPBLK_NONE on an allocation failure.
 *
 *	A radix tree is used to maintain the bitmap.  Two radix constants are
 *	involved:  One for the bitmaps contained in the leaf nodes (typically
 *	32), and one for the meta nodes (typically 16).  Both meta and leaf
 *	nodes have a hint field.  This field gives us a hint as to the largest
 *	free contiguous range of blocks under the node.  It may contain a
 *	value that is too high, but will never contain a value that is too
 *	low.  When the radix tree is searched, allocation failures in subtrees
 *	update the hint.
 *
 *	The radix tree also implements two collapsed states for meta nodes:
 *	the ALL-ALLOCATED state and the ALL-FREE state.  If a meta node is
 *	in either of these two states, all information contained underneath
 *	the node is considered stale.  These states are used to optimize
 *	allocation and freeing operations.
 *
 *	The hinting greatly increases code efficiency for allocations while
 *	the general radix structure optimizes both allocations and frees.  The
 *	radix tree should be able to operate well no matter how much
 *	fragmentation there is and no matter how large a bitmap is used.
 *
 *	Unlike the rlist code, the blist code wires all necessary memory at
 *	creation time.  Neither allocations nor frees require interaction with
 *	the memory subsystem.  In contrast, the rlist code may allocate memory
 *	on an rlist_free() call.  The non-blocking features of the blist code
 *	are used to great advantage in the swap code (vm/nswap_pager.c).  The
 *	rlist code uses a little less overall memory than the blist code (but
 *	due to swap interleaving not all that much less), but the blist code
 *	scales much, much better.
 *
 *	LAYOUT: The radix tree is laid out recursively using a
 *	linear array.  Each meta node is immediately followed (laid out
 *	sequentially in memory) by BLIST_META_RADIX lower level nodes.  This
 *	is a recursive structure but one that can be easily scanned through
 *	a very simple 'skip' calculation.  In order to support large radixes,
 *	portions of the tree may reside outside our memory allocation.  We
 *	handle this with an early-termination optimization (when bighint is
 *	set to -1) on the scan.  The memory allocation is only large enough
 *	to cover the number of blocks requested at creation time even if it
 *	must be encompassed in a larger root-node radix.
 *
 *	NOTE: The allocator cannot currently allocate more than
 *	BLIST_BMAP_RADIX blocks per call.  It will panic with 'allocation too
 *	large' if you try.  This is an area that could use improvement.  The
 *	radix is large enough that this restriction does not affect the swap
 *	system, though.  Currently only the allocation code is affected by
 *	this algorithmic unfeature.  The freeing code can handle arbitrary
 *	ranges.
 *
 *	NOTE: The radix may exceed BLIST_BMAP_RADIX bits in order to support
 *	      up to 2^(BLIST_BMAP_RADIX-1) blocks.  The first division will
 *	      drop the radix down and fit it within a signed BLIST_BMAP_RADIX
 *	      bit integer.
 *
 *	This code can be compiled stand-alone for debugging.
 */
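
/*
 * Example of the skip calculation (illustrative, using the typical radix
 * constants BLIST_BMAP_RADIX = 32 and BLIST_META_RADIX = 16): a request
 * for 2048 blocks iterates
 *
 *	radix = 32	skip = 0	(a single leaf is too small)
 *	radix = 512	skip = 16	(one meta level)
 *	radix = 8192	skip = 272	(two meta levels, covers 2048)
 *
 * The root meta node occupies bl_root[0] and is followed by its subtrees
 * at a stride of next_skip = 272 / 16 = 17 (indices 1, 18, 35, ...).
 * Each of those meta nodes (radix 512, skip 16) is in turn followed by
 * its 16 leaves at a stride of 1.
 */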

#ifdef _KERNEL

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/blist.h>
#include <sys/malloc.h>

#else

#ifndef BLIST_NO_DEBUG
#define BLIST_DEBUG
#endif

#define SWAPBLK_NONE ((swblk_t)-1)

#include <sys/types.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <stdarg.h>

#define kmalloc(a,b,c)	malloc(a)
#define kfree(a,b)	free(a)
#define kprintf		printf
#define KKASSERT(exp)

#include <sys/blist.h>

void panic(const char *ctl, ...);

#endif
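
/*
 * A stand-alone debug build of this file can look something like the
 * following (illustrative only; the exact include path for <sys/blist.h>
 * depends on where your source tree lives):
 *
 *	cc -O -I/usr/src/sys -o blist subr_blist.c
 */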

/*
 * static support functions
 */

static swblk_t blst_leaf_alloc(blmeta_t *scan, swblk_t blkat,
				swblk_t blk, swblk_t count);
static swblk_t blst_meta_alloc(blmeta_t *scan, swblk_t blkat,
				swblk_t blk, swblk_t count,
				int64_t radix, swblk_t skip);
static void blst_leaf_free(blmeta_t *scan, swblk_t relblk, swblk_t count);
static void blst_meta_free(blmeta_t *scan, swblk_t freeBlk, swblk_t count,
					int64_t radix, swblk_t skip,
					swblk_t blk);
static swblk_t blst_leaf_fill(blmeta_t *scan, swblk_t blk, swblk_t count);
static swblk_t blst_meta_fill(blmeta_t *scan, swblk_t fillBlk, swblk_t count,
					int64_t radix, swblk_t skip,
					swblk_t blk);
static void blst_copy(blmeta_t *scan, swblk_t blk, int64_t radix,
				swblk_t skip, blist_t dest, swblk_t count);
static swblk_t	blst_radix_init(blmeta_t *scan, int64_t radix,
						swblk_t skip, swblk_t count);
#ifndef _KERNEL
static void	blst_radix_print(blmeta_t *scan, swblk_t blk,
					int64_t radix, swblk_t skip, int tab);
#endif

#ifdef _KERNEL
static MALLOC_DEFINE(M_SWAP, "SWAP", "Swap space");
#endif

/*
 * blist_create() - create a blist capable of handling up to the specified
 *		    number of blocks
 *
 *	blocks must be greater than 0
 *
 *	The smallest blist consists of a single leaf node capable of
 *	managing BLIST_BMAP_RADIX blocks.
 */

blist_t
blist_create(swblk_t blocks)
{
	blist_t bl;
	int64_t radix;
	swblk_t skip = 0;

	/*
	 * Calculate radix and skip field used for scanning.
	 *
	 * Radix can exceed BLIST_BMAP_RADIX bits even if swblk_t is limited
	 * to BLIST_BMAP_RADIX bits.
	 */
	radix = BLIST_BMAP_RADIX;

	while (radix < blocks) {
		radix *= BLIST_META_RADIX;
		skip = (skip + 1) * BLIST_META_RADIX;
		KKASSERT(skip > 0);
	}

	bl = kmalloc(sizeof(struct blist), M_SWAP, M_WAITOK | M_ZERO);

	bl->bl_blocks = blocks;
	bl->bl_radix = radix;
	bl->bl_skip = skip;
	bl->bl_rootblks = 1 +
	    blst_radix_init(NULL, bl->bl_radix, bl->bl_skip, blocks);
	bl->bl_root = kmalloc(sizeof(blmeta_t) * bl->bl_rootblks,
			      M_SWAP, M_WAITOK);

#if defined(BLIST_DEBUG)
	kprintf(
		"BLIST representing %d blocks (%d MB of swap)"
		", requiring %dK of ram\n",
		bl->bl_blocks,
		bl->bl_blocks * 4 / 1024,
		(int)((bl->bl_rootblks * sizeof(blmeta_t) + 1023) / 1024)
	);
	kprintf("BLIST raw radix tree contains %d records\n", bl->bl_rootblks);
#endif
	blst_radix_init(bl->bl_root, bl->bl_radix, bl->bl_skip, blocks);

	return(bl);
}
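
/*
 * Usage sketch (illustrative; mirrors the stand-alone test code at the
 * bottom of this file).  Note that a freshly created blist starts out
 * fully allocated and must be explicitly freed before use:
 *
 *	blist_t bl = blist_create(1024);
 *	blist_free(bl, 0, 1024);		(mark the whole range free)
 *	swblk_t blk = blist_alloc(bl, 16);	(reserve 16 contiguous blocks)
 *	if (blk != SWAPBLK_NONE)
 *		blist_free(bl, blk, 16);	(release them again)
 *	blist_destroy(bl);
 */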

void
blist_destroy(blist_t bl)
{
	kfree(bl->bl_root, M_SWAP);
	kfree(bl, M_SWAP);
}

/*
 * blist_alloc() - reserve space in the block bitmap.  Return the base
 *		     of a contiguous region or SWAPBLK_NONE if space could
 *		     not be allocated.
 */

swblk_t
blist_alloc(blist_t bl, swblk_t count)
{
	swblk_t blk = SWAPBLK_NONE;

	if (bl) {
		if (bl->bl_radix == BLIST_BMAP_RADIX)
			blk = blst_leaf_alloc(bl->bl_root, 0, 0, count);
		else
			blk = blst_meta_alloc(bl->bl_root, 0, 0, count,
					      bl->bl_radix, bl->bl_skip);
		if (blk != SWAPBLK_NONE)
			bl->bl_free -= count;
	}
	return(blk);
}

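/*
 * blist_allocat() -	reserve space in the block bitmap, but only at or
 *			after the specified block.  Otherwise identical to
 *			blist_alloc().
 */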
swblk_t
blist_allocat(blist_t bl, swblk_t count, swblk_t blkat)
{
	swblk_t blk = SWAPBLK_NONE;

	if (bl) {
		if (bl->bl_radix == BLIST_BMAP_RADIX)
			blk = blst_leaf_alloc(bl->bl_root, blkat, 0, count);
		else
			blk = blst_meta_alloc(bl->bl_root, blkat, 0, count,
					      bl->bl_radix, bl->bl_skip);
		if (blk != SWAPBLK_NONE)
			bl->bl_free -= count;
	}
	return(blk);
}

/*
 * blist_free() -	free up space in the block bitmap.  Panic if an
 *			inconsistency is found.
 */

void
blist_free(blist_t bl, swblk_t blkno, swblk_t count)
{
	if (bl) {
		if (bl->bl_radix == BLIST_BMAP_RADIX)
			blst_leaf_free(bl->bl_root, blkno, count);
		else
			blst_meta_free(bl->bl_root, blkno, count,
				       bl->bl_radix, bl->bl_skip, 0);
		bl->bl_free += count;
	}
}

/*
 * blist_fill() -	mark a region in the block bitmap as off-limits
 *			to the allocator (i.e. allocate it), ignoring any
 *			existing allocations.  Return the number of blocks
 *			actually filled that were free before the call.
 */

swblk_t
blist_fill(blist_t bl, swblk_t blkno, swblk_t count)
{
	swblk_t filled;

	if (bl) {
		if (bl->bl_radix == BLIST_BMAP_RADIX) {
			filled = blst_leaf_fill(bl->bl_root, blkno, count);
		} else {
			filled = blst_meta_fill(bl->bl_root, blkno, count,
			    bl->bl_radix, bl->bl_skip, 0);
		}
		bl->bl_free -= filled;
		return (filled);
	} else {
		return 0;
	}
}
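
/*
 * e.g. (illustrative): after blist_free(bl, 0, 32) and a subsequent
 * blist_alloc(bl, 8) returning 0, blist_fill(bl, 0, 16) returns 8 --
 * blocks 0-7 were already allocated and blocks 8-15 were still free.
 */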

/*
 * blist_resize() -	resize an existing radix tree to handle the
 *			specified number of blocks.  This will reallocate
 *			the tree and transfer the previous bitmap to the new
 *			one.  When extending the tree you can specify whether
 *			the new blocks are to be left allocated or freed.
 */

void
blist_resize(blist_t *pbl, swblk_t count, int freenew)
{
	blist_t newbl = blist_create(count);
	blist_t save = *pbl;

	*pbl = newbl;
	if (count > save->bl_blocks)
		count = save->bl_blocks;
	blst_copy(save->bl_root, 0, save->bl_radix, save->bl_skip,
		  newbl, count);

	/*
	 * If resizing upwards, should we free the new space or not?
	 */
	if (freenew && count < newbl->bl_blocks) {
		blist_free(newbl, count, newbl->bl_blocks - count);
	}
	blist_destroy(save);
}

#ifdef BLIST_DEBUG

/*
 * blist_print()    - dump radix tree
 */

void
blist_print(blist_t bl)
{
	kprintf("BLIST {\n");
	blst_radix_print(bl->bl_root, 0, bl->bl_radix, bl->bl_skip, 4);
	kprintf("}\n");
}

#endif

/************************************************************************
 *			  ALLOCATION SUPPORT FUNCTIONS			*
 ************************************************************************
 *
 *	These support functions do all the actual work.  They may seem
 *	rather longish, but that's because I've commented them up.  The
 *	actual code is straightforward.
 *
 */

/*
 * blst_leaf_alloc() -	allocate at a leaf in the radix tree (a bitmap).
 *
 *	This is the core of the allocator and is optimized for the 1 block
 *	and the BLIST_BMAP_RADIX block allocation cases.  Other cases are
 *	somewhat slower.  The 1 block allocation case is log2 and extremely
 *	quick.
 */

static swblk_t
blst_leaf_alloc(blmeta_t *scan, swblk_t blkat __unused, swblk_t blk,
		swblk_t count)
{
	u_swblk_t orig = scan->u.bmu_bitmap;

	if (orig == 0) {
		/*
		 * Optimize bitmap all-allocated case.  Also, count = 1
		 * case assumes at least 1 bit is free in the bitmap, so
		 * we have to take care of this case here.
		 */
		scan->bm_bighint = 0;
		return(SWAPBLK_NONE);
	}
	if (count == 1) {
		/*
		 * Optimized code to allocate one bit out of the bitmap
		 * (a binary search that homes in on the lowest free bit).
		 */
		u_swblk_t mask;
		int j = BLIST_BMAP_RADIX/2;
		int r = 0;

		mask = (u_swblk_t)-1 >> (BLIST_BMAP_RADIX/2);

		while (j) {
			if ((orig & mask) == 0) {
				r += j;
				orig >>= j;
			}
			j >>= 1;
			mask >>= j;
		}
		scan->u.bmu_bitmap &= ~((u_swblk_t)1 << r);
		return(blk + r);
	}
	if (count <= BLIST_BMAP_RADIX) {
		/*
		 * Non-optimized code to allocate N bits out of the bitmap.
		 * The more bits, the faster the code runs.  It will run
		 * the slowest allocating 2 bits, but since there aren't any
		 * memory ops in the core loop (or shouldn't be, anyway),
		 * you probably won't notice the difference.
		 */
		int j;
		int n = (int)(BLIST_BMAP_RADIX - count);
		u_swblk_t mask;

		mask = (u_swblk_t)-1 >> n;

		for (j = 0; j <= n; ++j) {
			if ((orig & mask) == mask) {
				scan->u.bmu_bitmap &= ~mask;
				return(blk + j);
			}
			mask = (mask << 1);
		}
	}

	/*
	 * We couldn't allocate count in this subtree, update bighint.
	 */
	scan->bm_bighint = count - 1;

	return(SWAPBLK_NONE);
}
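
/*
 * A short trace of the count == 1 binary search above (illustrative):
 * with orig = 0x00000100 (only block 8 free) and BLIST_BMAP_RADIX = 32,
 *
 *	j = 16  mask = 0x0000FFFF  orig & mask != 0	(keep low half)
 *	j =  8  mask = 0x000000FF  orig & mask == 0	r = 8, orig >>= 8
 *	j =  4  mask = 0x0000000F  orig & mask != 0
 *	j =  2  mask = 0x00000003  orig & mask != 0
 *	j =  1  mask = 0x00000001  orig & mask != 0
 *
 * leaving r = 8, so blk + 8 is returned and bit 8 is cleared.
 */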

/*
 * blst_meta_alloc() -	allocate at a meta node in the radix tree.
 *
 *	Attempt to allocate at a meta node.  If we can't, we update
 *	bighint and return a failure.  Updating bighint optimizes future
 *	calls that hit this node.  We have to check for our collapse cases
 *	and we have a few optimizations strewn in as well.
 */
static swblk_t
blst_meta_alloc(blmeta_t *scan, swblk_t blkat,
		swblk_t blk, swblk_t count,
		int64_t radix, swblk_t skip)
{
	int hintok = (blk >= blkat);
	swblk_t next_skip = ((u_int)skip / BLIST_META_RADIX);
	swblk_t i;

	/*
	 * ALL-ALLOCATED special case
	 */
	if (scan->u.bmu_avail == 0) {
		scan->bm_bighint = 0;
		return(SWAPBLK_NONE);
	}

	/*
	 * ALL-FREE special case, initialize uninitialized
	 * sublevel.
	 *
	 * NOTE: radix may exceed 32 bits until first division.
	 */
	if (scan->u.bmu_avail == radix) {
		scan->bm_bighint = radix;

		radix /= BLIST_META_RADIX;
		for (i = 1; i <= skip; i += next_skip) {
			if (scan[i].bm_bighint == (swblk_t)-1)
				break;
			if (next_skip == 1) {
				scan[i].u.bmu_bitmap = (u_swblk_t)-1;
				scan[i].bm_bighint = BLIST_BMAP_RADIX;
			} else {
				scan[i].bm_bighint = (swblk_t)radix;
				scan[i].u.bmu_avail = (swblk_t)radix;
			}
		}
	} else {
		radix /= BLIST_META_RADIX;
	}

	for (i = 1; i <= skip; i += next_skip) {
		if (count <= scan[i].bm_bighint &&
		    blk + (swblk_t)radix > blkat) {
			/*
			 * count fits in object
			 */
			swblk_t r;
			if (next_skip == 1) {
				r = blst_leaf_alloc(&scan[i], blkat,
						    blk, count);
			} else {
				r = blst_meta_alloc(&scan[i], blkat,
						    blk, count,
						    radix, next_skip - 1);
			}
			if (r != SWAPBLK_NONE) {
				scan->u.bmu_avail -= count;
				if (scan->bm_bighint > scan->u.bmu_avail)
					scan->bm_bighint = scan->u.bmu_avail;
				return(r);
			}
			/* bighint was updated by recursion */
		} else if (scan[i].bm_bighint == (swblk_t)-1) {
			/*
			 * Terminator
			 */
			break;
		} else if (count > (swblk_t)radix) {
			/*
			 * count does not fit in object even if it were
			 * completely free.
			 */
			panic("blst_meta_alloc: allocation too large");
		}
		blk += (swblk_t)radix;
	}

	/*
	 * We couldn't allocate count in this subtree, update bighint.
	 */
	if (hintok && scan->bm_bighint >= count)
		scan->bm_bighint = count - 1;
	return(SWAPBLK_NONE);
}

/*
 * BLST_LEAF_FREE() -	free allocated block from leaf bitmap
 */
static void
blst_leaf_free(blmeta_t *scan, swblk_t blk, swblk_t count)
{
	/*
	 * free some data in this bitmap
	 *
	 * e.g.
	 *	0000111111111110000
	 *          \_________/\__/
	 *		v        n
	 */
	int n = blk & (BLIST_BMAP_RADIX - 1);
	u_swblk_t mask;

	mask = ((u_swblk_t)-1 << n) &
	    ((u_swblk_t)-1 >> (BLIST_BMAP_RADIX - count - n));

	if (scan->u.bmu_bitmap & mask)
		panic("blst_leaf_free: freeing free block");
	scan->u.bmu_bitmap |= mask;

	/*
	 * We could probably do a better job here.  We are required to make
	 * bighint at least as large as the biggest contiguous block of
	 * data.  If we just shoehorn it, a little extra overhead will
	 * be incurred on the next allocation (but only that one typically).
	 */
	scan->bm_bighint = BLIST_BMAP_RADIX;
}
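
/*
 * Example of the mask construction above (illustrative): freeing
 * count = 3 blocks at blk = 4 within a 32 bit leaf gives n = 4 and
 *
 *	mask = ((u_swblk_t)-1 << 4) & ((u_swblk_t)-1 >> 25)
 *	     = 0xFFFFFFF0 & 0x0000007F
 *	     = 0x00000070		(bits 4, 5 and 6)
 */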

/*
 * BLST_META_FREE() - free allocated blocks from radix tree meta info
 *
 *	This support routine frees a range of blocks from the bitmap.
 *	The range must be entirely enclosed by this radix node.  If a
 *	meta node, we break the range down recursively to free blocks
 *	in subnodes (which means that this code can free an arbitrary
 *	range whereas the allocation code cannot allocate an arbitrary
 *	range).
 */

static void
blst_meta_free(blmeta_t *scan, swblk_t freeBlk, swblk_t count,
	       int64_t radix, swblk_t skip, swblk_t blk)
{
	swblk_t i;
	swblk_t next_skip = ((u_int)skip / BLIST_META_RADIX);

#if 0
	kprintf("FREE (%x,%d) FROM (%x,%lld)\n",
	    freeBlk, count,
	    blk, (long long)radix
	);
#endif

	/*
	 * ALL-ALLOCATED special case, initialize for recursion.
	 *
	 * We will short-cut the ALL-ALLOCATED -> ALL-FREE case.
	 */
	if (scan->u.bmu_avail == 0) {
		scan->u.bmu_avail = count;
		scan->bm_bighint = count;

		if (count != radix) {
			for (i = 1; i <= skip; i += next_skip) {
				if (scan[i].bm_bighint == (swblk_t)-1)
					break;
				scan[i].bm_bighint = 0;
				if (next_skip == 1) {
					scan[i].u.bmu_bitmap = 0;
				} else {
					scan[i].u.bmu_avail = 0;
				}
			}
			/* fall through */
		}
	} else {
		scan->u.bmu_avail += count;
		/* scan->bm_bighint = radix; */
	}

	/*
	 * ALL-FREE special case.
	 *
	 * Set bighint for higher levels to snoop.
	 */
	if (scan->u.bmu_avail == radix) {
		scan->bm_bighint = radix;
		return;
	}

	/*
	 * Break the free down into its components
	 */
	if (scan->u.bmu_avail > radix) {
		panic("blst_meta_free: freeing already "
		      "free blocks (%ld) %ld/%lld",
		      (long)count, (long)scan->u.bmu_avail,
		      (long long)radix);
	}

	radix /= BLIST_META_RADIX;

	i = (freeBlk - blk) / (swblk_t)radix;
	blk += i * (swblk_t)radix;
	i = i * next_skip + 1;

	while (i <= skip && blk < freeBlk + count) {
		swblk_t v;

		v = blk + (swblk_t)radix - freeBlk;
		if (v > count)
			v = count;

		if (scan->bm_bighint == (swblk_t)-1)
			panic("blst_meta_free: freeing unexpected range");

		if (next_skip == 1) {
			blst_leaf_free(&scan[i], freeBlk, v);
		} else {
			blst_meta_free(&scan[i], freeBlk, v,
				       radix, next_skip - 1, blk);
		}

		/*
		 * After having dealt with the becomes-all-free case any
		 * partial free will not be able to bring us to the
		 * becomes-all-free state.
		 *
		 * We can raise bighint to at least the sub-segment's
		 * bighint.
		 */
		if (scan->bm_bighint < scan[i].bm_bighint) {
			scan->bm_bighint = scan[i].bm_bighint;
		}
		count -= v;
		freeBlk += v;
		blk += (swblk_t)radix;
		i += next_skip;
	}
}

/*
 * BLST_LEAF_FILL() -	allocate specific blocks in leaf bitmap
 *
 *	Allocates all blocks in the specified range regardless of
 *	any existing allocations in that range.  Returns the number
 *	of blocks allocated by the call.
 */
static swblk_t
blst_leaf_fill(blmeta_t *scan, swblk_t blk, swblk_t count)
{
	int n = blk & (BLIST_BMAP_RADIX - 1);
	swblk_t nblks;
	u_swblk_t mask, bitmap;

	mask = ((u_swblk_t)-1 << n) &
	    ((u_swblk_t)-1 >> (BLIST_BMAP_RADIX - count - n));

	/*
	 * Count the number of blocks we're about to allocate.  Each
	 * iteration of the loop clears the lowest set bit, so the loop
	 * runs once per free block under the mask.
	 */
	bitmap = scan->u.bmu_bitmap & mask;
	for (nblks = 0; bitmap != 0; nblks++)
		bitmap &= bitmap - 1;

	scan->u.bmu_bitmap &= ~mask;
	return (nblks);
}
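
/*
 * e.g. (illustrative): filling 4 blocks at offset 2 of a leaf whose
 * bitmap is 0x0000003F (blocks 0-5 free) uses mask = 0x0000003C, so
 * bitmap & mask = 0x0000003C and nblks = 4 (all four were still free),
 * leaving u.bmu_bitmap = 0x00000003.
 */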

/*
 * BLST_META_FILL() -	allocate specific blocks at a meta node
 *
 *	Allocates the specified range of blocks, regardless of
 *	any existing allocations in the range.  The range must
 *	be within the extent of this node.  Returns the number
 *	of blocks allocated by the call.
 */
static swblk_t
blst_meta_fill(blmeta_t *scan, swblk_t fillBlk, swblk_t count,
	       int64_t radix, swblk_t skip, swblk_t blk)
{
	swblk_t i;
	swblk_t next_skip = ((u_int)skip / BLIST_META_RADIX);
	swblk_t nblks = 0;

	if (count == radix || scan->u.bmu_avail == 0) {
		/*
		 * ALL-ALLOCATED special case: the node is already fully
		 * allocated, or the fill covers the entire node.
		 */
		nblks = scan->u.bmu_avail;
		scan->u.bmu_avail = 0;
		scan->bm_bighint = count;
		return (nblks);
	}

	if (scan->u.bmu_avail == radix) {
		radix /= BLIST_META_RADIX;

		/*
		 * ALL-FREE special case, initialize sublevel
		 */
		for (i = 1; i <= skip; i += next_skip) {
			if (scan[i].bm_bighint == (swblk_t)-1)
				break;
			if (next_skip == 1) {
				scan[i].u.bmu_bitmap = (u_swblk_t)-1;
				scan[i].bm_bighint = BLIST_BMAP_RADIX;
			} else {
				scan[i].bm_bighint = (swblk_t)radix;
				scan[i].u.bmu_avail = (swblk_t)radix;
			}
		}
	} else {
		radix /= BLIST_META_RADIX;
	}

	if (count > (swblk_t)radix)
		panic("blst_meta_fill: allocation too large");

	i = (fillBlk - blk) / (swblk_t)radix;
	blk += i * (swblk_t)radix;
	i = i * next_skip + 1;

	while (i <= skip && blk < fillBlk + count) {
		swblk_t v;

		v = blk + (swblk_t)radix - fillBlk;
		if (v > count)
			v = count;

		if (scan->bm_bighint == (swblk_t)-1)
			panic("blst_meta_fill: filling unexpected range");

		if (next_skip == 1) {
			nblks += blst_leaf_fill(&scan[i], fillBlk, v);
		} else {
			nblks += blst_meta_fill(&scan[i], fillBlk, v,
			    radix, next_skip - 1, blk);
		}
		count -= v;
		fillBlk += v;
		blk += (swblk_t)radix;
		i += next_skip;
	}
	scan->u.bmu_avail -= nblks;
	return (nblks);
}

/*
 * BLST_COPY() - copy one radix tree to another
 *
 *	Locates free space in the source tree and frees it in the destination
 *	tree.  The space may not already be free in the destination.
 */

static void
blst_copy(blmeta_t *scan, swblk_t blk, int64_t radix,
	  swblk_t skip, blist_t dest, swblk_t count)
{
	swblk_t next_skip;
	swblk_t i;

	/*
	 * Leaf node
	 */

	if (radix == BLIST_BMAP_RADIX) {
		u_swblk_t v = scan->u.bmu_bitmap;

		if (v == (u_swblk_t)-1) {
			blist_free(dest, blk, count);
		} else if (v != 0) {
			swblk_t j;

			for (j = 0; j < BLIST_BMAP_RADIX && j < count; ++j) {
				if (v & ((u_swblk_t)1 << j))
					blist_free(dest, blk + j, 1);
			}
		}
		return;
	}

	/*
	 * Meta node
	 */

	if (scan->u.bmu_avail == 0) {
		/*
		 * Source all allocated, leave dest allocated
		 */
		return;
	}
	if (scan->u.bmu_avail == radix) {
		/*
		 * Source all free, free entire dest
		 */
		if (count < radix)
			blist_free(dest, blk, count);
		else
			blist_free(dest, blk, (swblk_t)radix);
		return;
	}

	radix /= BLIST_META_RADIX;
	next_skip = ((u_swblk_t)skip / BLIST_META_RADIX);

	for (i = 1; count && i <= skip; i += next_skip) {
		if (scan[i].bm_bighint == (swblk_t)-1)
			break;

		if (count >= (swblk_t)radix) {
			blst_copy(
			    &scan[i],
			    blk,
			    radix,
			    next_skip - 1,
			    dest,
			    (swblk_t)radix
			);
			count -= (swblk_t)radix;
		} else {
			if (count) {
				blst_copy(
				    &scan[i],
				    blk,
				    radix,
				    next_skip - 1,
				    dest,
				    count
				);
			}
			count = 0;
		}
		blk += (swblk_t)radix;
	}
}

/*
 * BLST_RADIX_INIT() - initialize radix tree
 *
 *	Initialize our meta structures and bitmaps and calculate the exact
 *	amount of space required to manage 'count' blocks - this space may
 *	be considerably less than the calculated radix due to the large
 *	RADIX values we use.
 */

static swblk_t
blst_radix_init(blmeta_t *scan, int64_t radix, swblk_t skip, swblk_t count)
{
	swblk_t i;
	swblk_t next_skip;
	swblk_t memindex = 0;

	/*
	 * Leaf node
	 */

	if (radix == BLIST_BMAP_RADIX) {
		if (scan) {
			scan->bm_bighint = 0;
			scan->u.bmu_bitmap = 0;
		}
		return(memindex);
	}

	/*
	 * Meta node.  If allocating the entire object we can special
	 * case it.  However, we need to figure out how much memory
	 * is required to manage 'count' blocks, so we continue on anyway.
	 */

	if (scan) {
		scan->bm_bighint = 0;
		scan->u.bmu_avail = 0;
	}

	radix /= BLIST_META_RADIX;
	next_skip = ((u_swblk_t)skip / BLIST_META_RADIX);

	for (i = 1; i <= skip; i += next_skip) {
		if (count >= (swblk_t)radix) {
			/*
			 * Allocate the entire object
			 */
			memindex = i + blst_radix_init(
			    ((scan) ? &scan[i] : NULL),
			    radix,
			    next_skip - 1,
			    (swblk_t)radix
			);
			count -= (swblk_t)radix;
		} else if (count > 0) {
			/*
			 * Allocate a partial object
			 */
			memindex = i + blst_radix_init(
			    ((scan) ? &scan[i] : NULL),
			    radix,
			    next_skip - 1,
			    count
			);
			count = 0;
		} else {
			/*
			 * Add terminator and break out
			 */
			if (scan)
				scan[i].bm_bighint = (swblk_t)-1;
			break;
		}
	}
	if (memindex < i)
		memindex = i;
	return(memindex);
}
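
/*
 * Sizing example (illustrative, with the typical radix constants): for
 * blist_create(2048) the root node has radix 8192 and skip 272, but only
 * four of its sixteen subtrees (4 * 512 = 2048 blocks) are needed.  The
 * scan places a terminator in the fifth subtree slot (index 69) and
 * returns 69, so blist_create() allocates 70 blmeta_t records instead
 * of the 273 a fully populated tree would require.
 */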

#ifdef BLIST_DEBUG

static void
blst_radix_print(blmeta_t *scan, swblk_t blk, int64_t radix, swblk_t skip,
		 int tab)
{
	swblk_t i;
	swblk_t next_skip;

	if (radix == BLIST_BMAP_RADIX) {
		kprintf(
		    "%*.*s(%04x,%lld): bitmap %08x big=%d\n",
		    tab, tab, "",
		    blk, (long long)radix,
		    scan->u.bmu_bitmap,
		    scan->bm_bighint
		);
		return;
	}

	if (scan->u.bmu_avail == 0) {
		kprintf(
		    "%*.*s(%04x,%lld) ALL ALLOCATED\n",
		    tab, tab, "",
		    blk,
		    (long long)radix
		);
		return;
	}
	if (scan->u.bmu_avail == radix) {
		kprintf(
		    "%*.*s(%04x,%lld) ALL FREE\n",
		    tab, tab, "",
		    blk,
		    (long long)radix
		);
		return;
	}

	kprintf(
	    "%*.*s(%04x,%lld): subtree (%d/%lld) big=%d {\n",
	    tab, tab, "",
	    blk, (long long)radix,
	    scan->u.bmu_avail,
	    (long long)radix,
	    scan->bm_bighint
	);

	radix /= BLIST_META_RADIX;
	next_skip = ((u_swblk_t)skip / BLIST_META_RADIX);
	tab += 4;

	for (i = 1; i <= skip; i += next_skip) {
		if (scan[i].bm_bighint == (swblk_t)-1) {
			kprintf(
			    "%*.*s(%04x,%lld): Terminator\n",
			    tab, tab, "",
			    blk, (long long)radix
			);
			break;
		}
		blst_radix_print(
		    &scan[i],
		    blk,
		    radix,
		    next_skip - 1,
		    tab
		);
		blk += (swblk_t)radix;
	}
	tab -= 4;

	kprintf(
	    "%*.*s}\n",
	    tab, tab, ""
	);
}

#endif

#ifdef BLIST_DEBUG

int
main(int ac, char **av)
{
	swblk_t size = 1024;
	swblk_t i;
	blist_t bl;

	for (i = 1; i < ac; ++i) {
		const char *ptr = av[i];
		if (*ptr != '-') {
			size = strtol(ptr, NULL, 0);
			continue;
		}
		fprintf(stderr, "Bad option: %s\n", ptr);
		exit(1);
	}
	bl = blist_create(size);
	blist_free(bl, 0, size);

	for (;;) {
		char buf[1024];
		swblk_t da = 0;
		swblk_t count = 0;
		swblk_t blkat;

		kprintf("%d/%d/%lld> ",
			bl->bl_free, size, (long long)bl->bl_radix);
		fflush(stdout);
		if (fgets(buf, sizeof(buf), stdin) == NULL)
			break;
		switch(buf[0]) {
		case 'r':
			if (sscanf(buf + 1, "%d", &count) == 1) {
				blist_resize(&bl, count, 1);
				size = count;
			} else {
				kprintf("?\n");
			}
			/* fall through to print the resized tree */
		case 'p':
			blist_print(bl);
			break;
		case 'a':
			if (sscanf(buf + 1, "%d %d", &count, &blkat) == 1) {
				swblk_t blk = blist_alloc(bl, count);
				kprintf("    R=%04x\n", blk);
			} else if (sscanf(buf + 1, "%d %d",
					  &count, &blkat) == 2) {
				swblk_t blk = blist_allocat(bl, count, blkat);
				kprintf("    R=%04x\n", blk);
			} else {
				kprintf("?\n");
			}
			break;
		case 'f':
			if (sscanf(buf + 1, "%x %d", &da, &count) == 2) {
				blist_free(bl, da, count);
			} else {
				kprintf("?\n");
			}
			break;
		case 'l':
			if (sscanf(buf + 1, "%x %d", &da, &count) == 2) {
				kprintf("    n=%d\n",
					blist_fill(bl, da, count));
			} else {
				kprintf("?\n");
			}
			break;
		case '?':
		case 'h':
			puts(
			    "p          -print\n"
			    "a %d       -allocate\n"
			    "f %x %d    -free\n"
			    "l %x %d    -fill\n"
			    "r %d       -resize\n"
			    "h/?        -help"
			);
			break;
		default:
			kprintf("?\n");
			break;
		}
	}
	return(0);
}

void
panic(const char *ctl, ...)
{
	__va_list va;

	__va_start(va, ctl);
	vfprintf(stderr, ctl, va);
	fprintf(stderr, "\n");
	__va_end(va);
	exit(1);
}

#endif