/* $OpenBSD: subr_blist.c,v 1.4 2023/05/30 08:30:01 jsg Exp $ */
/* DragonFlyBSD:7b80531f545c7d3c51c1660130c71d01f6bccbe0:/sys/kern/subr_blist.c */
/*
 * BLIST.C -	Bitmap allocator/deallocator, using a radix tree with hinting
 *
 * Copyright (c) 1998,2004 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *
 *	This module implements a general bitmap allocator/deallocator.  The
 *	allocator eats around 2 bits per 'block'.  The module does not
 *	try to interpret the meaning of a 'block' other than to return
 *	SWAPBLK_NONE on an allocation failure.
 *
 *	A radix tree is used to maintain the bitmap.  Two radix constants are
 *	involved:  One for the bitmaps contained in the leaf nodes (typically
 *	32), and one for the meta nodes (typically 16).  Both meta and leaf
 *	nodes have a hint field.  This field gives us a hint as to the largest
 *	free contiguous range of blocks under the node.  It may contain a
 *	value that is too high, but will never contain a value that is too
 *	low.  When the radix tree is searched, allocation failures in subtrees
 *	update the hint.
 *
 *	The radix tree also implements two collapsed states for meta nodes:
 *	the ALL-ALLOCATED state and the ALL-FREE state.  If a meta node is
 *	in either of these two states, all information contained underneath
 *	the node is considered stale.  These states are used to optimize
 *	allocation and freeing operations.
 *
 * 	The hinting greatly increases code efficiency for allocations while
 *	the general radix structure optimizes both allocations and frees.  The
 *	radix tree should be able to operate well no matter how much
 *	fragmentation there is and no matter how large a bitmap is used.
 *
 *	Unlike the rlist code, the blist code wires all necessary memory at
 *	creation time.  Neither allocations nor frees require interaction with
 *	the memory subsystem.  In contrast, the rlist code may allocate memory
 *	on an rlist_free() call.  The non-blocking features of the blist code
 *	are used to great advantage in the swap code (uvm/uvm_swap.c).  The
 *	rlist code uses a little less overall memory than the blist code (but
 *	due to swap interleaving not all that much less), but the blist code
 *	scales much, much better.
 *
 *	LAYOUT: The radix tree is laid out recursively using a
 *	linear array.  Each meta node is immediately followed (laid out
 *	sequentially in memory) by BLIST_META_RADIX lower level nodes.  This
 *	is a recursive structure but one that can be easily scanned through
 *	a very simple 'skip' calculation.  In order to support large radixes,
 *	portions of the tree may reside outside our memory allocation.  We
 *	handle this with an early-termination optimization (when bighint is
 *	set to -1) on the scan.  The memory allocation is only large enough
 *	to cover the number of blocks requested at creation time, even if it
 *	must be encompassed by a larger root-node radix.
 *
 *	NOTE: The allocator cannot currently allocate more than
 *	BLIST_BMAP_RADIX blocks per call.  It will panic with 'allocation too
 *	large' if you try.  This is an area that could use improvement.  The
 *	radix is large enough that this restriction does not affect the swap
 *	system, though.  Currently only the allocation code is affected by
 *	this algorithmic unfeature.  The freeing code can handle arbitrary
 *	ranges.
 *
 *	NOTE: The radix may exceed BLIST_BMAP_RADIX bits in order to support
 *	      up to 2^(BLIST_BMAP_RADIX-1) blocks.  The first division will
 *	      drop the radix down and fit it within a signed BLIST_BMAP_RADIX
 *	      bit integer.
 *
 *	This code can be compiled stand-alone for debugging.
 */
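
/*
 * Illustrative sketch of the linear layout described above, using a
 * hypothetical BLIST_META_RADIX of 4 for brevity (the real constant is
 * typically 16): a meta node with skip 20 has next_skip = 20 / 4 = 5,
 * so its four children sit at array indices 1, 6, 11 and 16, each child
 * owning the (next_skip - 1) slots that follow it.
 *
 *	index:  0  [1  2 3 4 5] [6  7 8 9 10] [11 ...] [16 ...]
 *	        |   \__ child meta node, followed by its 4 leaves
 *	        \__ meta node, skip = 20 (array spans skip + 1 slots)
 */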

#ifdef _KERNEL

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/blist.h>
#include <sys/malloc.h>

#else

#ifndef BLIST_NO_DEBUG
#define BLIST_DEBUG
#endif

#include <sys/types.h>
#include <assert.h>
#include <err.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <stdarg.h>
#include <limits.h>

#define malloc(s,t,f)	calloc(1, s)
#define mallocarray(n,s,t,f)	reallocarray(NULL, n, s)
#define free(p,t,s)	free(p)
#define KASSERT(exp)	assert(exp)
#define KDASSERT(exp)	assert(exp)

#include "../sys/blist.h"

#define panic(...)	do { errx(1, __VA_ARGS__); } while (0)

#endif

/*
 * static support functions
 */

static swblk_t blst_leaf_alloc(blmeta_t *scan, swblk_t blkat,
				swblk_t blk, swblk_t count);
static swblk_t blst_meta_alloc(blmeta_t *scan, swblk_t blkat,
				swblk_t blk, swblk_t count,
				swblk_t radix, swblk_t skip);
static void blst_leaf_free(blmeta_t *scan, swblk_t relblk, swblk_t count);
static void blst_meta_free(blmeta_t *scan, swblk_t freeBlk, swblk_t count,
					swblk_t radix, swblk_t skip,
					swblk_t blk);
static swblk_t blst_leaf_fill(blmeta_t *scan, swblk_t blk, swblk_t count);
static swblk_t blst_meta_fill(blmeta_t *scan, swblk_t fillBlk, swblk_t count,
					swblk_t radix, swblk_t skip,
					swblk_t blk);
static void blst_copy(blmeta_t *scan, swblk_t blk, swblk_t radix,
				swblk_t skip, blist_t dest, swblk_t count);
static swblk_t	blst_radix_init(blmeta_t *scan, swblk_t radix,
						swblk_t skip, swblk_t count);
static int blst_radix_gapfind(blmeta_t *scan, swblk_t blk, swblk_t radix, swblk_t skip,
    int state, swblk_t *maxbp, swblk_t *maxep, swblk_t *bp, swblk_t *ep);

#if defined(BLIST_DEBUG) || defined(DDB)
static void	blst_radix_print(blmeta_t *scan, swblk_t blk,
					swblk_t radix, swblk_t skip, int tab);
#endif

/*
 * blist_create() - create a blist capable of handling up to the specified
 *		    number of blocks
 *
 *	blocks must be greater than 0
 *
 *	The smallest blist consists of a single leaf node capable of
 *	managing BLIST_BMAP_RADIX blocks.
 *
 *	The pages are addressable in range [0, nblocks[
 */

blist_t
blist_create(swblk_t blocks)
{
	blist_t bl;
	swblk_t radix;
	swblk_t skip = 0;

	KASSERT(blocks > 0);

	/*
	 * Calculate radix and skip field used for scanning.
	 *
	 * Radix can exceed BLIST_BMAP_RADIX bits even if swblk_t is limited
	 * to BLIST_BMAP_RADIX bits.
	 *
	 * XXX check overflow
	 */
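	/*
	 * Worked example (illustrative, assuming BLIST_BMAP_RADIX == 32
	 * and BLIST_META_RADIX == 16): for blocks == 1000 the loop below
	 * runs twice, taking radix 32 -> 512 -> 8192 and skip 0 -> 16 ->
	 * 272, so the root is a meta node of radix 8192 whose array would
	 * span skip + 1 == 273 slots if fully populated; terminators cut
	 * the actual allocation down to what 1000 blocks require.
	 */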
	radix = BLIST_BMAP_RADIX;

	while (radix < blocks) {
		radix *= BLIST_META_RADIX;
		skip = (skip + 1) * BLIST_META_RADIX;
		KASSERT(skip > 0);
	}

	bl = malloc(sizeof(struct blist), M_VMSWAP, M_WAITOK | M_ZERO);

	bl->bl_blocks = blocks;
	bl->bl_radix = radix;
	bl->bl_skip = skip;
	bl->bl_rootblks = 1 +
	    blst_radix_init(NULL, bl->bl_radix, bl->bl_skip, blocks);
	bl->bl_root = mallocarray(bl->bl_rootblks, sizeof(blmeta_t),
			      M_VMSWAP, M_WAITOK);

#if defined(BLIST_DEBUG)
	printf(
		"BLIST representing %lu blocks (%lu MB of swap)"
		", requiring %6.2fM of ram\n",
		bl->bl_blocks,
		bl->bl_blocks * 4 / 1024,
		(bl->bl_rootblks * sizeof(blmeta_t) + 1023) / (1024.0 * 1024.0)
	);
	printf("BLIST raw radix tree: %lu records, top-radix %lu\n",
		bl->bl_rootblks, bl->bl_radix);
#endif
	blst_radix_init(bl->bl_root, bl->bl_radix, bl->bl_skip, blocks);

	return(bl);
}
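
/*
 * Example usage (an illustrative sketch, not compiled in): a freshly
 * created blist starts with every block marked allocated, so callers
 * free the managed range before the first allocation, exactly as the
 * stand-alone main() at the bottom of this file does.
 */
#if 0
	blist_t bl;
	swblk_t blk;

	bl = blist_create(1024);	/* manage blocks [0, 1024[ */
	blist_free(bl, 0, 1024);	/* everything starts allocated */

	blk = blist_alloc(bl, 16);	/* reserve 16 contiguous blocks */
	if (blk != SWAPBLK_NONE)
		blist_free(bl, blk, 16);	/* and give them back */

	blist_destroy(bl);
#endif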

void
blist_destroy(blist_t bl)
{
	KASSERT(bl != NULL);

	free(bl->bl_root, M_VMSWAP, sizeof(blmeta_t) * bl->bl_rootblks);
	free(bl, M_VMSWAP, sizeof(struct blist));
}

/*
 * blist_alloc() - reserve space in the block bitmap.  Return the base
 *		     of a contiguous region or SWAPBLK_NONE if space could
 *		     not be allocated.
 */

swblk_t
blist_alloc(blist_t bl, swblk_t count)
{
	swblk_t blk = SWAPBLK_NONE;

	if (bl) {
		if (bl->bl_radix == BLIST_BMAP_RADIX)
			blk = blst_leaf_alloc(bl->bl_root, 0, 0, count);
		else
			blk = blst_meta_alloc(bl->bl_root, 0, 0, count,
					      bl->bl_radix, bl->bl_skip);
		if (blk != SWAPBLK_NONE) {
			bl->bl_free -= count;

			KDASSERT(blk < bl->bl_blocks);
			KDASSERT(bl->bl_free <= bl->bl_blocks);
		}
	}
	return(blk);
}

swblk_t
blist_allocat(blist_t bl, swblk_t count, swblk_t blkat)
{
	swblk_t blk = SWAPBLK_NONE;

	if (bl) {
		KDASSERT(blkat < bl->bl_blocks);
		KDASSERT(blkat + count <= bl->bl_blocks);

		if (bl->bl_radix == BLIST_BMAP_RADIX)
			blk = blst_leaf_alloc(bl->bl_root, blkat, 0, count);
		else
			blk = blst_meta_alloc(bl->bl_root, blkat, 0, count,
					      bl->bl_radix, bl->bl_skip);
		if (blk != SWAPBLK_NONE) {
			bl->bl_free -= count;

			KDASSERT(blk < bl->bl_blocks);
			KDASSERT(bl->bl_free <= bl->bl_blocks);
		}
	}
	return(blk);
}

/*
 * blist_free() -	free up space in the block bitmap.  The range must
 *			have been previously allocated.  Panic if an
 *			inconsistency is found.
 */

void
blist_free(blist_t bl, swblk_t blkno, swblk_t count)
{
	if (bl) {
		KDASSERT(blkno < bl->bl_blocks);
		KDASSERT(blkno + count <= bl->bl_blocks);

		if (bl->bl_radix == BLIST_BMAP_RADIX)
			blst_leaf_free(bl->bl_root, blkno, count);
		else
			blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix, bl->bl_skip, 0);
		bl->bl_free += count;

		KDASSERT(bl->bl_free <= bl->bl_blocks);
	}
}

/*
 * blist_fill() -	mark a region in the block bitmap as off-limits
 *			to the allocator (i.e. allocate it), ignoring any
 *			existing allocations.  Return the number of blocks
 *			actually filled that were free before the call.
 */

swblk_t
blist_fill(blist_t bl, swblk_t blkno, swblk_t count)
{
	swblk_t filled;

	if (bl) {
		KDASSERT(blkno < bl->bl_blocks);
		KDASSERT(blkno + count <= bl->bl_blocks);

		if (bl->bl_radix == BLIST_BMAP_RADIX) {
			filled = blst_leaf_fill(bl->bl_root, blkno, count);
		} else {
			filled = blst_meta_fill(bl->bl_root, blkno, count,
			    bl->bl_radix, bl->bl_skip, 0);
		}
		bl->bl_free -= filled;
		KDASSERT(bl->bl_free <= bl->bl_blocks);
		return (filled);
	} else {
		return 0;
	}
}

/*
 * blist_resize() -	resize an existing radix tree to handle the
 *			specified number of blocks.  This will reallocate
 *			the tree and transfer the previous bitmap to the new
 *			one.  When extending the tree you can specify whether
 *			the new blocks are to be left allocated or freed.
 */

void
blist_resize(blist_t *pbl, swblk_t count, int freenew)
{
	blist_t newbl = blist_create(count);
	blist_t save = *pbl;

	*pbl = newbl;
	if (count > save->bl_blocks)
		count = save->bl_blocks;
	blst_copy(save->bl_root, 0, save->bl_radix, save->bl_skip, newbl, count);

	/*
	 * If resizing upwards, should we free the new space or not?
	 */
	if (freenew && count < newbl->bl_blocks) {
		blist_free(newbl, count, newbl->bl_blocks - count);
	}
	blist_destroy(save);
}

#define	GAPFIND_FIRSTFREE 0
#define	GAPFIND_FIRSTUSED 1

/*
 * blist_gapfind() -	return the largest gap (free pages) in the blist.
 *			The blist isn't modified.  The returned range is
 *			[maxbp, maxep[ .  The size of the gap is
 *			maxep - maxbp.  If no gap is found, the size is 0.
 */

void
blist_gapfind(blist_t bl, swblk_t *maxbp, swblk_t *maxep)
{
	int state;
	swblk_t b, e;

	/* initialize gaps (max and current) */
	*maxbp = *maxep = 0;
	b = e = 0;

	/* search for the largest gap, starting from block 0 */
	state = blst_radix_gapfind(bl->bl_root, 0, bl->bl_radix, bl->bl_skip,
	    GAPFIND_FIRSTFREE, maxbp, maxep, &b, &e);

	if (state == GAPFIND_FIRSTUSED) {
		e = bl->bl_blocks;
		if (*maxep - *maxbp < e - b) {
			*maxbp = b;
			*maxep = e;
		}
	}

	KDASSERT(*maxbp <= *maxep);
	KDASSERT(*maxbp <  bl->bl_blocks);
	KDASSERT(*maxep <= bl->bl_blocks);
}

/*
 * blst_radix_gapfind -	search for the largest gap in one pass
 *
 * - search first free block, from X -> set B
 * - search first used block, from B -> set E
 * - if the size (E - B) is larger than max, update it
 * - loop (with X=E) until end of blist
 * - max is the largest free gap
 */
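/*
 * Illustrative run (hypothetical numbers): with blocks 0-31 in use,
 * 32-95 free and 96-127 in use, the scan flips FIRSTFREE -> FIRSTUSED
 * at block 32 (B=32) and back at block 96 (E=96), recording the gap
 * [32, 96[ of size 64.  A leaf counts as free only when its bitmap is
 * completely free, so gaps are tracked at leaf granularity.
 */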
static int
blst_radix_gapfind(blmeta_t *scan, swblk_t blk, swblk_t radix, swblk_t skip,
    int state, swblk_t *maxbp, swblk_t *maxep, swblk_t *bp, swblk_t *ep)
{
	swblk_t i;
	swblk_t next_skip;

	if (radix == BLIST_BMAP_RADIX) {
		/* leaf node: we consider only completely free bitmaps as free */
		if (state == GAPFIND_FIRSTFREE) {
			if (scan->u.bmu_bitmap == (u_swblk_t)-1) {
				/* node is fully free */
				*bp = blk;
				return GAPFIND_FIRSTUSED;
			}

			/* it isn't fully free, not found, keep state */
			return state;

		} else if (state == GAPFIND_FIRSTUSED) {
			if (scan->u.bmu_bitmap == (u_swblk_t)-1) {
				/* it is free, not found, keep state */
				return state;
			}

			/* it is (at least partially) used */
			*ep = blk;
			if (*maxep - *maxbp < *ep - *bp) {
				*maxbp = *bp;
				*maxep = *ep;
			}
			return GAPFIND_FIRSTFREE;
		}
	}

	if (scan->u.bmu_avail == 0) {
		/* ALL-ALLOCATED */
		if (state == GAPFIND_FIRSTFREE) {
			/* searching free block, not found, keep state */
			return state;

		} else if (state == GAPFIND_FIRSTUSED) {
			/* searching used block, found */
			*ep = blk;
			if (*maxep - *maxbp < *ep - *bp) {
				*maxbp = *bp;
				*maxep = *ep;
			}
			return GAPFIND_FIRSTFREE;
		}
	}

	if (scan->u.bmu_avail == radix) {
		/* ALL-FREE */
		if (state == GAPFIND_FIRSTFREE) {
			/* searching free block, found */
			*bp = blk;
			return GAPFIND_FIRSTUSED;

		} else if (state == GAPFIND_FIRSTUSED) {
			/* searching used block, not found, keep state */
			return state;
		}
	}

	radix /= BLIST_META_RADIX;
	next_skip = (skip / BLIST_META_RADIX);

	for (i = 1; i <= skip; i += next_skip) {
		if (scan[i].bm_bighint == (swblk_t)-1)
			/* Terminator */
			break;

		state = blst_radix_gapfind(&scan[i], blk, radix, next_skip - 1,
		    state, maxbp, maxep, bp, ep);

		blk += radix;
	}

	return state;
}

#if defined(BLIST_DEBUG) || defined(DDB)

/*
 * blist_print()    - dump radix tree
 */

void
blist_print(blist_t bl)
{
	printf("BLIST {\n");
	blst_radix_print(bl->bl_root, 0, bl->bl_radix, bl->bl_skip, 4);
	printf("}\n");
}

#endif

/************************************************************************
 *			  ALLOCATION SUPPORT FUNCTIONS			*
 ************************************************************************
 *
 *	These support functions do all the actual work.  They may seem
 *	rather longish, but that's because I've commented them up.  The
 *	actual code is straightforward.
 *
 */

/*
 * blst_leaf_alloc() -	allocate at a leaf in the radix tree (a bitmap).
 *
 *	This is the core of the allocator and is optimized for the 1 block
 *	and the BLIST_BMAP_RADIX block allocation cases.  Other cases are
 *	somewhat slower.  The 1 block allocation case is log2 and extremely
 *	quick.
 */

static swblk_t
blst_leaf_alloc(blmeta_t *scan, swblk_t blkat __unused, swblk_t blk,
		swblk_t count)
{
	u_swblk_t orig = scan->u.bmu_bitmap;

	if (orig == 0) {
		/*
		 * Optimize bitmap all-allocated case.  Also, count = 1
		 * case assumes at least 1 bit is free in the bitmap, so
		 * we have to take care of this case here.
		 */
		scan->bm_bighint = 0;
		return(SWAPBLK_NONE);
	}
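	/*
	 * Worked example of the find-lowest-free-bit probe below
	 * (illustrative, assuming BLIST_BMAP_RADIX == 32): for
	 * orig == 0x00010000 the first test finds no free bit among the
	 * low 16 bits, so r becomes 16 and orig is shifted right; every
	 * subsequent halving does find a free bit in its low half and
	 * skips nothing, leaving r == 16, the index of the lowest free
	 * bit.
	 */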
	if (count == 1) {
		/*
		 * Optimized code to allocate one bit out of the bitmap
		 */
		u_swblk_t mask;
		int j = BLIST_BMAP_RADIX/2;
		int r = 0;

		mask = (u_swblk_t)-1 >> (BLIST_BMAP_RADIX/2);

		while (j) {
			if ((orig & mask) == 0) {
				r += j;
				orig >>= j;
			}
			j >>= 1;
			mask >>= j;
		}
		scan->u.bmu_bitmap &= ~((u_swblk_t)1 << r);
		return(blk + r);
	}
	if (count <= BLIST_BMAP_RADIX) {
		/*
		 * non-optimized code to allocate N bits out of the bitmap.
		 * The more bits, the faster the code runs.  It will run
		 * the slowest allocating 2 bits, but since there aren't any
		 * memory ops in the core loop (or shouldn't be, anyway),
		 * you probably won't notice the difference.
		 */
		int j;
		int n = (int)(BLIST_BMAP_RADIX - count);
		u_swblk_t mask;

		mask = (u_swblk_t)-1 >> n;

		for (j = 0; j <= n; ++j) {
			if ((orig & mask) == mask) {
				scan->u.bmu_bitmap &= ~mask;
				return(blk + j);
			}
			mask = (mask << 1);
		}
	}

	/*
	 * We couldn't allocate count in this subtree, update bighint.
	 */
	scan->bm_bighint = count - 1;

	return(SWAPBLK_NONE);
}

/*
 * blst_meta_alloc() -	allocate at a meta node in the radix tree.
 *
 *	Attempt to allocate at a meta node.  If we can't, we update
 *	bighint and return a failure.  Updating bighint optimizes future
 *	calls that hit this node.  We have to check for our collapse cases
 *	and we have a few optimizations strewn in as well.
 */
static swblk_t
blst_meta_alloc(blmeta_t *scan, swblk_t blkat,
		swblk_t blk, swblk_t count,
		swblk_t radix, swblk_t skip)
{
	int hintok = (blk >= blkat);
	swblk_t next_skip = ((swblk_t)skip / BLIST_META_RADIX);
	swblk_t i;

#ifndef _KERNEL
	printf("blist_meta_alloc blkat %lu blk %lu count %lu radix %lu\n",
		blkat, blk, count, radix);
#endif

	/*
	 * ALL-ALLOCATED special case
	 */
	if (scan->u.bmu_avail == 0)  {
		scan->bm_bighint = 0;
		return(SWAPBLK_NONE);
	}

	/*
	 * ALL-FREE special case, initialize uninitialized
	 * sublevel.
	 *
	 * NOTE: radix may exceed 32 bits until first division.
	 */
	if (scan->u.bmu_avail == radix) {
		scan->bm_bighint = radix;

		radix /= BLIST_META_RADIX;
		for (i = 1; i <= skip; i += next_skip) {
			if (scan[i].bm_bighint == (swblk_t)-1)
				break;
			if (next_skip == 1) {
				scan[i].u.bmu_bitmap = (u_swblk_t)-1;
				scan[i].bm_bighint = BLIST_BMAP_RADIX;
			} else {
				scan[i].bm_bighint = (swblk_t)radix;
				scan[i].u.bmu_avail = (swblk_t)radix;
			}
		}
	} else {
		radix /= BLIST_META_RADIX;
	}

	for (i = 1; i <= skip; i += next_skip) {
		if (scan[i].bm_bighint == (swblk_t)-1) {
			/*
			 * Terminator
			 *
			 * note: check it first, as swblk_t may be unsigned.
			 *   otherwise, the second if() might match and the
			 *   Terminator will be ignored.
			 */
			break;
		}

		if (count <= scan[i].bm_bighint &&
		    blk + (swblk_t)radix > blkat) {
			/*
			 * count fits in object
			 */
			swblk_t r;
			if (next_skip == 1) {
				r = blst_leaf_alloc(&scan[i], blkat,
						    blk, count);
			} else {
				r = blst_meta_alloc(&scan[i], blkat,
						    blk, count,
						    radix, next_skip - 1);
			}
			if (r != SWAPBLK_NONE) {
				scan->u.bmu_avail -= count;
				if (scan->bm_bighint > scan->u.bmu_avail)
					scan->bm_bighint = scan->u.bmu_avail;
				return(r);
			}
			/* bighint was updated by recursion */
		} else if (count > (swblk_t)radix) {
			/*
			 * count does not fit in object even if it were
			 * completely free.
			 */
			panic("%s: allocation too large %lu/%lu",
			      __func__, count, radix);
		}
		blk += (swblk_t)radix;
	}

	/*
	 * We couldn't allocate count in this subtree, update bighint.
	 */
	if (hintok && scan->bm_bighint >= count)
		scan->bm_bighint = count - 1;
	return(SWAPBLK_NONE);
}

/*
 * BLST_LEAF_FREE() -	free allocated block from leaf bitmap
 */
static void
blst_leaf_free(blmeta_t *scan, swblk_t blk, swblk_t count)
{
	/*
	 * free some data in this bitmap
	 *
	 * e.g.
	 *	0000111111111110000
	 *          \_________/\__/
	 *		v        n
	 */
	int n = blk & (BLIST_BMAP_RADIX - 1);
	u_swblk_t mask;

	mask = ((u_swblk_t)-1 << n) &
	    ((u_swblk_t)-1 >> (BLIST_BMAP_RADIX - count - n));

	if (scan->u.bmu_bitmap & mask)
		panic("%s: freeing free block", __func__);
	scan->u.bmu_bitmap |= mask;

	/*
	 * We could probably do a better job here.  We are required to make
	 * bighint at least as large as the biggest contiguous block of
	 * data.  If we just shoehorn it, a little extra overhead will
	 * be incurred on the next allocation (but only that one typically).
	 */
	scan->bm_bighint = BLIST_BMAP_RADIX;
}

/*
 * BLST_META_FREE() - free allocated blocks from radix tree meta info
 *
 *	This support routine frees a range of blocks from the bitmap.
 *	The range must be entirely enclosed by this radix node.  If a
 *	meta node, we break the range down recursively to free blocks
 *	in subnodes (which means that this code can free an arbitrary
 *	range whereas the allocation code cannot allocate an arbitrary
 *	range).
 */

static void
blst_meta_free(blmeta_t *scan, swblk_t freeBlk, swblk_t count,
	       swblk_t radix, swblk_t skip, swblk_t blk)
{
	swblk_t i;
	swblk_t next_skip = ((swblk_t)skip / BLIST_META_RADIX);

#if 0
	printf("FREE (%04lx,%lu) FROM (%04lx,%lu)\n",
	    freeBlk, count,
	    blk, radix
	);
#endif

	/*
	 * ALL-ALLOCATED special case, initialize for recursion.
	 *
	 * We will short-cut the ALL-ALLOCATED -> ALL-FREE case.
	 */
	if (scan->u.bmu_avail == 0) {
		scan->u.bmu_avail = count;
		scan->bm_bighint = count;

		if (count != radix)  {
			for (i = 1; i <= skip; i += next_skip) {
				if (scan[i].bm_bighint == (swblk_t)-1)
					break;
				scan[i].bm_bighint = 0;
				if (next_skip == 1) {
					scan[i].u.bmu_bitmap = 0;
				} else {
					scan[i].u.bmu_avail = 0;
				}
			}
			/* fall through */
		}
	} else {
		scan->u.bmu_avail += count;
		/* scan->bm_bighint = radix; */
	}

	/*
	 * ALL-FREE special case.
	 *
	 * Set bighint for higher levels to snoop.
	 */
	if (scan->u.bmu_avail == radix) {
		scan->bm_bighint = radix;
		return;
	}

	/*
	 * Break the free down into its components
	 */
	if (scan->u.bmu_avail > radix) {
		panic("%s: freeing already "
		      "free blocks (%lu) %lu/%lu",
		      __func__, count, (long)scan->u.bmu_avail, radix);
	}

	radix /= BLIST_META_RADIX;

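	/*
	 * Jump straight to the first child covering freeBlk: each child
	 * spans 'radix' blocks and occupies 'next_skip' array slots, so
	 * child number (freeBlk - blk) / radix lives at array index
	 * childno * next_skip + 1 (slot 0 is this meta node itself).
	 */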
	i = (freeBlk - blk) / (swblk_t)radix;
	blk += i * (swblk_t)radix;
	i = i * next_skip + 1;

	while (i <= skip && blk < freeBlk + count) {
		swblk_t v;

		v = blk + (swblk_t)radix - freeBlk;
		if (v > count)
			v = count;

		if (scan->bm_bighint == (swblk_t)-1)
			panic("%s: freeing unexpected range", __func__);

		if (next_skip == 1) {
			blst_leaf_free(&scan[i], freeBlk, v);
		} else {
			blst_meta_free(&scan[i], freeBlk, v,
				       radix, next_skip - 1, blk);
		}

		/*
		 * After having dealt with the becomes-all-free case any
		 * partial free will not be able to bring us to the
		 * becomes-all-free state.
		 *
		 * We can raise bighint to at least the sub-segment's
		 * bighint.
		 */
		if (scan->bm_bighint < scan[i].bm_bighint) {
			scan->bm_bighint = scan[i].bm_bighint;
		}
		count -= v;
		freeBlk += v;
		blk += (swblk_t)radix;
		i += next_skip;
	}
}

/*
 * BLST_LEAF_FILL() -	allocate specific blocks in leaf bitmap
 *
 *	Allocates all blocks in the specified range regardless of
 *	any existing allocations in that range.  Returns the number
 *	of blocks allocated by the call.
 */
static swblk_t
blst_leaf_fill(blmeta_t *scan, swblk_t blk, swblk_t count)
{
	int n = blk & (BLIST_BMAP_RADIX - 1);
	swblk_t nblks;
	u_swblk_t mask, bitmap;

	mask = ((u_swblk_t)-1 << n) &
	    ((u_swblk_t)-1 >> (BLIST_BMAP_RADIX - count - n));

	/*
	 * Count the number of blocks we're about to allocate: each loop
	 * iteration clears the lowest set bit, so the loop runs once per
	 * free (set) bit in the range.
	 */
	bitmap = scan->u.bmu_bitmap & mask;
	for (nblks = 0; bitmap != 0; nblks++)
		bitmap &= bitmap - 1;

	scan->u.bmu_bitmap &= ~mask;
	return (nblks);
}

/*
 * BLST_META_FILL() -	allocate specific blocks at a meta node
 *
 *	Allocates the specified range of blocks, regardless of
 *	any existing allocations in the range.  The range must
 *	be within the extent of this node.  Returns the number
 *	of blocks allocated by the call.
 */
static swblk_t
blst_meta_fill(blmeta_t *scan, swblk_t fillBlk, swblk_t count,
	       swblk_t radix, swblk_t skip, swblk_t blk)
{
	swblk_t i;
	swblk_t next_skip = ((swblk_t)skip / BLIST_META_RADIX);
	swblk_t nblks = 0;

	if (count == radix || scan->u.bmu_avail == 0) {
		/*
		 * ALL-ALLOCATED special case
		 */
		nblks = scan->u.bmu_avail;
		scan->u.bmu_avail = 0;
		scan->bm_bighint = count;
		return (nblks);
	}

	if (scan->u.bmu_avail == radix) {
		radix /= BLIST_META_RADIX;

		/*
		 * ALL-FREE special case, initialize sublevel
		 */
		for (i = 1; i <= skip; i += next_skip) {
			if (scan[i].bm_bighint == (swblk_t)-1)
				break;
			if (next_skip == 1) {
				scan[i].u.bmu_bitmap = (u_swblk_t)-1;
				scan[i].bm_bighint = BLIST_BMAP_RADIX;
			} else {
				scan[i].bm_bighint = (swblk_t)radix;
				scan[i].u.bmu_avail = (swblk_t)radix;
			}
		}
	} else {
		radix /= BLIST_META_RADIX;
	}

	if (count > (swblk_t)radix)
		panic("%s: allocation too large", __func__);

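	/*
	 * As in blst_meta_free(), skip directly to the child whose range
	 * contains fillBlk rather than scanning from the first child.
	 */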
	i = (fillBlk - blk) / (swblk_t)radix;
	blk += i * (swblk_t)radix;
	i = i * next_skip + 1;

	while (i <= skip && blk < fillBlk + count) {
		swblk_t v;

		v = blk + (swblk_t)radix - fillBlk;
		if (v > count)
			v = count;

		if (scan->bm_bighint == (swblk_t)-1)
			panic("%s: filling unexpected range", __func__);

		if (next_skip == 1) {
			nblks += blst_leaf_fill(&scan[i], fillBlk, v);
		} else {
			nblks += blst_meta_fill(&scan[i], fillBlk, v,
			    radix, next_skip - 1, blk);
		}
		count -= v;
		fillBlk += v;
		blk += (swblk_t)radix;
		i += next_skip;
	}
	scan->u.bmu_avail -= nblks;
	return (nblks);
}

/*
 * BLST_COPY() - copy one radix tree to another
 *
 *	Locates free space in the source tree and frees it in the destination
 *	tree.  The space may not already be free in the destination.
 */

static void
blst_copy(blmeta_t *scan, swblk_t blk, swblk_t radix,
	  swblk_t skip, blist_t dest, swblk_t count)
{
	swblk_t next_skip;
	swblk_t i;

	/*
	 * Leaf node
	 */

	if (radix == BLIST_BMAP_RADIX) {
		u_swblk_t v = scan->u.bmu_bitmap;

		if (v == (u_swblk_t)-1) {
			blist_free(dest, blk, count);
		} else if (v != 0) {
			for (i = 0; i < BLIST_BMAP_RADIX && i < count; ++i) {
				if (v & ((u_swblk_t)1 << i))
					blist_free(dest, blk + i, 1);
			}
		}
		return;
	}

	/*
	 * Meta node
	 */

	if (scan->u.bmu_avail == 0) {
		/*
		 * Source all allocated, leave dest allocated
		 */
		return;
	}
	if (scan->u.bmu_avail == radix) {
		/*
		 * Source all free, free entire dest
		 */
		if (count < radix)
			blist_free(dest, blk, count);
		else
			blist_free(dest, blk, (swblk_t)radix);
		return;
	}


	radix /= BLIST_META_RADIX;
	next_skip = ((u_swblk_t)skip / BLIST_META_RADIX);

	for (i = 1; count && i <= skip; i += next_skip) {
		if (scan[i].bm_bighint == (swblk_t)-1)
			break;

		if (count >= (swblk_t)radix) {
			blst_copy(
			    &scan[i],
			    blk,
			    radix,
			    next_skip - 1,
			    dest,
			    (swblk_t)radix
			);
			count -= (swblk_t)radix;
		} else {
			if (count) {
				blst_copy(
				    &scan[i],
				    blk,
				    radix,
				    next_skip - 1,
				    dest,
				    count
				);
			}
			count = 0;
		}
		blk += (swblk_t)radix;
	}
}

/*
 * BLST_RADIX_INIT() - initialize radix tree
 *
 *	Initialize our meta structures and bitmaps and calculate the exact
 *	amount of space required to manage 'count' blocks - this space may
 *	be considerably less than the calculated radix due to the large
 *	RADIX values we use.
 */

static swblk_t
blst_radix_init(blmeta_t *scan, swblk_t radix, swblk_t skip, swblk_t count)
{
	swblk_t i;
	swblk_t next_skip;
	swblk_t memindex = 0;

	/*
	 * Leaf node
	 */

	if (radix == BLIST_BMAP_RADIX) {
		if (scan) {
			scan->bm_bighint = 0;
			scan->u.bmu_bitmap = 0;
		}
		return(memindex);
	}

	/*
	 * Meta node.  If allocating the entire object we can special
	 * case it.  However, we need to figure out how much memory
	 * is required to manage 'count' blocks, so we continue on anyway.
	 */

	if (scan) {
		scan->bm_bighint = 0;
		scan->u.bmu_avail = 0;
	}

	radix /= BLIST_META_RADIX;
	next_skip = ((u_swblk_t)skip / BLIST_META_RADIX);

	for (i = 1; i <= skip; i += next_skip) {
		if (count >= (swblk_t)radix) {
			/*
			 * Allocate the entire object
			 */
			memindex = i + blst_radix_init(
			    ((scan) ? &scan[i] : NULL),
			    radix,
			    next_skip - 1,
			    (swblk_t)radix
			);
			count -= (swblk_t)radix;
		} else if (count > 0) {
			/*
			 * Allocate a partial object
			 */
			memindex = i + blst_radix_init(
			    ((scan) ? &scan[i] : NULL),
			    radix,
			    next_skip - 1,
			    count
			);
			count = 0;
		} else {
			/*
			 * Add terminator and break out
			 */
			if (scan)
				scan[i].bm_bighint = (swblk_t)-1;
			break;
		}
	}
	if (memindex < i)
		memindex = i;
	return(memindex);
}

#if defined(BLIST_DEBUG) || defined(DDB)

static void
blst_radix_print(blmeta_t *scan, swblk_t blk, swblk_t radix, swblk_t skip, int tab)
{
	swblk_t i;
	swblk_t next_skip;

	if (radix == BLIST_BMAP_RADIX) {
		printf(
		    "%*.*s(%04lx,%lu): bitmap %0*llx big=%lu\n",
		    tab, tab, "",
		    blk, radix,
		    (int)(1 + (BLIST_BMAP_RADIX - 1) / 4),
		    scan->u.bmu_bitmap,
		    scan->bm_bighint
		);
		return;
	}

	if (scan->u.bmu_avail == 0) {
		printf(
		    "%*.*s(%04lx,%lu) ALL ALLOCATED\n",
		    tab, tab, "",
		    blk,
		    radix
		);
		return;
	}
	if (scan->u.bmu_avail == radix) {
		printf(
		    "%*.*s(%04lx,%lu) ALL FREE\n",
		    tab, tab, "",
		    blk,
		    radix
		);
		return;
	}

	printf(
	    "%*.*s(%04lx,%lu): subtree (%lu/%lu) big=%lu {\n",
	    tab, tab, "",
	    blk, radix,
	    scan->u.bmu_avail,
	    radix,
	    scan->bm_bighint
	);

	radix /= BLIST_META_RADIX;
	next_skip = ((u_swblk_t)skip / BLIST_META_RADIX);
	tab += 4;

	for (i = 1; i <= skip; i += next_skip) {
		if (scan[i].bm_bighint == (swblk_t)-1) {
			printf(
			    "%*.*s(%04lx,%lu): Terminator\n",
			    tab, tab, "",
			    blk, radix
			);
			break;
		}
		blst_radix_print(
		    &scan[i],
		    blk,
		    radix,
		    next_skip - 1,
		    tab
		);
		blk += (swblk_t)radix;
	}
	tab -= 4;

	printf(
	    "%*.*s}\n",
	    tab, tab, ""
	);
}

#endif

#if !defined(_KERNEL) && defined(BLIST_DEBUG)

int
main(int ac, char **av)
{
	swblk_t size = 1024;
	swblk_t i;
	blist_t bl;

	for (i = 1; i < (swblk_t)ac; ++i) {
		const char *ptr = av[i];
		if (*ptr != '-') {
			size = strtol(ptr, NULL, 0);
			continue;
		}
		ptr += 2;
		fprintf(stderr, "Bad option: %s\n", ptr - 2);
		exit(1);
	}
	bl = blist_create(size);
	blist_free(bl, 0, size);

	for (;;) {
		char buf[1024];
		swblk_t da = 0;
		swblk_t count = 0;
		swblk_t blkat;

		printf("%lu/%lu/%lu> ",
			bl->bl_free, size, bl->bl_radix);
		fflush(stdout);
		if (fgets(buf, sizeof(buf), stdin) == NULL)
			break;
		switch(buf[0]) {
		case '#':
			continue;
		case 'r':
			if (sscanf(buf + 1, "%li", &count) == 1) {
				blist_resize(&bl, count, 1);
				size = count;
			} else {
				printf("?\n");
			}
			/* FALLTHROUGH */
		case 'p':
			blist_print(bl);
			break;
		case 'a':
			if (sscanf(buf + 1, "%li %li", &count, &blkat) == 1) {
				printf("count %lu\n", count);
				swblk_t blk = blist_alloc(bl, count);
				if (blk == SWAPBLK_NONE)
					printf("    R=SWAPBLK_NONE\n");
				else
					printf("    R=%04lx\n", blk);
			} else if (sscanf(buf + 1, "%li %li", &count, &blkat) == 2) {
				swblk_t blk = blist_allocat(bl, count, blkat);
				if (blk == SWAPBLK_NONE)
					printf("    R=SWAPBLK_NONE\n");
				else
					printf("    R=%04lx\n", blk);
			} else {
				printf("?\n");
			}
			break;
		case 'f':
			if (sscanf(buf + 1, "%li %li", &da, &count) == 2) {
				blist_free(bl, da, count);
			} else {
				printf("?\n");
			}
			break;
		case 'g': {
				swblk_t b, e;
				blist_gapfind(bl, &b, &e);
				printf("gapfind: begin=%04lx end=%04lx size=%lu\n",
				    b, e, e-b);
				break;
			}
		case 'l':
			if (sscanf(buf + 1, "%li %li", &da, &count) == 2) {
				printf("    n=%lu\n",
				    blist_fill(bl, da, count));
			} else {
				printf("?\n");
			}
			break;
		case '?':
		case 'h':
			puts(
			    "p          -print\n"
			    "a %li      -allocate\n"
			    "f %li %li  -free\n"
			    "l %li %li  -fill\n"
			    "g          -gapfind\n"
			    "r %li      -resize\n"
			    "h/?        -help\n"
			    "    hex may be specified with 0x prefix\n"
			);
			break;
		default:
			printf("?\n");
			break;
		}
	}
	return(0);
}

#endif