1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1998 Matthew Dillon. All Rights Reserved. 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 3. Neither the name of the University nor the names of its contributors 14 * may be used to endorse or promote products derived from this software 15 * without specific prior written permission. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS 18 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 19 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY 21 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE 23 * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 25 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 26 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 */ 29 /* 30 * BLIST.C - Bitmap allocator/deallocator, using a radix tree with hinting 31 * 32 * This module implements a general bitmap allocator/deallocator. The 33 * allocator eats around 2 bits per 'block'. The module does not 34 * try to interpret the meaning of a 'block' other than to return 35 * SWAPBLK_NONE on an allocation failure. 36 * 37 * A radix tree controls access to pieces of the bitmap, and includes 38 * auxiliary information at each interior node about the availabilty of 39 * contiguous free blocks in the subtree rooted at that node. Two radix 40 * constants are involved: one for the size of the bitmaps contained in the 41 * leaf nodes (BLIST_BMAP_RADIX), and one for the number of descendents of 42 * each of the meta (interior) nodes (BLIST_META_RADIX). Each subtree is 43 * associated with a range of blocks. The root of any subtree stores a 44 * hint field that defines an upper bound on the size of the largest 45 * allocation that can begin in the associated block range. A hint is an 46 * upper bound on a potential allocation, but not necessarily a tight upper 47 * bound. 48 * 49 * The bitmap field in each node directs the search for available blocks. 50 * For a leaf node, a bit is set if the corresponding block is free. For a 51 * meta node, a bit is set if the corresponding subtree contains a free 52 * block somewhere within it. The search at a meta node considers only 53 * children of that node that represent a range that includes a free block. 54 * 55 * The hinting greatly increases code efficiency for allocations while 56 * the general radix structure optimizes both allocations and frees. The 57 * radix tree should be able to operate well no matter how much 58 * fragmentation there is and no matter how large a bitmap is used. 59 * 60 * The blist code wires all necessary memory at creation time. Neither 61 * allocations nor frees require interaction with the memory subsystem. 62 * The non-blocking nature of allocations and frees is required by swap 63 * code (vm/swap_pager.c). 64 * 65 * LAYOUT: The radix tree is laid out recursively using a linear array. 66 * Each meta node is immediately followed (laid out sequentially in 67 * memory) by BLIST_META_RADIX lower level nodes. This is a recursive 68 * structure but one that can be easily scanned through a very simple 69 * 'skip' calculation. The memory allocation is only large enough to 70 * cover the number of blocks requested at creation time. Nodes that 71 * represent blocks beyond that limit, nodes that would never be read 72 * or written, are not allocated, so that the last of the 73 * BLIST_META_RADIX lower level nodes of a some nodes may not be 74 * allocated. 75 * 76 * NOTE: the allocator cannot currently allocate more than 77 * BLIST_BMAP_RADIX blocks per call. It will panic with 'allocation too 78 * large' if you try. This is an area that could use improvement. The 79 * radix is large enough that this restriction does not effect the swap 80 * system, though. Currently only the allocation code is affected by 81 * this algorithmic unfeature. The freeing code can handle arbitrary 82 * ranges. 83 * 84 * This code can be compiled stand-alone for debugging. 85 */ 86 87 #include <sys/cdefs.h> 88 __FBSDID("$FreeBSD$"); 89 90 #ifdef _KERNEL 91 92 #include <sys/param.h> 93 #include <sys/systm.h> 94 #include <sys/lock.h> 95 #include <sys/kernel.h> 96 #include <sys/blist.h> 97 #include <sys/malloc.h> 98 #include <sys/sbuf.h> 99 #include <sys/proc.h> 100 #include <sys/mutex.h> 101 102 #else 103 104 #ifndef BLIST_NO_DEBUG 105 #define BLIST_DEBUG 106 #endif 107 108 #include <sys/errno.h> 109 #include <sys/types.h> 110 #include <sys/malloc.h> 111 #include <sys/sbuf.h> 112 #include <assert.h> 113 #include <stdio.h> 114 #include <string.h> 115 #include <stddef.h> 116 #include <stdlib.h> 117 #include <stdarg.h> 118 #include <stdbool.h> 119 120 #define bitcount64(x) __bitcount64((uint64_t)(x)) 121 #define malloc(a,b,c) calloc(a, 1) 122 #define free(a,b) free(a) 123 #define ummin(a,b) ((a) < (b) ? (a) : (b)) 124 #define KASSERT(a,b) assert(a) 125 126 #include <sys/blist.h> 127 128 #endif 129 130 /* 131 * static support functions 132 */ 133 static daddr_t blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count); 134 static daddr_t blst_meta_alloc(blmeta_t *scan, daddr_t cursor, daddr_t count, 135 u_daddr_t radix); 136 static void blst_leaf_free(blmeta_t *scan, daddr_t relblk, int count); 137 static void blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, 138 u_daddr_t radix); 139 static void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, 140 blist_t dest, daddr_t count); 141 static daddr_t blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count); 142 static daddr_t blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count, 143 u_daddr_t radix); 144 #ifndef _KERNEL 145 static void blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, 146 int tab); 147 #endif 148 149 #ifdef _KERNEL 150 static MALLOC_DEFINE(M_SWAP, "SWAP", "Swap space"); 151 #endif 152 153 _Static_assert(BLIST_BMAP_RADIX % BLIST_META_RADIX == 0, 154 "radix divisibility error"); 155 #define BLIST_BMAP_MASK (BLIST_BMAP_RADIX - 1) 156 #define BLIST_META_MASK (BLIST_META_RADIX - 1) 157 158 /* 159 * For a subtree that can represent the state of up to 'radix' blocks, the 160 * number of leaf nodes of the subtree is L=radix/BLIST_BMAP_RADIX. If 'm' 161 * is short for BLIST_META_RADIX, then for a tree of height h with L=m**h 162 * leaf nodes, the total number of tree nodes is 1 + m + m**2 + ... + m**h, 163 * or, equivalently, (m**(h+1)-1)/(m-1). This quantity is called 'skip' 164 * in the 'meta' functions that process subtrees. Since integer division 165 * discards remainders, we can express this computation as 166 * skip = (m * m**h) / (m - 1) 167 * skip = (m * (radix / BLIST_BMAP_RADIX)) / (m - 1) 168 * and since m divides BLIST_BMAP_RADIX, we can simplify further to 169 * skip = (radix / (BLIST_BMAP_RADIX / m)) / (m - 1) 170 * skip = radix / ((BLIST_BMAP_RADIX / m) * (m - 1)) 171 * so that simple integer division by a constant can safely be used for the 172 * calculation. 173 */ 174 static inline daddr_t 175 radix_to_skip(daddr_t radix) 176 { 177 178 return (radix / 179 ((BLIST_BMAP_RADIX / BLIST_META_RADIX) * BLIST_META_MASK)); 180 } 181 182 /* 183 * Provide a mask with count bits set, starting as position n. 184 */ 185 static inline u_daddr_t 186 bitrange(int n, int count) 187 { 188 189 return (((u_daddr_t)-1 << n) & 190 ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - (n + count)))); 191 } 192 193 194 /* 195 * Use binary search, or a faster method, to find the 1 bit in a u_daddr_t. 196 * Assumes that the argument has only one bit set. 197 */ 198 static inline int 199 bitpos(u_daddr_t mask) 200 { 201 int hi, lo, mid; 202 203 switch (sizeof(mask)) { 204 #ifdef HAVE_INLINE_FFSLL 205 case sizeof(long long): 206 return (ffsll(mask) - 1); 207 #endif 208 default: 209 lo = 0; 210 hi = BLIST_BMAP_RADIX; 211 while (lo + 1 < hi) { 212 mid = (lo + hi) >> 1; 213 if ((mask >> mid) != 0) 214 lo = mid; 215 else 216 hi = mid; 217 } 218 return (lo); 219 } 220 } 221 222 /* 223 * blist_create() - create a blist capable of handling up to the specified 224 * number of blocks 225 * 226 * blocks - must be greater than 0 227 * flags - malloc flags 228 * 229 * The smallest blist consists of a single leaf node capable of 230 * managing BLIST_BMAP_RADIX blocks. 231 */ 232 blist_t 233 blist_create(daddr_t blocks, int flags) 234 { 235 blist_t bl; 236 u_daddr_t nodes, radix; 237 238 KASSERT(blocks > 0, ("invalid block count")); 239 240 /* 241 * Calculate the radix and node count used for scanning. 242 */ 243 nodes = 1; 244 radix = BLIST_BMAP_RADIX; 245 while (radix <= blocks) { 246 nodes += 1 + (blocks - 1) / radix; 247 radix *= BLIST_META_RADIX; 248 } 249 250 bl = malloc(offsetof(struct blist, bl_root[nodes]), M_SWAP, flags | 251 M_ZERO); 252 if (bl == NULL) 253 return (NULL); 254 255 bl->bl_blocks = blocks; 256 bl->bl_radix = radix; 257 258 #if defined(BLIST_DEBUG) 259 printf( 260 "BLIST representing %lld blocks (%lld MB of swap)" 261 ", requiring %lldK of ram\n", 262 (long long)bl->bl_blocks, 263 (long long)bl->bl_blocks * 4 / 1024, 264 (long long)(nodes * sizeof(blmeta_t) + 1023) / 1024 265 ); 266 printf("BLIST raw radix tree contains %lld records\n", 267 (long long)nodes); 268 #endif 269 270 return (bl); 271 } 272 273 void 274 blist_destroy(blist_t bl) 275 { 276 277 free(bl, M_SWAP); 278 } 279 280 /* 281 * blist_alloc() - reserve space in the block bitmap. Return the base 282 * of a contiguous region or SWAPBLK_NONE if space could 283 * not be allocated. 284 */ 285 daddr_t 286 blist_alloc(blist_t bl, daddr_t count) 287 { 288 daddr_t blk, cursor; 289 290 KASSERT(count <= BLIST_MAX_ALLOC, 291 ("allocation too large: %d", (int)count)); 292 293 /* 294 * This loop iterates at most twice. An allocation failure in the 295 * first iteration leads to a second iteration only if the cursor was 296 * non-zero. When the cursor is zero, an allocation failure will 297 * stop further iterations. 298 */ 299 cursor = bl->bl_cursor; 300 for (;;) { 301 blk = blst_meta_alloc(bl->bl_root, cursor, count, 302 bl->bl_radix); 303 if (blk != SWAPBLK_NONE) { 304 bl->bl_avail -= count; 305 bl->bl_cursor = blk + count; 306 if (bl->bl_cursor == bl->bl_blocks) 307 bl->bl_cursor = 0; 308 return (blk); 309 } else if (cursor == 0) 310 return (SWAPBLK_NONE); 311 cursor = 0; 312 } 313 } 314 315 /* 316 * blist_avail() - return the number of free blocks. 317 */ 318 daddr_t 319 blist_avail(blist_t bl) 320 { 321 322 return (bl->bl_avail); 323 } 324 325 /* 326 * blist_free() - free up space in the block bitmap. Return the base 327 * of a contiguous region. 328 */ 329 void 330 blist_free(blist_t bl, daddr_t blkno, daddr_t count) 331 { 332 333 KASSERT(blkno >= 0 && blkno + count <= bl->bl_blocks, 334 ("freeing invalid range: blkno %jx, count %d, blocks %jd", 335 (uintmax_t)blkno, (int)count, (uintmax_t)bl->bl_blocks)); 336 blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix); 337 bl->bl_avail += count; 338 } 339 340 /* 341 * blist_fill() - mark a region in the block bitmap as off-limits 342 * to the allocator (i.e. allocate it), ignoring any 343 * existing allocations. Return the number of blocks 344 * actually filled that were free before the call. 345 */ 346 daddr_t 347 blist_fill(blist_t bl, daddr_t blkno, daddr_t count) 348 { 349 daddr_t filled; 350 351 KASSERT(blkno >= 0 && blkno + count <= bl->bl_blocks, 352 ("filling invalid range: blkno %jx, count %d, blocks %jd", 353 (uintmax_t)blkno, (int)count, (uintmax_t)bl->bl_blocks)); 354 filled = blst_meta_fill(bl->bl_root, blkno, count, bl->bl_radix); 355 bl->bl_avail -= filled; 356 return (filled); 357 } 358 359 /* 360 * blist_resize() - resize an existing radix tree to handle the 361 * specified number of blocks. This will reallocate 362 * the tree and transfer the previous bitmap to the new 363 * one. When extending the tree you can specify whether 364 * the new blocks are to left allocated or freed. 365 */ 366 void 367 blist_resize(blist_t *pbl, daddr_t count, int freenew, int flags) 368 { 369 blist_t newbl = blist_create(count, flags); 370 blist_t save = *pbl; 371 372 *pbl = newbl; 373 if (count > save->bl_blocks) 374 count = save->bl_blocks; 375 blst_copy(save->bl_root, 0, save->bl_radix, newbl, count); 376 377 /* 378 * If resizing upwards, should we free the new space or not? 379 */ 380 if (freenew && count < newbl->bl_blocks) { 381 blist_free(newbl, count, newbl->bl_blocks - count); 382 } 383 blist_destroy(save); 384 } 385 386 #ifdef BLIST_DEBUG 387 388 /* 389 * blist_print() - dump radix tree 390 */ 391 void 392 blist_print(blist_t bl) 393 { 394 printf("BLIST avail = %jd, cursor = %08jx {\n", 395 (uintmax_t)bl->bl_avail, (uintmax_t)bl->bl_cursor); 396 397 if (bl->bl_root->bm_bitmap != 0) 398 blst_radix_print(bl->bl_root, 0, bl->bl_radix, 4); 399 printf("}\n"); 400 } 401 402 #endif 403 404 static const u_daddr_t fib[] = { 405 1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584, 406 4181, 6765, 10946, 17711, 28657, 46368, 75025, 121393, 196418, 317811, 407 514229, 832040, 1346269, 2178309, 3524578, 408 }; 409 410 /* 411 * Use 'gap' to describe a maximal range of unallocated blocks/bits. 412 */ 413 struct gap_stats { 414 daddr_t start; /* current gap start, or SWAPBLK_NONE */ 415 daddr_t num; /* number of gaps observed */ 416 daddr_t max; /* largest gap size */ 417 daddr_t avg; /* average gap size */ 418 daddr_t err; /* sum - num * avg */ 419 daddr_t histo[nitems(fib)]; /* # gaps in each size range */ 420 int max_bucket; /* last histo elt with nonzero val */ 421 }; 422 423 /* 424 * gap_stats_counting() - is the state 'counting 1 bits'? 425 * or 'skipping 0 bits'? 426 */ 427 static inline bool 428 gap_stats_counting(const struct gap_stats *stats) 429 { 430 431 return (stats->start != SWAPBLK_NONE); 432 } 433 434 /* 435 * init_gap_stats() - initialize stats on gap sizes 436 */ 437 static inline void 438 init_gap_stats(struct gap_stats *stats) 439 { 440 441 bzero(stats, sizeof(*stats)); 442 stats->start = SWAPBLK_NONE; 443 } 444 445 /* 446 * update_gap_stats() - update stats on gap sizes 447 */ 448 static void 449 update_gap_stats(struct gap_stats *stats, daddr_t posn) 450 { 451 daddr_t size; 452 int hi, lo, mid; 453 454 if (!gap_stats_counting(stats)) { 455 stats->start = posn; 456 return; 457 } 458 size = posn - stats->start; 459 stats->start = SWAPBLK_NONE; 460 if (size > stats->max) 461 stats->max = size; 462 463 /* 464 * Find the fibonacci range that contains size, 465 * expecting to find it in an early range. 466 */ 467 lo = 0; 468 hi = 1; 469 while (hi < nitems(fib) && fib[hi] <= size) { 470 lo = hi; 471 hi *= 2; 472 } 473 if (hi >= nitems(fib)) 474 hi = nitems(fib); 475 while (lo + 1 != hi) { 476 mid = (lo + hi) >> 1; 477 if (fib[mid] <= size) 478 lo = mid; 479 else 480 hi = mid; 481 } 482 stats->histo[lo]++; 483 if (lo > stats->max_bucket) 484 stats->max_bucket = lo; 485 stats->err += size - stats->avg; 486 stats->num++; 487 stats->avg += stats->err / stats->num; 488 stats->err %= stats->num; 489 } 490 491 /* 492 * dump_gap_stats() - print stats on gap sizes 493 */ 494 static inline void 495 dump_gap_stats(const struct gap_stats *stats, struct sbuf *s) 496 { 497 int i; 498 499 sbuf_printf(s, "number of maximal free ranges: %jd\n", 500 (intmax_t)stats->num); 501 sbuf_printf(s, "largest free range: %jd\n", (intmax_t)stats->max); 502 sbuf_printf(s, "average maximal free range size: %jd\n", 503 (intmax_t)stats->avg); 504 sbuf_printf(s, "number of maximal free ranges of different sizes:\n"); 505 sbuf_printf(s, " count | size range\n"); 506 sbuf_printf(s, " ----- | ----------\n"); 507 for (i = 0; i < stats->max_bucket; i++) { 508 if (stats->histo[i] != 0) { 509 sbuf_printf(s, "%20jd | ", 510 (intmax_t)stats->histo[i]); 511 if (fib[i] != fib[i + 1] - 1) 512 sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i], 513 (intmax_t)fib[i + 1] - 1); 514 else 515 sbuf_printf(s, "%jd\n", (intmax_t)fib[i]); 516 } 517 } 518 sbuf_printf(s, "%20jd | ", (intmax_t)stats->histo[i]); 519 if (stats->histo[i] > 1) 520 sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i], 521 (intmax_t)stats->max); 522 else 523 sbuf_printf(s, "%jd\n", (intmax_t)stats->max); 524 } 525 526 /* 527 * blist_stats() - dump radix tree stats 528 */ 529 void 530 blist_stats(blist_t bl, struct sbuf *s) 531 { 532 struct gap_stats gstats; 533 struct gap_stats *stats = &gstats; 534 daddr_t i, nodes, radix; 535 u_daddr_t bit, diff, mask; 536 537 init_gap_stats(stats); 538 nodes = 0; 539 i = bl->bl_radix; 540 while (i < bl->bl_radix + bl->bl_blocks) { 541 /* 542 * Find max size subtree starting at i. 543 */ 544 radix = BLIST_BMAP_RADIX; 545 while (((i / radix) & BLIST_META_MASK) == 0) 546 radix *= BLIST_META_RADIX; 547 548 /* 549 * Check for skippable subtrees starting at i. 550 */ 551 while (radix > BLIST_BMAP_RADIX) { 552 if (bl->bl_root[nodes].bm_bitmap == 0) { 553 if (gap_stats_counting(stats)) 554 update_gap_stats(stats, i); 555 break; 556 } 557 558 /* 559 * Skip subtree root. 560 */ 561 nodes++; 562 radix /= BLIST_META_RADIX; 563 } 564 if (radix == BLIST_BMAP_RADIX) { 565 /* 566 * Scan leaf. 567 */ 568 mask = bl->bl_root[nodes].bm_bitmap; 569 diff = mask ^ (mask << 1); 570 if (gap_stats_counting(stats)) 571 diff ^= 1; 572 while (diff != 0) { 573 bit = diff & -diff; 574 update_gap_stats(stats, i + bitpos(bit)); 575 diff ^= bit; 576 } 577 } 578 nodes += radix_to_skip(radix); 579 i += radix; 580 } 581 update_gap_stats(stats, i); 582 dump_gap_stats(stats, s); 583 } 584 585 /************************************************************************ 586 * ALLOCATION SUPPORT FUNCTIONS * 587 ************************************************************************ 588 * 589 * These support functions do all the actual work. They may seem 590 * rather longish, but that's because I've commented them up. The 591 * actual code is straight forward. 592 * 593 */ 594 595 /* 596 * BLST_NEXT_LEAF_ALLOC() - allocate the first few blocks in the next leaf. 597 * 598 * 'scan' is a leaf node, associated with a block containing 'blk'. 599 * The next leaf node could be adjacent, or several nodes away if the 600 * least common ancestor of 'scan' and its neighbor is several levels 601 * up. Use 'blk' to determine how many meta-nodes lie between the 602 * leaves. If the next leaf has enough initial bits set, clear them 603 * and clear the bits in the meta nodes on the path up to the least 604 * common ancestor to mark any subtrees made completely empty. 605 */ 606 static int 607 blst_next_leaf_alloc(blmeta_t *scan, daddr_t blk, int count) 608 { 609 blmeta_t *next; 610 u_daddr_t radix; 611 int digit; 612 613 next = scan + 1; 614 blk += BLIST_BMAP_RADIX; 615 radix = BLIST_BMAP_RADIX; 616 while ((digit = ((blk / radix) & BLIST_META_MASK)) == 0 && 617 (next->bm_bitmap & 1) == 1) { 618 next++; 619 radix *= BLIST_META_RADIX; 620 } 621 if (((next->bm_bitmap + 1) & ~((u_daddr_t)-1 << count)) != 0) { 622 /* 623 * The next leaf doesn't have enough free blocks at the 624 * beginning to complete the spanning allocation. 625 */ 626 return (ENOMEM); 627 } 628 /* Clear the first 'count' bits in the next leaf to allocate. */ 629 next->bm_bitmap &= (u_daddr_t)-1 << count; 630 631 /* 632 * Update bitmaps of next-ancestors, up to least common ancestor. 633 */ 634 while (next->bm_bitmap == 0) { 635 if (--next == scan) { 636 scan[-digit * radix_to_skip(radix)].bm_bitmap ^= 637 (u_daddr_t)1 << digit; 638 break; 639 } 640 next->bm_bitmap ^= 1; 641 } 642 return (0); 643 } 644 645 /* 646 * Given a bitmask, flip all the bits from the least-significant 1-bit to the 647 * most significant bit. If the result is non-zero, then the least-significant 648 * 1-bit of the result is in the same position as the least-signification 0-bit 649 * in mask that is followed by a 1-bit. 650 */ 651 static inline u_daddr_t 652 flip_hibits(u_daddr_t mask) 653 { 654 655 return (-mask & ~mask); 656 } 657 658 /* 659 * BLST_LEAF_ALLOC() - allocate at a leaf in the radix tree (a bitmap). 660 * 661 * This function is the core of the allocator. Its execution time is 662 * proportional to log(count), plus height of the tree if the allocation 663 * crosses a leaf boundary. 664 */ 665 static daddr_t 666 blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count) 667 { 668 u_daddr_t cursor_mask, mask; 669 int count1, hi, lo, num_shifts, range1, range_ext; 670 671 range1 = 0; 672 count1 = count - 1; 673 num_shifts = fls(count1); 674 mask = scan->bm_bitmap; 675 while (flip_hibits(mask) != 0 && num_shifts > 0) { 676 /* 677 * If bit i is set in mask, then bits in [i, i+range1] are set 678 * in scan->bm_bitmap. The value of range1 is equal to count1 679 * >> num_shifts. Grow range1 and reduce num_shifts to 0, 680 * while preserving these invariants. The updates to mask 681 * leave fewer bits set, but each bit that remains set 682 * represents a longer string of consecutive bits set in 683 * scan->bm_bitmap. If more updates to mask cannot clear more 684 * bits, because mask is partitioned with all 0 bits preceding 685 * all 1 bits, the loop terminates immediately. 686 */ 687 num_shifts--; 688 range_ext = range1 + ((count1 >> num_shifts) & 1); 689 /* 690 * mask is a signed quantity for the shift because when it is 691 * shifted right, the sign bit should copied; when the last 692 * block of the leaf is free, pretend, for a while, that all the 693 * blocks that follow it are also free. 694 */ 695 mask &= (daddr_t)mask >> range_ext; 696 range1 += range_ext; 697 } 698 if (mask == 0) { 699 /* 700 * Update bighint. There is no allocation bigger than range1 701 * starting in this leaf. 702 */ 703 scan->bm_bighint = range1; 704 return (SWAPBLK_NONE); 705 } 706 707 /* Discard any candidates that appear before blk. */ 708 if ((blk & BLIST_BMAP_MASK) != 0) { 709 cursor_mask = mask & bitrange(0, blk & BLIST_BMAP_MASK); 710 if (cursor_mask != 0) { 711 mask ^= cursor_mask; 712 if (mask == 0) 713 return (SWAPBLK_NONE); 714 715 /* 716 * Bighint change for last block allocation cannot 717 * assume that any other blocks are allocated, so the 718 * bighint cannot be reduced much. 719 */ 720 range1 = BLIST_MAX_ALLOC - 1; 721 } 722 blk &= ~BLIST_BMAP_MASK; 723 } 724 725 /* 726 * The least significant set bit in mask marks the start of the first 727 * available range of sufficient size. Clear all the bits but that one, 728 * and then find its position. 729 */ 730 mask &= -mask; 731 lo = bitpos(mask); 732 733 hi = lo + count; 734 if (hi > BLIST_BMAP_RADIX) { 735 /* 736 * An allocation within this leaf is impossible, so a successful 737 * allocation depends on the next leaf providing some of the blocks. 738 */ 739 if (blst_next_leaf_alloc(scan, blk, hi - BLIST_BMAP_RADIX) != 0) 740 /* 741 * The hint cannot be updated, because the same 742 * allocation request could be satisfied later, by this 743 * leaf, if the state of the next leaf changes, and 744 * without any changes to this leaf. 745 */ 746 return (SWAPBLK_NONE); 747 hi = BLIST_BMAP_RADIX; 748 } 749 750 /* Set the bits of mask at position 'lo' and higher. */ 751 mask = -mask; 752 if (hi == BLIST_BMAP_RADIX) { 753 /* 754 * Update bighint. There is no allocation bigger than range1 755 * available in this leaf after this allocation completes. 756 */ 757 scan->bm_bighint = range1; 758 } else { 759 /* Clear the bits of mask at position 'hi' and higher. */ 760 mask &= (u_daddr_t)-1 >> (BLIST_BMAP_RADIX - hi); 761 } 762 /* Clear the allocated bits from this leaf. */ 763 scan->bm_bitmap &= ~mask; 764 return (blk + lo); 765 } 766 767 /* 768 * blist_meta_alloc() - allocate at a meta in the radix tree. 769 * 770 * Attempt to allocate at a meta node. If we can't, we update 771 * bighint and return a failure. Updating bighint optimize future 772 * calls that hit this node. We have to check for our collapse cases 773 * and we have a few optimizations strewn in as well. 774 */ 775 static daddr_t 776 blst_meta_alloc(blmeta_t *scan, daddr_t cursor, daddr_t count, u_daddr_t radix) 777 { 778 daddr_t blk, i, r, skip; 779 u_daddr_t bit, mask; 780 bool scan_from_start; 781 int digit; 782 783 if (radix == BLIST_BMAP_RADIX) 784 return (blst_leaf_alloc(scan, cursor, count)); 785 blk = cursor & -radix; 786 scan_from_start = (cursor == blk); 787 radix /= BLIST_META_RADIX; 788 skip = radix_to_skip(radix); 789 mask = scan->bm_bitmap; 790 791 /* Discard any candidates that appear before cursor. */ 792 digit = (cursor / radix) & BLIST_META_MASK; 793 mask &= (u_daddr_t)-1 << digit; 794 if (mask == 0) 795 return (SWAPBLK_NONE); 796 797 /* 798 * If the first try is for a block that includes the cursor, pre-undo 799 * the digit * radix offset in the first call; otherwise, ignore the 800 * cursor entirely. 801 */ 802 if (((mask >> digit) & 1) == 1) 803 cursor -= digit * radix; 804 else 805 cursor = blk; 806 807 /* 808 * Examine the nonempty subtree associated with each bit set in mask. 809 */ 810 do { 811 bit = mask & -mask; 812 digit = bitpos(bit); 813 i = 1 + digit * skip; 814 if (count <= scan[i].bm_bighint) { 815 /* 816 * The allocation might fit beginning in the i'th subtree. 817 */ 818 r = blst_meta_alloc(&scan[i], cursor + digit * radix, 819 count, radix); 820 if (r != SWAPBLK_NONE) { 821 if (scan[i].bm_bitmap == 0) 822 scan->bm_bitmap ^= bit; 823 return (r); 824 } 825 } 826 cursor = blk; 827 } while ((mask ^= bit) != 0); 828 829 /* 830 * We couldn't allocate count in this subtree. If the whole tree was 831 * scanned, and the last tree node is allocated, update bighint. 832 */ 833 if (scan_from_start && !(digit == BLIST_META_RADIX - 1 && 834 scan[i].bm_bighint == BLIST_MAX_ALLOC)) 835 scan->bm_bighint = count - 1; 836 837 return (SWAPBLK_NONE); 838 } 839 840 /* 841 * BLST_LEAF_FREE() - free allocated block from leaf bitmap 842 * 843 */ 844 static void 845 blst_leaf_free(blmeta_t *scan, daddr_t blk, int count) 846 { 847 u_daddr_t mask; 848 849 /* 850 * free some data in this bitmap 851 * mask=0000111111111110000 852 * \_________/\__/ 853 * count n 854 */ 855 mask = bitrange(blk & BLIST_BMAP_MASK, count); 856 KASSERT((scan->bm_bitmap & mask) == 0, 857 ("freeing free block: %jx, size %d, mask %jx", 858 (uintmax_t)blk, count, (uintmax_t)scan->bm_bitmap & mask)); 859 scan->bm_bitmap |= mask; 860 } 861 862 /* 863 * BLST_META_FREE() - free allocated blocks from radix tree meta info 864 * 865 * This support routine frees a range of blocks from the bitmap. 866 * The range must be entirely enclosed by this radix node. If a 867 * meta node, we break the range down recursively to free blocks 868 * in subnodes (which means that this code can free an arbitrary 869 * range whereas the allocation code cannot allocate an arbitrary 870 * range). 871 */ 872 static void 873 blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, u_daddr_t radix) 874 { 875 daddr_t blk, endBlk, i, skip; 876 int digit, endDigit; 877 878 /* 879 * We could probably do a better job here. We are required to make 880 * bighint at least as large as the biggest allocable block of data. 881 * If we just shoehorn it, a little extra overhead will be incurred 882 * on the next allocation (but only that one typically). 883 */ 884 scan->bm_bighint = BLIST_MAX_ALLOC; 885 886 if (radix == BLIST_BMAP_RADIX) 887 return (blst_leaf_free(scan, freeBlk, count)); 888 889 endBlk = ummin(freeBlk + count, (freeBlk + radix) & -radix); 890 radix /= BLIST_META_RADIX; 891 skip = radix_to_skip(radix); 892 blk = freeBlk & -radix; 893 digit = (blk / radix) & BLIST_META_MASK; 894 endDigit = 1 + (((endBlk - 1) / radix) & BLIST_META_MASK); 895 scan->bm_bitmap |= bitrange(digit, endDigit - digit); 896 for (i = 1 + digit * skip; blk < endBlk; i += skip) { 897 blk += radix; 898 count = ummin(blk, endBlk) - freeBlk; 899 blst_meta_free(&scan[i], freeBlk, count, radix); 900 freeBlk = blk; 901 } 902 } 903 904 /* 905 * BLST_COPY() - copy one radix tree to another 906 * 907 * Locates free space in the source tree and frees it in the destination 908 * tree. The space may not already be free in the destination. 909 */ 910 static void 911 blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, blist_t dest, 912 daddr_t count) 913 { 914 daddr_t endBlk, i, skip; 915 916 /* 917 * Leaf node 918 */ 919 920 if (radix == BLIST_BMAP_RADIX) { 921 u_daddr_t v = scan->bm_bitmap; 922 923 if (v == (u_daddr_t)-1) { 924 blist_free(dest, blk, count); 925 } else if (v != 0) { 926 int i; 927 928 for (i = 0; i < count; ++i) { 929 if (v & ((u_daddr_t)1 << i)) 930 blist_free(dest, blk + i, 1); 931 } 932 } 933 return; 934 } 935 936 /* 937 * Meta node 938 */ 939 940 if (scan->bm_bitmap == 0) { 941 /* 942 * Source all allocated, leave dest allocated 943 */ 944 return; 945 } 946 947 endBlk = blk + count; 948 radix /= BLIST_META_RADIX; 949 skip = radix_to_skip(radix); 950 for (i = 1; blk < endBlk; i += skip) { 951 blk += radix; 952 count = radix; 953 if (blk >= endBlk) 954 count -= blk - endBlk; 955 blst_copy(&scan[i], blk - radix, radix, dest, count); 956 } 957 } 958 959 /* 960 * BLST_LEAF_FILL() - allocate specific blocks in leaf bitmap 961 * 962 * This routine allocates all blocks in the specified range 963 * regardless of any existing allocations in that range. Returns 964 * the number of blocks allocated by the call. 965 */ 966 static daddr_t 967 blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count) 968 { 969 daddr_t nblks; 970 u_daddr_t mask; 971 972 mask = bitrange(blk & BLIST_BMAP_MASK, count); 973 974 /* Count the number of blocks that we are allocating. */ 975 nblks = bitcount64(scan->bm_bitmap & mask); 976 977 scan->bm_bitmap &= ~mask; 978 return (nblks); 979 } 980 981 /* 982 * BLIST_META_FILL() - allocate specific blocks at a meta node 983 * 984 * This routine allocates the specified range of blocks, 985 * regardless of any existing allocations in the range. The 986 * range must be within the extent of this node. Returns the 987 * number of blocks allocated by the call. 988 */ 989 static daddr_t 990 blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count, u_daddr_t radix) 991 { 992 daddr_t blk, endBlk, i, nblks, skip; 993 int digit; 994 995 if (radix == BLIST_BMAP_RADIX) 996 return (blst_leaf_fill(scan, allocBlk, count)); 997 998 endBlk = ummin(allocBlk + count, (allocBlk + radix) & -radix); 999 radix /= BLIST_META_RADIX; 1000 skip = radix_to_skip(radix); 1001 blk = allocBlk & -radix; 1002 nblks = 0; 1003 while (blk < endBlk) { 1004 digit = (blk / radix) & BLIST_META_MASK; 1005 i = 1 + digit * skip; 1006 blk += radix; 1007 count = ummin(blk, endBlk) - allocBlk; 1008 nblks += blst_meta_fill(&scan[i], allocBlk, count, radix); 1009 if (scan[i].bm_bitmap == 0) 1010 scan->bm_bitmap &= ~((u_daddr_t)1 << digit); 1011 allocBlk = blk; 1012 } 1013 return (nblks); 1014 } 1015 1016 #ifdef BLIST_DEBUG 1017 1018 static void 1019 blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int tab) 1020 { 1021 daddr_t skip; 1022 u_daddr_t bit, mask; 1023 int digit; 1024 1025 if (radix == BLIST_BMAP_RADIX) { 1026 printf( 1027 "%*.*s(%08llx,%lld): bitmap %0*llx big=%lld\n", 1028 tab, tab, "", 1029 (long long)blk, (long long)radix, 1030 1 + (BLIST_BMAP_RADIX - 1) / 4, 1031 (long long)scan->bm_bitmap, 1032 (long long)scan->bm_bighint 1033 ); 1034 return; 1035 } 1036 1037 printf( 1038 "%*.*s(%08llx): subtree (%lld/%lld) bitmap %0*llx big=%lld {\n", 1039 tab, tab, "", 1040 (long long)blk, (long long)radix, 1041 (long long)radix, 1042 1 + (BLIST_META_RADIX - 1) / 4, 1043 (long long)scan->bm_bitmap, 1044 (long long)scan->bm_bighint 1045 ); 1046 1047 radix /= BLIST_META_RADIX; 1048 skip = radix_to_skip(radix); 1049 tab += 4; 1050 1051 mask = scan->bm_bitmap; 1052 /* Examine the nonempty subtree associated with each bit set in mask */ 1053 do { 1054 bit = mask & -mask; 1055 digit = bitpos(bit); 1056 blst_radix_print(&scan[1 + digit * skip], blk + digit * radix, 1057 radix, tab); 1058 } while ((mask ^= bit) != 0); 1059 tab -= 4; 1060 1061 printf( 1062 "%*.*s}\n", 1063 tab, tab, "" 1064 ); 1065 } 1066 1067 #endif 1068 1069 #ifdef BLIST_DEBUG 1070 1071 int 1072 main(int ac, char **av) 1073 { 1074 int size = BLIST_META_RADIX * BLIST_BMAP_RADIX; 1075 int i; 1076 blist_t bl; 1077 struct sbuf *s; 1078 1079 for (i = 1; i < ac; ++i) { 1080 const char *ptr = av[i]; 1081 if (*ptr != '-') { 1082 size = strtol(ptr, NULL, 0); 1083 continue; 1084 } 1085 ptr += 2; 1086 fprintf(stderr, "Bad option: %s\n", ptr - 2); 1087 exit(1); 1088 } 1089 bl = blist_create(size, M_WAITOK); 1090 blist_free(bl, 0, size); 1091 1092 for (;;) { 1093 char buf[1024]; 1094 long long da = 0; 1095 long long count = 0; 1096 1097 printf("%lld/%lld/%lld> ", (long long)blist_avail(bl), 1098 (long long)size, (long long)bl->bl_radix); 1099 fflush(stdout); 1100 if (fgets(buf, sizeof(buf), stdin) == NULL) 1101 break; 1102 switch(buf[0]) { 1103 case 'r': 1104 if (sscanf(buf + 1, "%lld", &count) == 1) { 1105 blist_resize(&bl, count, 1, M_WAITOK); 1106 } else { 1107 printf("?\n"); 1108 } 1109 case 'p': 1110 blist_print(bl); 1111 break; 1112 case 's': 1113 s = sbuf_new_auto(); 1114 blist_stats(bl, s); 1115 sbuf_finish(s); 1116 printf("%s", sbuf_data(s)); 1117 sbuf_delete(s); 1118 break; 1119 case 'a': 1120 if (sscanf(buf + 1, "%lld", &count) == 1) { 1121 daddr_t blk = blist_alloc(bl, count); 1122 printf(" R=%08llx\n", (long long)blk); 1123 } else { 1124 printf("?\n"); 1125 } 1126 break; 1127 case 'f': 1128 if (sscanf(buf + 1, "%llx %lld", &da, &count) == 2) { 1129 blist_free(bl, da, count); 1130 } else { 1131 printf("?\n"); 1132 } 1133 break; 1134 case 'l': 1135 if (sscanf(buf + 1, "%llx %lld", &da, &count) == 2) { 1136 printf(" n=%jd\n", 1137 (intmax_t)blist_fill(bl, da, count)); 1138 } else { 1139 printf("?\n"); 1140 } 1141 break; 1142 case '?': 1143 case 'h': 1144 puts( 1145 "p -print\n" 1146 "s -stats\n" 1147 "a %d -allocate\n" 1148 "f %x %d -free\n" 1149 "l %x %d -fill\n" 1150 "r %d -resize\n" 1151 "h/? -help" 1152 ); 1153 break; 1154 default: 1155 printf("?\n"); 1156 break; 1157 } 1158 } 1159 return (0); 1160 } 1161 1162 #endif 1163