/*
 * Copyright (c) 2013-2015 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@dragonflybsd.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/fcntl.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/mountctl.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>

#include "hammer2.h"

/*
 * breadth-first search
 *
 * Deferred-recursion record: chains deeper than the depth limit in
 * hammer2_bulk_scan() are queued on a list (with an extra chain ref)
 * instead of being descended immediately, bounding kernel stack use.
 */
typedef struct hammer2_chain_save {
	TAILQ_ENTRY(hammer2_chain_save)	entry;	/* list linkage */
	hammer2_chain_t	*chain;			/* referenced chain to scan later */
	int		pri;			/* NOTE(review): set nowhere visible here */
} hammer2_chain_save_t;

TAILQ_HEAD(hammer2_chain_save_list, hammer2_chain_save);
typedef struct hammer2_chain_save_list hammer2_chain_save_list_t;

/*
 * Per-pass bulkfree state shared by the scan callback and the
 * freemap synchronization code.
 */
typedef struct hammer2_bulkfree_info {
	hammer2_dev_t		*hmp;		/* device being scanned */
	kmem_anon_desc_t	kp;		/* swap-backed bmap allocation */
	hammer2_off_t		sbase;		/* sub-loop iteration */
	hammer2_off_t		sstop;		/* end of current storage window */
	hammer2_bmap_data_t	*bmap;		/* in-memory freemap under construction */
	int			depth;		/* current recursion depth in bulk_scan */
	long			count_10_00;	/* staged->free */
	long			count_11_10;	/* allocated->staged */
	long			count_00_11;	/* (should not happen) */
	long			count_01_11;	/* (should not happen) */
	long			count_10_11;	/* staged->allocated */
	long			count_l0cleans;	/* fully-freed level0 segments */
	long			count_linadjusts; /* linear offset reductions */
	long			count_inodes_scanned;
	long			count_dedup_factor;
	long			bytes_scanned;	/* chain bytes read during scan */
	hammer2_off_t		adj_free;	/* net bytes freed, applied to voldata */
	hammer2_tid_t		mtid;		/* modify tid for freemap updates */
	hammer2_tid_t		saved_mirror_tid; /* snapshot of voldata.mirror_tid */
	time_t			save_time;	/* for periodic signal checks */
	hammer2_chain_save_list_t list;		/* deferred-recursion queue */
	hammer2_dedup_t		*dedup;		/* bref dedup heuristic table */
	int			pri;		/* scan priority accumulator */
} hammer2_bulkfree_info_t;

static int h2_bulkfree_test(hammer2_bulkfree_info_t *info,
			hammer2_blockref_t *bref, int pri);

/*
 * General bulk scan function with callback.  Called with a referenced
 * but UNLOCKED parent.  The parent is returned in the same state.
 */
static
int
hammer2_bulk_scan(hammer2_chain_t *parent,
		  int (*func)(hammer2_bulkfree_info_t *info,
			      hammer2_blockref_t *bref),
		  hammer2_bulkfree_info_t *info)
{
	hammer2_blockref_t bref;
	hammer2_chain_t *chain;
	int cache_index = -1;
	int doabort = 0;	/* accumulated HAMMER2_BULK_* flags */
	int first = 1;

	++info->pri;

	hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS |
				   HAMMER2_RESOLVE_SHARED);
	chain = NULL;

	/*
	 * Generally loop on the contents if we have not been flagged
	 * for abort.
	 *
	 * Remember that these chains are completely isolated from
	 * the frontend, so we can release locks temporarily without
	 * imploding.
	 */
	while ((doabort & HAMMER2_BULK_ABORT) == 0 &&
	       hammer2_chain_scan(parent, &chain, &bref, &first,
				  &cache_index,
				  HAMMER2_LOOKUP_NODATA |
				  HAMMER2_LOOKUP_SHARED) != NULL) {
		/*
		 * Process bref, chain is only non-NULL if the bref
		 * might be recursable (its possible that we sometimes get
		 * a non-NULL chain where the bref cannot be recursed).
		 */
#if 0
		kprintf("SCAN %016jx\n", bref.data_off);
		int xerr = tsleep(&info->pri, PCATCH, "slp", hz / 10);
		if (xerr == EINTR || xerr == ERESTART) {
			doabort |= HAMMER2_BULK_ABORT;
		}
#endif
		++info->pri;
		/*
		 * Skip brefs recently seen per the dedup heuristic; the
		 * callback and any recursion were already performed for
		 * an identical data_off.
		 */
		if (h2_bulkfree_test(info, &bref, 1))
			continue;

		doabort |= func(info, &bref);

		if (doabort & HAMMER2_BULK_ABORT)
			break;

		/*
		 * A non-null chain is always returned if it is
		 * recursive, otherwise a non-null chain might be
		 * returned but usually is not when not recursive.
		 */
		if (chain == NULL)
			continue;

		/*
		 * Else check type and setup depth-first scan.
		 *
		 * Account for bytes actually read.
		 */
		info->bytes_scanned += chain->bytes;

		switch(chain->bref.type) {
		case HAMMER2_BREF_TYPE_INODE:
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		case HAMMER2_BREF_TYPE_INDIRECT:
		case HAMMER2_BREF_TYPE_VOLUME:
		case HAMMER2_BREF_TYPE_FREEMAP:
			++info->depth;
			if (info->depth > 16) {
				/*
				 * Too deep: defer this sub-tree.  Take an
				 * extra ref and queue it; the caller drains
				 * info->list after the top-level scan.
				 */
				hammer2_chain_save_t *save;
				save = kmalloc(sizeof(*save), M_HAMMER2,
					       M_WAITOK | M_ZERO);
				save->chain = chain;
				hammer2_chain_ref(chain);
				TAILQ_INSERT_TAIL(&info->list, save, entry);

				/* guess */
				info->pri += 10;
			} else {
				/*
				 * Recurse depth-first.  chain is unlocked
				 * across the recursion and relocked after,
				 * shared, so chain_scan() can continue.
				 */
				int savepri = info->pri;

				hammer2_chain_unlock(chain);
				info->pri = 0;
				doabort |= hammer2_bulk_scan(chain, func, info);
				info->pri += savepri;
				hammer2_chain_lock(chain,
						   HAMMER2_RESOLVE_ALWAYS |
						   HAMMER2_RESOLVE_SHARED);
			}
			--info->depth;
			break;
		default:
			/* does not recurse */
			break;
		}
	}
	if (chain) {
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
	}

	/*
	 * Save with higher pri now that we know what it is.
	 */
	h2_bulkfree_test(info, &parent->bref, info->pri + 1);

	hammer2_chain_unlock(parent);

	return doabort;
}

/*
 * Bulkfree algorithm
 *
 * Repeat {
 *	Chain flush (partial synchronization) XXX removed
 *	Scan the whole topology - build in-memory freemap (mark 11)
 *	Reconcile the in-memory freemap against the on-disk freemap.
 *	ondisk xx -> ondisk 11 (if allocated)
 *	ondisk 11 -> ondisk 10 (if free in-memory)
 *	ondisk 10 -> ondisk 00 (if free in-memory) - on next pass
 * }
 *
 * The topology scan may have to be performed multiple times to window
 * freemaps which are too large to fit in kernel memory.
 *
 * Races are handled using a double-transition (11->10, 10->00).
 * The bulkfree
 * scan snapshots the volume root's blockset and thus can run concurrent with
 * normal operations, as long as a full flush is made between each pass to
 * synchronize any modified chains (otherwise their blocks might be improperly
 * freed).
 *
 * Temporary memory in multiples of 64KB is required to reconstruct the leaf
 * hammer2_bmap_data blocks so they can later be compared against the live
 * freemap.  Each 64KB block represents 128 x 16KB x 1024 = ~2 GB of storage.
 * A 32MB save area thus represents around ~1 TB.  The temporary memory
 * allocated can be specified.  If it is not sufficient multiple topology
 * passes will be made.
 */

/*
 * Bulkfree callback info
 */
static int h2_bulkfree_callback(hammer2_bulkfree_info_t *cbinfo,
			hammer2_blockref_t *bref);
static void h2_bulkfree_sync(hammer2_bulkfree_info_t *cbinfo);
static void h2_bulkfree_sync_adjust(hammer2_bulkfree_info_t *cbinfo,
			hammer2_bmap_data_t *live, hammer2_bmap_data_t *bmap,
			int nofree);

/*
 * Top-level bulkfree pass.  Scans the topology under a snapshot of the
 * volume root, builds an in-memory freemap window at a time (window size
 * derived from bfi->size), and reconciles each window against the live
 * freemap.  Returns 0 on completion or a HAMMER2_BULK_* abort flag mask.
 */
int
hammer2_bulkfree_pass(hammer2_dev_t *hmp, hammer2_ioc_bulkfree_t *bfi)
{
	hammer2_bulkfree_info_t cbinfo;
	hammer2_chain_t *vchain;
	hammer2_chain_save_t *save;
	hammer2_off_t incr;
	size_t size;
	int doabort = 0;

	/*
	 * A bulkfree operations lock is required for the duration.  We
	 * must hold it across our flushes to guarantee that we never run
	 * two bulkfree passes in a row without a flush in the middle.
	 */
	lockmgr(&hmp->bulklk, LK_EXCLUSIVE);

	/*
	 * We have to clear the live dedup cache as it might have entries
	 * that are freeable as of now.  Any new entries in the dedup cache
	 * made after this point, even if they become freeable, will have
	 * previously been fully allocated and will be protected by the
	 * 2-stage bulkfree.
	 */
	hammer2_dedup_clear(hmp);

#if 0
	/*
	 * XXX This has been removed.  Instead of trying to flush, which
	 * appears to have a ton of races against life chains even with
	 * the two-stage scan, we simply refuse to free any blocks
	 * related to freemap chains modified after the last filesystem
	 * sync.
	 *
	 * Do a quick flush so we can snapshot vchain for any blocks that
	 * have been allocated prior to this point.  We don't need to
	 * flush vnodes, logical buffers, or dirty inodes that have not
	 * allocated blocks yet.  We do not want to flush the device buffers
	 * nor do we want to flush the actual volume root to disk here,
	 * that is not needed to perform the snapshot.
	 */
	hammer2_flush_quick(hmp);
#endif

	/*
	 * Setup for free pass.  Round the in-memory bitmap size up to a
	 * whole number of 64KB freemap leaf blocks.
	 */
	bzero(&cbinfo, sizeof(cbinfo));
	size = (bfi->size + HAMMER2_FREEMAP_LEVELN_PSIZE - 1) &
	       ~(size_t)(HAMMER2_FREEMAP_LEVELN_PSIZE - 1);
	cbinfo.hmp = hmp;
	cbinfo.bmap = kmem_alloc_swapbacked(&cbinfo.kp, size);
	cbinfo.saved_mirror_tid = hmp->voldata.mirror_tid;

	cbinfo.dedup = kmalloc(sizeof(*cbinfo.dedup) * HAMMER2_DEDUP_HEUR_SIZE,
			       M_HAMMER2, M_WAITOK | M_ZERO);

	/*
	 * Normalize start point to a 2GB boundary.  We operate on a
	 * 64KB leaf bitmap boundary which represents 2GB of storage.
	 */
	cbinfo.sbase = bfi->sbase;
	if (cbinfo.sbase > hmp->voldata.volu_size)
		cbinfo.sbase = hmp->voldata.volu_size;
	cbinfo.sbase &= ~HAMMER2_FREEMAP_LEVEL1_MASK;

	/*
	 * The primary storage scan must use a snapshot of the volume
	 * root to avoid racing renames and other frontend work.
	 *
	 * Note that snapshots only snap synchronized storage, so
	 * we have to flush between each pass or we risk freeing
	 * storage allocated by the frontend.
	 */
	vchain = hammer2_chain_bulksnap(&hmp->vchain);
	TAILQ_INIT(&cbinfo.list);

	/*
	 * Loop on a full meta-data scan as many times as required to
	 * get through all available storage.
	 */
	while (cbinfo.sbase < hmp->voldata.volu_size) {
		/*
		 * We have enough ram to represent (incr) bytes of storage.
		 * Each 64KB of ram represents 2GB of storage.
		 */
		bzero(cbinfo.bmap, size);
		incr = size / HAMMER2_FREEMAP_LEVELN_PSIZE *
		       HAMMER2_FREEMAP_LEVEL1_SIZE;
		if (hmp->voldata.volu_size - cbinfo.sbase < incr)
			cbinfo.sstop = hmp->voldata.volu_size;
		else
			cbinfo.sstop = cbinfo.sbase + incr;
		if (hammer2_debug & 1)
			kprintf("bulkfree pass %016jx/%jdGB\n",
				(intmax_t)cbinfo.sbase,
				(intmax_t)incr / HAMMER2_FREEMAP_LEVEL1_SIZE);

		hammer2_trans_init(hmp->spmp, 0);
		cbinfo.mtid = hammer2_trans_sub(hmp->spmp);
		cbinfo.pri = 0;
		doabort |= hammer2_bulk_scan(vchain, h2_bulkfree_callback,
					     &cbinfo);

		/*
		 * Drain the deferred-recursion queue built by
		 * hammer2_bulk_scan() (depth bounding), as long as no
		 * abort has occurred.
		 */
		while ((save = TAILQ_FIRST(&cbinfo.list)) != NULL &&
		       doabort == 0) {
			TAILQ_REMOVE(&cbinfo.list, save, entry);
			cbinfo.pri = 0;
			doabort |= hammer2_bulk_scan(save->chain,
						     h2_bulkfree_callback,
						     &cbinfo);
			hammer2_chain_drop(save->chain);
			kfree(save, M_HAMMER2);
		}

		/*
		 * On abort, discard any remaining queued entries,
		 * dropping the chain refs the scan took.
		 */
		while (save) {
			TAILQ_REMOVE(&cbinfo.list, save, entry);
			hammer2_chain_drop(save->chain);
			kfree(save, M_HAMMER2);
			save = TAILQ_FIRST(&cbinfo.list);
		}

		kprintf("bulkfree lastdrop %d %d\n",
			vchain->refs, vchain->core.chain_count);

		/*
		 * If complete scan succeeded we can synchronize our
		 * in-memory freemap against live storage.  If an abort
		 * did occur we cannot safely synchronize our partially
		 * filled-out in-memory freemap.
		 */
		if (doabort == 0) {
			h2_bulkfree_sync(&cbinfo);

			hammer2_voldata_lock(hmp);
			hammer2_voldata_modify(hmp);
			hmp->voldata.allocator_free += cbinfo.adj_free;
			hammer2_voldata_unlock(hmp);
		}

		/*
		 * Cleanup for next loop.
		 */
		hammer2_trans_done(hmp->spmp);
		if (doabort)
			break;
		cbinfo.sbase = cbinfo.sstop;
	}
	hammer2_chain_bulkdrop(vchain);
	kmem_free_swapbacked(&cbinfo.kp);
	kfree(cbinfo.dedup, M_HAMMER2);
	cbinfo.dedup = NULL;

	bfi->sstop = cbinfo.sbase;

	/*
	 * Report the fraction of the volume processed in units of
	 * 1/100 of a percent (clamped to 100.00%).
	 */
	incr = bfi->sstop / (hmp->voldata.volu_size / 10000);
	if (incr > 10000)
		incr = 10000;

	kprintf("bulkfree pass statistics (%d.%02d%% storage processed):\n",
		(int)incr / 100,
		(int)incr % 100);

	kprintf("    transition->free   %ld\n", cbinfo.count_10_00);
	kprintf("    transition->staged %ld\n", cbinfo.count_11_10);
	kprintf("    ERR(00)->allocated %ld\n", cbinfo.count_00_11);
	kprintf("    ERR(01)->allocated %ld\n", cbinfo.count_01_11);
	kprintf("    staged->allocated  %ld\n", cbinfo.count_10_11);
	kprintf("    ~2MB segs cleaned  %ld\n", cbinfo.count_l0cleans);
	kprintf("    linear adjusts     %ld\n", cbinfo.count_linadjusts);
	kprintf("    dedup factor       %ld\n", cbinfo.count_dedup_factor);

	lockmgr(&hmp->bulklk, LK_RELEASE);

	return doabort;
}

/*
 * Scan callback: mark the storage underlying (bref) as fully allocated
 * (code 11) in the in-memory freemap window, if it falls within the
 * current [sbase, sstop) range.  Returns 0 or HAMMER2_BULK_ABORT.
 */
static int
h2_bulkfree_callback(hammer2_bulkfree_info_t *cbinfo, hammer2_blockref_t *bref)
{
	hammer2_bmap_data_t *bmap;
	hammer2_off_t data_off;
	uint16_t class;
	size_t bytes;
	int radix;
	int error;

	/*
	 * Check for signal and allow yield to userland during scan
	 */
	if (hammer2_signal_check(&cbinfo->save_time))
		return HAMMER2_BULK_ABORT;
	if (bref->type == HAMMER2_BREF_TYPE_INODE) {
		++cbinfo->count_inodes_scanned;
		/* progress report every 1024 inodes */
		if ((cbinfo->count_inodes_scanned & 1023) == 0)
			kprintf(" inodes %6ld bytes %9ld\n",
				cbinfo->count_inodes_scanned,
				cbinfo->bytes_scanned);
	}

	/*
	 * Calculate the data offset and determine if it is within
	 * the current freemap range being gathered.
	 */
	error = 0;
	data_off = bref->data_off & ~HAMMER2_OFF_MASK_RADIX;
	if (data_off < cbinfo->sbase || data_off > cbinfo->sstop)
		return 0;
	if (data_off < cbinfo->hmp->voldata.allocator_beg)
		return 0;
	if (data_off > cbinfo->hmp->voldata.volu_size)
		return 0;

	/*
	 * Calculate the information needed to generate the in-memory
	 * freemap record.
	 *
	 * Hammer2 does not allow allocations to cross the L1 (2GB) boundary,
	 * it's a problem if it does.  (Or L0 (2MB) for that matter).
	 */
	radix = (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
	bytes = (size_t)1 << radix;	/* allocation size encoded in low bits */
	class = (bref->type << 8) | hammer2_devblkradix(radix);

	if (data_off + bytes > cbinfo->sstop) {
		kprintf("hammer2_bulkfree_scan: illegal 2GB boundary "
			"%016jx %016jx/%d\n",
			(intmax_t)bref->data_off,
			(intmax_t)bref->key,
			bref->keybits);
		bytes = cbinfo->sstop - data_off;	/* XXX */
	}

	/*
	 * Convert to a storage offset relative to the beginning of the
	 * storage range we are collecting.  Then lookup the level0 bmap entry.
	 */
	data_off -= cbinfo->sbase;
	bmap = cbinfo->bmap + (data_off >> HAMMER2_FREEMAP_LEVEL0_RADIX);

	/*
	 * Convert data_off to a bmap-relative value (~2MB storage range).
	 * Adjust linear, class, and avail.
	 *
	 * Hammer2 does not allow allocations to cross the L0 (2MB) boundary,
	 */
	data_off &= HAMMER2_FREEMAP_LEVEL0_MASK;
	if (data_off + bytes > HAMMER2_FREEMAP_LEVEL0_SIZE) {
		kprintf("hammer2_bulkfree_scan: illegal 2MB boundary "
			"%016jx %016jx/%d\n",
			(intmax_t)bref->data_off,
			(intmax_t)bref->key,
			bref->keybits);
		bytes = HAMMER2_FREEMAP_LEVEL0_SIZE - data_off;
	}

	/* first use of this level0 entry initializes its class/avail */
	if (bmap->class == 0) {
		bmap->class = class;
		bmap->avail = HAMMER2_FREEMAP_LEVEL0_SIZE;
	}
	if (bmap->class != class) {
		kprintf("hammer2_bulkfree_scan: illegal mixed class "
			"%016jx %016jx/%d (%04x vs %04x)\n",
			(intmax_t)bref->data_off,
			(intmax_t)bref->key,
			bref->keybits,
			class, bmap->class);
	}
	/* track highest in-use offset for the linear allocation pointer */
	if (bmap->linear < (int32_t)data_off + (int32_t)bytes)
		bmap->linear = (int32_t)data_off + (int32_t)bytes;

	/*
	 * Adjust the hammer2_bitmap_t bitmap[HAMMER2_BMAP_ELEMENTS].
	 * 64-bit entries, 2 bits per entry, to code 11.
	 *
	 * NOTE: The allocation can be smaller than HAMMER2_FREEMAP_BLOCK_SIZE.
	 */
	while (bytes > 0) {
		int bindex;
		hammer2_bitmap_t bmask;

		bindex = (int)data_off >> (HAMMER2_FREEMAP_BLOCK_RADIX +
					   HAMMER2_BMAP_INDEX_RADIX);
		bmask = (hammer2_bitmap_t)3 <<
			((((int)data_off & HAMMER2_BMAP_INDEX_MASK) >>
			 HAMMER2_FREEMAP_BLOCK_RADIX) << 1);

		/*
		 * NOTE! The (avail) calculation is bitmap-granular.  Multiple
		 *	 sub-granular records can wind up at the same bitmap
		 *	 position.
		 */
		if ((bmap->bitmapq[bindex] & bmask) == 0) {
			if (bytes < HAMMER2_FREEMAP_BLOCK_SIZE) {
				bmap->avail -= HAMMER2_FREEMAP_BLOCK_SIZE;
			} else {
				bmap->avail -= bytes;
			}
			bmap->bitmapq[bindex] |= bmask;
		}
		data_off += HAMMER2_FREEMAP_BLOCK_SIZE;
		if (bytes < HAMMER2_FREEMAP_BLOCK_SIZE)
			bytes = 0;
		else
			bytes -= HAMMER2_FREEMAP_BLOCK_SIZE;
	}
	return error;
}

/*
 * Synchronize the in-memory bitmap with the live freemap.  This is not a
 * direct copy.  Instead the bitmaps must be compared:
 *
 *	In-memory	Live-freemap
 *	   00		  11 -> 10	(do nothing if live modified)
 *			  10 -> 00	(do nothing if live modified)
 *	   11		  10 -> 11	handles race against live
 *			  ** -> 11	nominally warn of corruption
 *
 */
static void
h2_bulkfree_sync(hammer2_bulkfree_info_t *cbinfo)
{
	hammer2_off_t data_off;
	hammer2_key_t key;
	hammer2_key_t key_dummy;
	hammer2_bmap_data_t *bmap;
	hammer2_bmap_data_t *live;
	hammer2_chain_t *live_parent;
	hammer2_chain_t *live_chain;
	int cache_index = -1;
	int bmapindex;
	int nofree;

	kprintf("hammer2_bulkfree - range %016jx-%016jx\n",
		(intmax_t)cbinfo->sbase,
		(intmax_t)cbinfo->sstop);

	data_off = cbinfo->sbase;
	bmap = cbinfo->bmap;

	live_parent = &cbinfo->hmp->fchain;
	hammer2_chain_ref(live_parent);
	hammer2_chain_lock(live_parent, HAMMER2_RESOLVE_ALWAYS);
	live_chain = NULL;
	nofree = 1;	/* safety */

	/*
	 * Iterate each hammer2_bmap_data_t line (128 bytes) managing
	 * 4MB of storage.
	 */
	while (data_off < cbinfo->sstop) {
		/*
		 * The freemap is not used below allocator_beg or beyond
		 * volu_size.
		 */
		if (data_off < cbinfo->hmp->voldata.allocator_beg)
			goto next;
		if (data_off > cbinfo->hmp->voldata.volu_size)
			goto next;

		/*
		 * Locate the freemap leaf on the live filesystem
		 */
		key = (data_off & ~HAMMER2_FREEMAP_LEVEL1_MASK);
		if (live_chain == NULL || live_chain->bref.key != key) {
			if (live_chain) {
				hammer2_chain_unlock(live_chain);
				hammer2_chain_drop(live_chain);
			}
			live_chain = hammer2_chain_lookup(
					    &live_parent,
					    &key_dummy,
					    key,
					    key + HAMMER2_FREEMAP_LEVEL1_MASK,
					    &cache_index,
					    HAMMER2_LOOKUP_ALWAYS);
			/*
			 * If recent allocations were made we avoid races by
			 * not staging or freeing any blocks.  We can still
			 * remark blocks as fully allocated.
			 */
			if (live_chain) {
				kprintf("live_chain %016jx\n", (intmax_t)key);
				if (live_chain->bref.mirror_tid >
				    cbinfo->saved_mirror_tid) {
					kprintf("hammer2_bulkfree: "
						"avoid %016jx\n",
						data_off);
					nofree = 1;
				} else {
					nofree = 0;
				}
			}

		}
		if (live_chain == NULL) {
			/*
			 * XXX if we implement a full recovery mode we need
			 * to create/recreate missing freemap chains if our
			 * bmap has any allocated blocks.
			 */
			if (bmap->class &&
			    bmap->avail != HAMMER2_FREEMAP_LEVEL0_SIZE) {
				kprintf("hammer2_bulkfree: cannot locate "
					"live leaf for allocated data "
					"near %016jx\n",
					(intmax_t)data_off);
			}
			goto next;
		}
		if (live_chain->error) {
			kprintf("hammer2_bulkfree: error %s looking up "
				"live leaf for allocated data near %016jx\n",
				hammer2_error_str(live_chain->error),
				(intmax_t)data_off);
			hammer2_chain_unlock(live_chain);
			hammer2_chain_drop(live_chain);
			live_chain = NULL;
			goto next;
		}

		bmapindex = (data_off & HAMMER2_FREEMAP_LEVEL1_MASK) >>
			    HAMMER2_FREEMAP_LEVEL0_RADIX;
		live = &live_chain->data->bmdata[bmapindex];

		/*
		 * TODO - we could shortcut this by testing that both
		 * live->class and bmap->class are 0, and both avails are
		 * set to HAMMER2_FREEMAP_LEVEL0_SIZE (4MB).
		 */
		if (bcmp(live->bitmapq, bmap->bitmapq,
			 sizeof(bmap->bitmapq)) == 0) {
			goto next;
		}
		if (hammer2_debug & 1)
			kprintf("live %016jx %04d.%04x (avail=%d)\n",
				data_off, bmapindex, live->class, live->avail);

		/* mark the live leaf modified before editing it in place */
		hammer2_chain_modify(live_chain, cbinfo->mtid, 0, 0);

		h2_bulkfree_sync_adjust(cbinfo, live, bmap, nofree);
next:
		data_off += HAMMER2_FREEMAP_LEVEL0_SIZE;
		++bmap;
	}
	if (live_chain) {
		hammer2_chain_unlock(live_chain);
		hammer2_chain_drop(live_chain);
	}
	if (live_parent) {
		hammer2_chain_unlock(live_parent);
		hammer2_chain_drop(live_parent);
	}
}

/*
 * Merge the bulkfree bitmap against the existing bitmap.
 *
 * If nofree is non-zero the merge will only mark free blocks as allocated
 * and will refuse to free any blocks.
 */
static
void
h2_bulkfree_sync_adjust(hammer2_bulkfree_info_t *cbinfo,
			hammer2_bmap_data_t *live, hammer2_bmap_data_t *bmap,
			int nofree)
{
	int bindex;
	int scount;			/* bit shift within one element */
	hammer2_bitmap_t lmask;		/* live bitmap, shifted as we go */
	hammer2_bitmap_t mmask;		/* in-memory bitmap, shifted as we go */

	/*
	 * Walk each bitmap element, 2 bits (one block) at a time, and
	 * apply the in-memory(m) x live(l) state transitions.
	 */
	for (bindex = 0; bindex < HAMMER2_BMAP_ELEMENTS; ++bindex) {
		lmask = live->bitmapq[bindex];
		mmask = bmap->bitmapq[bindex];
		if (lmask == mmask)
			continue;

		for (scount = 0;
		     scount < HAMMER2_BMAP_BITS_PER_ELEMENT;
		     scount += 2) {
			if ((mmask & 3) == 0) {
				/*
				 * in-memory 00		live 11 -> 10
				 *			live 10 -> 00
				 *
				 * Storage might be marked allocated or
				 * staged and must be remarked staged or
				 * free.
				 */
				switch (lmask & 3) {
				case 0:	/* 00 */
					break;
				case 1:	/* 01 */
					kprintf("hammer2_bulkfree: cannot "
						"transition m=00/l=01\n");
					break;
				case 2:	/* 10 -> 00 */
					if (nofree)
						break;
					live->bitmapq[bindex] &=
					    ~((hammer2_bitmap_t)2 << scount);
					live->avail +=
						HAMMER2_FREEMAP_BLOCK_SIZE;
					/* clamp; see sub-granular note in scan */
					if (live->avail >
					    HAMMER2_FREEMAP_LEVEL0_SIZE) {
						live->avail =
						    HAMMER2_FREEMAP_LEVEL0_SIZE;
					}
					cbinfo->adj_free +=
						HAMMER2_FREEMAP_BLOCK_SIZE;
					++cbinfo->count_10_00;
					break;
				case 3:	/* 11 -> 10 */
					if (nofree)
						break;
					live->bitmapq[bindex] &=
					    ~((hammer2_bitmap_t)1 << scount);
					++cbinfo->count_11_10;
					break;
				}
			} else if ((mmask & 3) == 3) {
				/*
				 * in-memory 11		live 10 -> 11
				 *			live ** -> 11
				 *
				 * Storage might be incorrectly marked free
				 * or staged and must be remarked fully
				 * allocated.
				 */
				switch (lmask & 3) {
				case 0:	/* 00 */
					++cbinfo->count_00_11;
					cbinfo->adj_free -=
						HAMMER2_FREEMAP_BLOCK_SIZE;
					live->avail -=
						HAMMER2_FREEMAP_BLOCK_SIZE;
					/* avail is unsigned; clamp underflow */
					if ((int32_t)live->avail < 0)
						live->avail = 0;
					break;
				case 1:	/* 01 */
					++cbinfo->count_01_11;
					break;
				case 2:	/* 10 -> 11 */
					++cbinfo->count_10_11;
					break;
				case 3:	/* 11 */
					break;
				}
				live->bitmapq[bindex] |=
					((hammer2_bitmap_t)3 << scount);
			}
			mmask >>= 2;
			lmask >>= 2;
		}
	}

	/*
	 * Determine if the live bitmap is completely free and reset its
	 * fields if so.  Otherwise check to see if we can reduce the linear
	 * offset.
	 */
	for (bindex = HAMMER2_BMAP_ELEMENTS - 1; bindex >= 0; --bindex) {
		if (live->bitmapq[bindex] != 0)
			break;
	}
	if (nofree) {
		/* do nothing */
	} else if (bindex < 0) {
		/* all elements zero: the whole level0 segment is free */
		live->avail = HAMMER2_FREEMAP_LEVEL0_SIZE;
		live->class = 0;
		live->linear = 0;
		++cbinfo->count_l0cleans;
	} else if (bindex < 7) {
		++bindex;
		if (live->linear > bindex * HAMMER2_FREEMAP_BLOCK_SIZE) {
			live->linear = bindex * HAMMER2_FREEMAP_BLOCK_SIZE;
			++cbinfo->count_linadjusts;
		}

		/*
		 * XXX this fine-grained measure still has some issues.
		 * (NOTE(review): combined with the test above this pins
		 * linear to exactly bindex * BLOCK_SIZE — confirm intent.)
		 */
		if (live->linear < bindex * HAMMER2_FREEMAP_BLOCK_SIZE) {
			live->linear = bindex * HAMMER2_FREEMAP_BLOCK_SIZE;
			++cbinfo->count_linadjusts;
		}
	} else {
		live->linear = HAMMER2_SEGSIZE;
	}

#if 0
	if (bmap->class) {
		kprintf("%016jx %04d.%04x (avail=%7d) "
			"%08x %08x %08x %08x %08x %08x %08x %08x\n",
			(intmax_t)data_off,
			(int)((data_off &
			       HAMMER2_FREEMAP_LEVEL1_MASK) >>
			      HAMMER2_FREEMAP_LEVEL0_RADIX),
			bmap->class,
			bmap->avail,
			bmap->bitmap[0], bmap->bitmap[1],
			bmap->bitmap[2], bmap->bitmap[3],
			bmap->bitmap[4], bmap->bitmap[5],
			bmap->bitmap[6], bmap->bitmap[7]);
	}
#endif
}

/*
 * BULKFREE DEDUP HEURISTIC
 *
 * WARNING! This code is SMP safe but the heuristic allows SMP collisions.
 *	    All fields must be loaded into locals and validated.
 *
 * 8-way set-associative cache keyed by a crc of bref->data_off.  Returns
 * non-zero if this data_off was seen recently (caller may skip it);
 * otherwise records it, evicting the entry with the lowest ticks, and
 * returns 0.
 */
static
int
h2_bulkfree_test(hammer2_bulkfree_info_t *cbinfo, hammer2_blockref_t *bref,
		 int pri)
{
	hammer2_dedup_t *dedup;
	int best;
	int n;
	int i;

	/* hash data_off down to an 8-entry-aligned bucket */
	n = hammer2_icrc32(&bref->data_off, sizeof(bref->data_off));
	dedup = cbinfo->dedup + (n & (HAMMER2_DEDUP_HEUR_MASK & ~7));

	for (i = best = 0; i < 8; ++i) {
		if (dedup[i].data_off == bref->data_off) {
			/* hit: refresh ticks to the higher priority */
			if (dedup[i].ticks < pri)
				dedup[i].ticks = pri;
			if (pri == 1)
				cbinfo->count_dedup_factor += dedup[i].ticks;
			return 1;
		}
		/* track lowest-ticks slot as the eviction candidate */
		if (dedup[i].ticks < dedup[best].ticks)
			best = i;
	}
	dedup[best].data_off = bref->data_off;
	dedup[best].ticks = pri;

	return 0;
}