1 /* 2 * Copyright (c) 2013-2015 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@dragonflybsd.org> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/fcntl.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/mountctl.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>

#include "hammer2.h"

/*
 * H2FMBASE rounds (key) down to the base of the freemap block at the
 * given radix; H2FMSHIFT computes the byte size of a block at (radix).
 */
#define H2FMBASE(key, radix)	((key) & ~(((hammer2_off_t)1 << (radix)) - 1))
#define H2FMSHIFT(radix)	((hammer2_off_t)1 << (radix))

/*
 * breadth-first search
 *
 * Deferral record: when the depth-first recursion in hammer2_bulk_scan()
 * exceeds its depth limit, the chain is referenced and queued on a TAILQ
 * to be rescanned later from the top level.
 */
typedef struct hammer2_chain_save {
	TAILQ_ENTRY(hammer2_chain_save) entry;
	hammer2_chain_t	*chain;		/* referenced chain to rescan */
	int		pri;
} hammer2_chain_save_t;

TAILQ_HEAD(hammer2_chain_save_list, hammer2_chain_save);
typedef struct hammer2_chain_save_list hammer2_chain_save_list_t;

/*
 * Per-pass bulkfree state.  One in-memory freemap window (sbase..sstop)
 * is reconstructed per loop in (bmap), then reconciled against the live
 * on-media freemap.  The count_* fields are statistics reported at the
 * end of the pass.
 */
typedef struct hammer2_bulkfree_info {
	hammer2_dev_t	*hmp;
	kmem_anon_desc_t kp;		/* swap-backed bmap allocation */
	hammer2_off_t	sbase;		/* sub-loop iteration */
	hammer2_off_t	sstop;
	hammer2_bmap_data_t *bmap;	/* in-memory freemap reconstruction */
	int		depth;		/* current recursion depth */
	long		count_10_00;	/* staged->free */
	long		count_11_10;	/* allocated->staged */
	long		count_00_11;	/* (should not happen) */
	long		count_01_11;	/* (should not happen) */
	long		count_10_11;	/* staged->allocated */
	long		count_l0cleans;	/* fully-freed level-0 bmaps */
	long		count_linadjusts; /* linear offset reductions */
	long		count_inodes_scanned;
	long		count_dedup_factor;
	long		bytes_scanned;
	hammer2_off_t	adj_free;	/* bytes to add to allocator_free */
	hammer2_tid_t	mtid;
	hammer2_tid_t	saved_mirror_tid;
	time_t		save_time;	/* for signal-check pacing */
	hammer2_chain_save_list_t list;	/* deferred-recursion queue */
	hammer2_dedup_t	*dedup;		/* duplicate-bref suppression heur */
	int		pri;
} hammer2_bulkfree_info_t;

static int h2_bulkfree_test(hammer2_bulkfree_info_t *info,
			hammer2_blockref_t *bref, int pri);

/*
 * General bulk scan function with callback.  Called with a referenced
 * but UNLOCKED parent.  The parent is returned in the same state.
 */
static
int
hammer2_bulk_scan(hammer2_chain_t *parent,
		  int (*func)(hammer2_bulkfree_info_t *info,
			      hammer2_blockref_t *bref),
		  hammer2_bulkfree_info_t *info)
{
	hammer2_blockref_t bref;
	hammer2_chain_t *chain;
	int cache_index = -1;
	int doabort = 0;	/* accumulated HAMMER2_BULK_ABORT flags */
	int first = 1;

	++info->pri;

	/* Shared lock is sufficient: the scan only reads topology */
	hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS |
				   HAMMER2_RESOLVE_SHARED);
	chain = NULL;

	/*
	 * Generally loop on the contents if we have not been flagged
	 * for abort.
	 *
	 * Remember that these chains are completely isolated from
	 * the frontend, so we can release locks temporarily without
	 * imploding.
	 */
	while ((doabort & HAMMER2_BULK_ABORT) == 0 &&
	       hammer2_chain_scan(parent, &chain, &bref, &first,
				  &cache_index,
				  HAMMER2_LOOKUP_NODATA |
				  HAMMER2_LOOKUP_SHARED) != NULL) {
		/*
		 * Process bref, chain is only non-NULL if the bref
		 * might be recursable (its possible that we sometimes get
		 * a non-NULL chain where the bref cannot be recursed).
		 */
#if 0
		kprintf("SCAN %016jx\n", bref.data_off);
		int xerr = tsleep(&info->pri, PCATCH, "slp", hz / 10);
		if (xerr == EINTR || xerr == ERESTART) {
			doabort |= HAMMER2_BULK_ABORT;
		}
#endif
		++info->pri;

		/*
		 * Dedup heuristic: skip brefs we have already visited
		 * (e.g. shared/snapshotted blocks) to avoid rescanning.
		 */
		if (h2_bulkfree_test(info, &bref, 1))
			continue;

		doabort |= func(info, &bref);

		if (doabort & HAMMER2_BULK_ABORT)
			break;

		/*
		 * A non-null chain is always returned if it is
		 * recursive, otherwise a non-null chain might be
		 * returned but usually is not when not recursive.
		 */
		if (chain == NULL)
			continue;

		/*
		 * Else check type and setup depth-first scan.
		 *
		 * Account for bytes actually read.
		 */
		info->bytes_scanned += chain->bytes;

		switch(chain->bref.type) {
		case HAMMER2_BREF_TYPE_INODE:
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		case HAMMER2_BREF_TYPE_INDIRECT:
		case HAMMER2_BREF_TYPE_VOLUME:
		case HAMMER2_BREF_TYPE_FREEMAP:
			++info->depth;
			if (info->depth > 16) {
				/*
				 * Too deep: defer this sub-tree.  The chain
				 * is referenced (not locked) and queued for
				 * a later top-level rescan by the caller.
				 */
				hammer2_chain_save_t *save;
				save = kmalloc(sizeof(*save), M_HAMMER2,
					       M_WAITOK | M_ZERO);
				save->chain = chain;
				hammer2_chain_ref(chain);
				TAILQ_INSERT_TAIL(&info->list, save, entry);

				/* guess */
				info->pri += 10;
			} else {
				/*
				 * Recurse depth-first.  Our shared lock on
				 * (chain) is dropped across the recursion
				 * (the recursion re-locks it) and reacquired
				 * afterwards so hammer2_chain_scan() can
				 * continue the iteration.
				 */
				int savepri = info->pri;

				hammer2_chain_unlock(chain);
				info->pri = 0;
				doabort |= hammer2_bulk_scan(chain, func, info);
				info->pri += savepri;
				hammer2_chain_lock(chain,
						   HAMMER2_RESOLVE_ALWAYS |
						   HAMMER2_RESOLVE_SHARED);
			}
			--info->depth;
			break;
		default:
			/* does not recurse */
			break;
		}
	}
	if (chain) {
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
	}

	/*
	 * Save with higher pri now that we know what it is.
	 */
	h2_bulkfree_test(info, &parent->bref, info->pri + 1);

	hammer2_chain_unlock(parent);

	return doabort;
}

/*
 * Bulkfree algorithm
 *
 * Repeat {
 *	Chain flush (partial synchronization) XXX removed
 *	Scan the whole topology - build in-memory freemap (mark 11)
 *	Reconcile the in-memory freemap against the on-disk freemap.
 *		ondisk xx -> ondisk 11 (if allocated)
 *		ondisk 11 -> ondisk 10 (if free in-memory)
 *		ondisk 10 -> ondisk 00 (if free in-memory) - on next pass
 * }
 *
 * The topology scan may have to be performed multiple times to window
 * freemaps which are too large to fit in kernel memory.
 *
 * Races are handled using a double-transition (11->10, 10->00).
 * The bulkfree
 * scan snapshots the volume root's blockset and thus can run concurrent with
 * normal operations, as long as a full flush is made between each pass to
 * synchronize any modified chains (otherwise their blocks might be improperly
 * freed).
 *
 * Temporary memory in multiples of 64KB is required to reconstruct the leaf
 * hammer2_bmap_data blocks so they can later be compared against the live
 * freemap.  Each 64KB block represents 128 x 16KB x 1024 = ~2 GB of storage.
 * A 32MB save area thus represents around ~1 TB.  The temporary memory
 * allocated can be specified.  If it is not sufficient multiple topology
 * passes will be made.
 */

/*
 * Bulkfree callback info
 */
static void cbinfo_bmap_init(hammer2_bulkfree_info_t *cbinfo, size_t size);
static int h2_bulkfree_callback(hammer2_bulkfree_info_t *cbinfo,
			hammer2_blockref_t *bref);
static void h2_bulkfree_sync(hammer2_bulkfree_info_t *cbinfo);
static void h2_bulkfree_sync_adjust(hammer2_bulkfree_info_t *cbinfo,
			hammer2_off_t data_off, hammer2_bmap_data_t *live,
			hammer2_bmap_data_t *bmap, int nofree);

/*
 * Top-level entry for a bulkfree pass.  Windows the storage range given
 * by (bfi) into chunks that fit the temporary bmap memory, scans the
 * snapshotted topology for each window, and reconciles the result with
 * the live freemap.  Returns accumulated HAMMER2_BULK_* abort flags.
 */
int
hammer2_bulkfree_pass(hammer2_dev_t *hmp, hammer2_ioc_bulkfree_t *bfi)
{
	hammer2_bulkfree_info_t cbinfo;
	hammer2_chain_t *vchain;
	hammer2_chain_save_t *save;
	hammer2_off_t incr;
	size_t size;
	int doabort = 0;

	/*
	 * A bulkfree operations lock is required for the duration.  We
	 * must hold it across our flushes to guarantee that we never run
	 * two bulkfree passes in a row without a flush in the middle.
	 */
	lockmgr(&hmp->bulklk, LK_EXCLUSIVE);

	/*
	 * We have to clear the live dedup cache as it might have entries
	 * that are freeable as of now.  Any new entries in the dedup cache
	 * made after this point, even if they become freeable, will have
	 * previously been fully allocated and will be protected by the
	 * 2-stage bulkfree.
	 */
	hammer2_dedup_clear(hmp);

#if 1
	/*
	 * XXX This has been put back in.  The below check is currently
	 * disabled because it creates quite a bit of confusion, so we're
	 * going to try to fix this in a different way.
	 *
	 * XXX This has been removed.  Instead of trying to flush, which
	 * appears to have a ton of races against life chains even with
	 * the two-stage scan, we simply refuse to free any blocks
	 * related to freemap chains modified after the last filesystem
	 * sync.
	 *
	 * Do a quick flush so we can snapshot vchain for any blocks that
	 * have been allocated prior to this point.  We don't need to
	 * flush vnodes, logical buffers, or dirty inodes that have not
	 * allocated blocks yet.  We do not want to flush the device buffers
	 * nor do we want to flush the actual volume root to disk here,
	 * that is not needed to perform the snapshot.
	 */
	hammer2_flush_quick(hmp);
#endif

	/*
	 * Setup for free pass.  Round the caller's buffer size up to a
	 * multiple of the freemap leaf size.
	 */
	bzero(&cbinfo, sizeof(cbinfo));
	size = (bfi->size + HAMMER2_FREEMAP_LEVELN_PSIZE - 1) &
	       ~(size_t)(HAMMER2_FREEMAP_LEVELN_PSIZE - 1);
	cbinfo.hmp = hmp;
	cbinfo.bmap = kmem_alloc_swapbacked(&cbinfo.kp, size);
	cbinfo.saved_mirror_tid = hmp->voldata.mirror_tid;

	cbinfo.dedup = kmalloc(sizeof(*cbinfo.dedup) * HAMMER2_DEDUP_HEUR_SIZE,
			       M_HAMMER2, M_WAITOK | M_ZERO);

	/*
	 * Normalize start point to a 2GB boundary.  We operate on a
	 * 64KB leaf bitmap boundary which represents 2GB of storage.
	 */
	cbinfo.sbase = bfi->sbase;
	if (cbinfo.sbase > hmp->voldata.volu_size)
		cbinfo.sbase = hmp->voldata.volu_size;
	cbinfo.sbase &= ~HAMMER2_FREEMAP_LEVEL1_MASK;

	/*
	 * The primary storage scan must use a snapshot of the volume
	 * root to avoid racing renames and other frontend work.
	 *
	 * Note that snapshots only snap synchronized storage, so
	 * we have to flush between each pass or we risk freeing
	 * storage allocated by the frontend.
	 */
	vchain = hammer2_chain_bulksnap(&hmp->vchain);
	TAILQ_INIT(&cbinfo.list);

	/*
	 * Loop on a full meta-data scan as many times as required to
	 * get through all available storage.
	 */
	while (cbinfo.sbase < hmp->voldata.volu_size) {
		/*
		 * We have enough ram to represent (incr) bytes of storage.
		 * Each 64KB of ram represents 2GB of storage.
		 */
		cbinfo_bmap_init(&cbinfo, size);
		incr = size / HAMMER2_FREEMAP_LEVELN_PSIZE *
		       HAMMER2_FREEMAP_LEVEL1_SIZE;
		if (hmp->voldata.volu_size - cbinfo.sbase < incr)
			cbinfo.sstop = hmp->voldata.volu_size;
		else
			cbinfo.sstop = cbinfo.sbase + incr;
		if (hammer2_debug & 1) {
			kprintf("bulkfree pass %016jx/%jdGB\n",
				(intmax_t)cbinfo.sbase,
				(intmax_t)incr / HAMMER2_FREEMAP_LEVEL1_SIZE);
		}

		/*
		 * Scan topology for stuff inside this range.
		 */
		hammer2_trans_init(hmp->spmp, 0);
		cbinfo.mtid = hammer2_trans_sub(hmp->spmp);
		cbinfo.pri = 0;
		doabort |= hammer2_bulk_scan(vchain, h2_bulkfree_callback,
					     &cbinfo);

		/*
		 * Drain deferred sub-trees queued by the depth limiter in
		 * hammer2_bulk_scan(), stopping early on abort.
		 */
		while ((save = TAILQ_FIRST(&cbinfo.list)) != NULL &&
		       doabort == 0) {
			TAILQ_REMOVE(&cbinfo.list, save, entry);
			cbinfo.pri = 0;
			doabort |= hammer2_bulk_scan(save->chain,
						     h2_bulkfree_callback,
						     &cbinfo);
			hammer2_chain_drop(save->chain);
			kfree(save, M_HAMMER2);
		}

		/*
		 * On abort the queue may be non-empty; release the
		 * remaining deferred chains without scanning them.
		 */
		while (save) {
			TAILQ_REMOVE(&cbinfo.list, save, entry);
			hammer2_chain_drop(save->chain);
			kfree(save, M_HAMMER2);
			save = TAILQ_FIRST(&cbinfo.list);
		}

		kprintf("bulkfree lastdrop %d %d doabort=%d\n",
			vchain->refs, vchain->core.chain_count, doabort);

		/*
		 * If complete scan succeeded we can synchronize our
		 * in-memory freemap against live storage.  If an abort
		 * did occur we cannot safely synchronize our partially
		 * filled-out in-memory freemap.
		 */
		if (doabort == 0) {
			h2_bulkfree_sync(&cbinfo);

			hammer2_voldata_lock(hmp);
			hammer2_voldata_modify(hmp);
			hmp->voldata.allocator_free += cbinfo.adj_free;
			hammer2_voldata_unlock(hmp);
		}

		/*
		 * Cleanup for next loop.
		 */
		hammer2_trans_done(hmp->spmp);
		if (doabort)
			break;
		cbinfo.sbase = cbinfo.sstop;
		cbinfo.adj_free = 0;
	}
	hammer2_chain_bulkdrop(vchain);
	kmem_free_swapbacked(&cbinfo.kp);
	kfree(cbinfo.dedup, M_HAMMER2);
	cbinfo.dedup = NULL;

	bfi->sstop = cbinfo.sbase;

	/* percentage of storage processed, in hundredths of a percent */
	incr = bfi->sstop / (hmp->voldata.volu_size / 10000);
	if (incr > 10000)
		incr = 10000;

	kprintf("bulkfree pass statistics (%d.%02d%% storage processed):\n",
		(int)incr / 100,
		(int)incr % 100);

	kprintf(" transition->free %ld\n", cbinfo.count_10_00);
	kprintf(" transition->staged %ld\n", cbinfo.count_11_10);
	kprintf(" ERR(00)->allocated %ld\n", cbinfo.count_00_11);
	kprintf(" ERR(01)->allocated %ld\n", cbinfo.count_01_11);
	kprintf(" staged->allocated %ld\n", cbinfo.count_10_11);
	kprintf(" ~2MB segs cleaned %ld\n", cbinfo.count_l0cleans);
	kprintf(" linear adjusts %ld\n", cbinfo.count_linadjusts);
	kprintf(" dedup factor %ld\n", cbinfo.count_dedup_factor);

	lockmgr(&hmp->bulklk, LK_RELEASE);
	/* hammer2_vfs_sync(mp, MNT_WAIT); sync needed */

	return doabort;
}

/*
 * Initialize the in-memory bmap window prior to a scan.  Ranges below
 * the allocator base (plus the reserved zone segment) or beyond the end
 * of the volume are marked fully allocated so they can never be freed.
 */
static void
cbinfo_bmap_init(hammer2_bulkfree_info_t *cbinfo, size_t size)
{
	hammer2_bmap_data_t *bmap = cbinfo->bmap;
	hammer2_key_t key = cbinfo->sbase;
	hammer2_key_t lokey;
	hammer2_key_t hikey;

	lokey = (cbinfo->hmp->voldata.allocator_beg + HAMMER2_SEGMASK64) &
		~HAMMER2_SEGMASK64;
	hikey = cbinfo->hmp->voldata.volu_size & ~HAMMER2_SEGMASK64;

	bzero(bmap, size);
	while (size) {
		/* lokey never goes below the reserved zone header seg */
		if (lokey < H2FMBASE(key, HAMMER2_FREEMAP_LEVEL1_RADIX) +
			    HAMMER2_ZONE_SEG64) {
			lokey = H2FMBASE(key, HAMMER2_FREEMAP_LEVEL1_RADIX) +
				HAMMER2_ZONE_SEG64;
		}
		if (key < lokey || key >= hikey) {
			/* unusable range: mark fully allocated */
			memset(bmap->bitmapq, -1,
			       sizeof(bmap->bitmapq));
			bmap->avail = 0;
			bmap->linear = HAMMER2_SEGSIZE;
		} else {
			bmap->avail = H2FMSHIFT(HAMMER2_FREEMAP_LEVEL0_RADIX);
		}
		size -= sizeof(*bmap);
		key += HAMMER2_FREEMAP_LEVEL0_SIZE;
		++bmap;
	}
}

/*
 * Per-bref scan callback.  Marks the storage backing (bref) as allocated
 * (code 11) in the in-memory bmap if it falls inside the current window.
 * Returns 0 or HAMMER2_BULK_ABORT (on signal).
 */
static int
h2_bulkfree_callback(hammer2_bulkfree_info_t *cbinfo, hammer2_blockref_t *bref)
{
	hammer2_bmap_data_t *bmap;
	hammer2_off_t data_off;
	uint16_t class;
	size_t bytes;
	int radix;
	int error;

	/*
	 * Check for signal and allow yield to userland during scan
	 */
	if (hammer2_signal_check(&cbinfo->save_time))
		return HAMMER2_BULK_ABORT;
	if (bref->type == HAMMER2_BREF_TYPE_INODE) {
		++cbinfo->count_inodes_scanned;
		if ((cbinfo->count_inodes_scanned & 65535) == 0)
			kprintf(" inodes %6ld bytes %9ld\n",
				cbinfo->count_inodes_scanned,
				cbinfo->bytes_scanned);
	}

	/*
	 * Calculate the data offset and determine if it is within
	 * the current freemap range being gathered.
	 */
	error = 0;
	data_off = bref->data_off & ~HAMMER2_OFF_MASK_RADIX;
	if (data_off < cbinfo->sbase || data_off >= cbinfo->sstop)
		return 0;
	if (data_off < cbinfo->hmp->voldata.allocator_beg)
		return 0;
	if (data_off >= cbinfo->hmp->voldata.volu_size)
		return 0;

	/*
	 * Calculate the information needed to generate the in-memory
	 * freemap record.
	 *
	 * Hammer2 does not allow allocations to cross the L1 (2GB) boundary,
	 * it's a problem if it does.  (Or L0 (2MB) for that matter).
	 */
	radix = (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
	bytes = (size_t)1 << radix;	/* allocation size from radix */
	class = (bref->type << 8) | hammer2_devblkradix(radix);

	/*
	 * NOTE(review): ">=" also fires when the allocation ends exactly
	 * at sstop, not only when it crosses it — presumably a deliberate
	 * conservative clamp; confirm before changing to ">".
	 */
	if (data_off + bytes >= cbinfo->sstop) {
		kprintf("hammer2_bulkfree_scan: illegal 2GB boundary "
			"%016jx %016jx/%d\n",
			(intmax_t)bref->data_off,
			(intmax_t)bref->key,
			bref->keybits);
		bytes = cbinfo->sstop - data_off;	/* XXX */
	}

	/*
	 * Convert to a storage offset relative to the beginning of the
	 * storage range we are collecting.  Then lookup the level0 bmap entry.
	 */
	data_off -= cbinfo->sbase;
	bmap = cbinfo->bmap + (data_off >> HAMMER2_FREEMAP_LEVEL0_RADIX);

	/*
	 * Convert data_off to a bmap-relative value (~2MB storage range).
	 * Adjust linear, class, and avail.
	 *
	 * Hammer2 does not allow allocations to cross the L0 (2MB) boundary,
	 */
	data_off &= HAMMER2_FREEMAP_LEVEL0_MASK;
	if (data_off + bytes > HAMMER2_FREEMAP_LEVEL0_SIZE) {
		kprintf("hammer2_bulkfree_scan: illegal 2MB boundary "
			"%016jx %016jx/%d\n",
			(intmax_t)bref->data_off,
			(intmax_t)bref->key,
			bref->keybits);
		bytes = HAMMER2_FREEMAP_LEVEL0_SIZE - data_off;
	}

	/* first allocation seen in this level0 bmap sets its class */
	if (bmap->class == 0) {
		bmap->class = class;
		bmap->avail = HAMMER2_FREEMAP_LEVEL0_SIZE;
	}
	if (bmap->class != class) {
		kprintf("hammer2_bulkfree_scan: illegal mixed class "
			"%016jx %016jx/%d (%04x vs %04x)\n",
			(intmax_t)bref->data_off,
			(intmax_t)bref->key,
			bref->keybits,
			class, bmap->class);
	}

	/* push the linear append point past this allocation */
	if (bmap->linear < (int32_t)data_off + (int32_t)bytes)
		bmap->linear = (int32_t)data_off + (int32_t)bytes;

	/*
	 * Adjust the hammer2_bitmap_t bitmap[HAMMER2_BMAP_ELEMENTS].
	 * 64-bit entries, 2 bits per entry, to code 11.
	 *
	 * NOTE: The allocation can be smaller than HAMMER2_FREEMAP_BLOCK_SIZE.
	 */
	while (bytes > 0) {
		int bindex;
		hammer2_bitmap_t bmask;

		/* locate the 64-bit element and the 2-bit field within it */
		bindex = (int)data_off >> (HAMMER2_FREEMAP_BLOCK_RADIX +
					   HAMMER2_BMAP_INDEX_RADIX);
		bmask = (hammer2_bitmap_t)3 <<
			((((int)data_off & HAMMER2_BMAP_INDEX_MASK) >>
			 HAMMER2_FREEMAP_BLOCK_RADIX) << 1);

		/*
		 * NOTE! The (avail) calculation is bitmap-granular.  Multiple
		 *	 sub-granular records can wind up at the same bitmap
		 *	 position.
		 */
		if ((bmap->bitmapq[bindex] & bmask) == 0) {
			if (bytes < HAMMER2_FREEMAP_BLOCK_SIZE) {
				bmap->avail -= HAMMER2_FREEMAP_BLOCK_SIZE;
			} else {
				bmap->avail -= bytes;
			}
			bmap->bitmapq[bindex] |= bmask;
		}
		data_off += HAMMER2_FREEMAP_BLOCK_SIZE;
		if (bytes < HAMMER2_FREEMAP_BLOCK_SIZE)
			bytes = 0;
		else
			bytes -= HAMMER2_FREEMAP_BLOCK_SIZE;
	}
	return error;
}

/*
 * Synchronize the in-memory bitmap with the live freemap.  This is not a
 * direct copy.  Instead the bitmaps must be compared:
 *
 *	In-memory	Live-freemap
 *	   00		  11 -> 10	(do nothing if live modified)
 *			  10 -> 00	(do nothing if live modified)
 *	   11		  10 -> 11	handles race against live
 *			  ** -> 11	nominally warn of corruption
 *
 */
static void
h2_bulkfree_sync(hammer2_bulkfree_info_t *cbinfo)
{
	hammer2_off_t data_off;
	hammer2_key_t key;
	hammer2_key_t key_dummy;
	hammer2_bmap_data_t *bmap;
	hammer2_bmap_data_t *live;
	hammer2_chain_t *live_parent;
	hammer2_chain_t *live_chain;
	int cache_index = -1;
	int bmapindex;

	/* report the effective range being synchronized */
	kprintf("hammer2_bulkfree - range ");

	if (cbinfo->sbase < cbinfo->hmp->voldata.allocator_beg)
		kprintf("%016jx-",
			(intmax_t)cbinfo->hmp->voldata.allocator_beg);
	else
		kprintf("%016jx-",
			(intmax_t)cbinfo->sbase);

	if (cbinfo->sstop > cbinfo->hmp->voldata.volu_size)
		kprintf("%016jx\n",
			(intmax_t)cbinfo->hmp->voldata.volu_size);
	else
		kprintf("%016jx\n",
			(intmax_t)cbinfo->sstop);

	data_off = cbinfo->sbase;
	bmap = cbinfo->bmap;

	/* the live freemap hangs off the freemap chain (fchain) */
	live_parent = &cbinfo->hmp->fchain;
	hammer2_chain_ref(live_parent);
	hammer2_chain_lock(live_parent, HAMMER2_RESOLVE_ALWAYS);
	live_chain = NULL;

	/*
	 * Iterate each hammer2_bmap_data_t line (128 bytes) managing
	 * 4MB of storage.
	 */
	while (data_off < cbinfo->sstop) {
		/*
		 * The freemap is not used below allocator_beg or beyond
		 * volu_size.
		 */
		int nofree;

		if (data_off < cbinfo->hmp->voldata.allocator_beg)
			goto next;
		if (data_off >= cbinfo->hmp->voldata.volu_size)
			goto next;

		/*
		 * Locate the freemap leaf on the live filesystem.  The
		 * cached live_chain is reused while we remain inside the
		 * same level1 (2GB) leaf.
		 */
		key = (data_off & ~HAMMER2_FREEMAP_LEVEL1_MASK);
		nofree = 0;

		if (live_chain == NULL || live_chain->bref.key != key) {
			if (live_chain) {
				hammer2_chain_unlock(live_chain);
				hammer2_chain_drop(live_chain);
			}
			live_chain = hammer2_chain_lookup(
					    &live_parent,
					    &key_dummy,
					    key,
					    key + HAMMER2_FREEMAP_LEVEL1_MASK,
					    &cache_index,
					    HAMMER2_LOOKUP_ALWAYS);

#if 0
			/*
			 * If recent allocations were made we avoid races by
			 * not staging or freeing any blocks.  We can still
			 * remark blocks as fully allocated.
			 */
			if (live_chain) {
				if (hammer2_debug & 1) {
					kprintf("live_chain %016jx\n",
						(intmax_t)key);
				}
				if (live_chain->bref.mirror_tid >
				    cbinfo->saved_mirror_tid) {
					kprintf("hammer2_bulkfree: "
						"avoid %016jx\n",
						data_off);
					nofree = 1;
				} else {
					nofree = 0;
				}
			}
#endif
		}
		if (live_chain == NULL) {
			/*
			 * XXX if we implement a full recovery mode we need
			 * to create/recreate missing freemap chains if our
			 * bmap has any allocated blocks.
			 */
			if (bmap->class &&
			    bmap->avail != HAMMER2_FREEMAP_LEVEL0_SIZE) {
				kprintf("hammer2_bulkfree: cannot locate "
					"live leaf for allocated data "
					"near %016jx\n",
					(intmax_t)data_off);
			}
			goto next;
		}
		if (live_chain->error) {
			kprintf("hammer2_bulkfree: error %s looking up "
				"live leaf for allocated data near %016jx\n",
				hammer2_error_str(live_chain->error),
				(intmax_t)data_off);
			hammer2_chain_unlock(live_chain);
			hammer2_chain_drop(live_chain);
			live_chain = NULL;
			goto next;
		}

		bmapindex = (data_off & HAMMER2_FREEMAP_LEVEL1_MASK) >>
			    HAMMER2_FREEMAP_LEVEL0_RADIX;
		live = &live_chain->data->bmdata[bmapindex];

		/*
		 * TODO - we could shortcut this by testing that both
		 * live->class and bmap->class are 0, and both avails are
		 * set to HAMMER2_FREEMAP_LEVEL0_SIZE (4MB).
		 */
		if (bcmp(live->bitmapq, bmap->bitmapq,
			 sizeof(bmap->bitmapq)) == 0) {
			goto next;
		}
		if (hammer2_debug & 1) {
			kprintf("live %016jx %04d.%04x (avail=%d)\n",
				data_off, bmapindex, live->class, live->avail);
		}

		/*
		 * Re-fetch (live) after the modify: chain_modify can COW
		 * the chain's data block.
		 */
		hammer2_chain_modify(live_chain, cbinfo->mtid, 0, 0);
		live = &live_chain->data->bmdata[bmapindex];

		h2_bulkfree_sync_adjust(cbinfo, data_off, live, bmap, nofree);
next:
		data_off += HAMMER2_FREEMAP_LEVEL0_SIZE;
		++bmap;
	}
	if (live_chain) {
		hammer2_chain_unlock(live_chain);
		hammer2_chain_drop(live_chain);
	}
	if (live_parent) {
		hammer2_chain_unlock(live_parent);
		hammer2_chain_drop(live_parent);
	}
}

/*
 * When bulkfree is finally able to free a block it must make sure that
 * the INVALOK bit in any cached DIO is cleared prior to the block being
 * reused.
 */
static
void
fixup_dio(hammer2_dev_t *hmp, hammer2_off_t data_off, int bindex, int scount)
{
	/* translate (bindex, scount) back into an absolute device offset */
	data_off += (scount >> 1) * HAMMER2_FREEMAP_BLOCK_SIZE;
	data_off += bindex *
		(HAMMER2_FREEMAP_BLOCK_SIZE * HAMMER2_BMAP_BLOCKS_PER_ELEMENT);
	hammer2_io_resetinval(hmp, data_off);
}

/*
 * Merge the bulkfree bitmap against the existing bitmap.
 *
 * If nofree is non-zero the merge will only mark free blocks as allocated
 * and will refuse to free any blocks.
 */
static
void
h2_bulkfree_sync_adjust(hammer2_bulkfree_info_t *cbinfo,
			hammer2_off_t data_off, hammer2_bmap_data_t *live,
			hammer2_bmap_data_t *bmap, int nofree)
{
	int bindex;
	int scount;
	hammer2_bitmap_t lmask;
	hammer2_bitmap_t mmask;

	/*
	 * Walk every 2-bit block state in the level0 bmap, comparing the
	 * in-memory (scanned) state against the live on-media state.
	 */
	for (bindex = 0; bindex < HAMMER2_BMAP_ELEMENTS; ++bindex) {
		lmask = live->bitmapq[bindex];	/* live */
		mmask = bmap->bitmapq[bindex];	/* snapshotted bulkfree */
		if (lmask == mmask)
			continue;

		for (scount = 0;
		     scount < HAMMER2_BMAP_BITS_PER_ELEMENT;
		     scount += 2) {
			if ((mmask & 3) == 0) {
				/*
				 * in-memory 00		live 11 -> 10
				 *			live 10 -> 00
				 *
				 * Storage might be marked allocated or
				 * staged and must be remarked staged or
				 * free.
				 */
				switch (lmask & 3) {
				case 0:	/* 00 */
					break;
				case 1:	/* 01 */
					/* staged-in-live but free in-memory:
					   invalid transition, warn only */
					kprintf("hammer2_bulkfree: cannot "
						"transition m=00/l=01\n");
					break;
				case 2:	/* 10 -> 00 */
					if (nofree)
						break;
					live->bitmapq[bindex] &=
					    ~((hammer2_bitmap_t)2 << scount);
					live->avail +=
						HAMMER2_FREEMAP_BLOCK_SIZE;
					if (live->avail >
					    HAMMER2_FREEMAP_LEVEL0_SIZE) {
						live->avail =
						    HAMMER2_FREEMAP_LEVEL0_SIZE;
					}
					cbinfo->adj_free +=
						HAMMER2_FREEMAP_BLOCK_SIZE;
					++cbinfo->count_10_00;
					break;
				case 3:	/* 11 -> 10 */
					if (nofree)
						break;
					live->bitmapq[bindex] &=
					    ~((hammer2_bitmap_t)1 << scount);
					++cbinfo->count_11_10;
					/* clear cached DIO invalidation
					   before the block can be reused */
					fixup_dio(cbinfo->hmp, data_off,
						  bindex, scount);
					break;
				}
			} else if ((mmask & 3) == 3) {
				/*
				 * in-memory 11		live 10 -> 11
				 *			live ** -> 11
				 *
				 * Storage might be incorrectly marked free
				 * or staged and must be remarked fully
				 * allocated.
				 */
				switch (lmask & 3) {
				case 0:	/* 00 */
					++cbinfo->count_00_11;
					cbinfo->adj_free -=
						HAMMER2_FREEMAP_BLOCK_SIZE;
					live->avail -=
						HAMMER2_FREEMAP_BLOCK_SIZE;
					if ((int32_t)live->avail < 0)
						live->avail = 0;
					break;
				case 1:	/* 01 */
					++cbinfo->count_01_11;
					break;
				case 2:	/* 10 -> 11 */
					++cbinfo->count_10_11;
					break;
				case 3:	/* 11 */
					break;
				}
				live->bitmapq[bindex] |=
					((hammer2_bitmap_t)3 << scount);
			}
			/* advance to the next 2-bit state */
			mmask >>= 2;
			lmask >>= 2;
		}
	}

	/*
	 * Determine if the live bitmap is completely free and reset its
	 * fields if so.  Otherwise check to see if we can reduce the linear
	 * offset.
	 */
	for (bindex = HAMMER2_BMAP_ELEMENTS - 1; bindex >= 0; --bindex) {
		if (live->bitmapq[bindex] != 0)
			break;
	}
	if (nofree) {
		/* do nothing */
	} else if (bindex < 0) {
		/* entire level0 bmap is free: reset it to pristine */
		live->avail = HAMMER2_FREEMAP_LEVEL0_SIZE;
		live->class = 0;
		live->linear = 0;
		++cbinfo->count_l0cleans;
	} else if (bindex < 7) {
		++bindex;
		if (live->linear > bindex * HAMMER2_FREEMAP_BLOCK_SIZE) {
			live->linear = bindex * HAMMER2_FREEMAP_BLOCK_SIZE;
			++cbinfo->count_linadjusts;
		}

		/*
		 * XXX this fine-grained measure still has some issues.
		 *
		 * NOTE(review): together with the ">" test above, this
		 * "<" test makes the pair equivalent to an unconditional
		 * assignment whenever live->linear differs (and it double
		 * counts count_linadjusts is not possible since only one
		 * branch can fire).  Presumably the forward adjust was
		 * added deliberately — confirm intent before simplifying.
		 */
		if (live->linear < bindex * HAMMER2_FREEMAP_BLOCK_SIZE) {
			live->linear = bindex * HAMMER2_FREEMAP_BLOCK_SIZE;
			++cbinfo->count_linadjusts;
		}
	} else {
		live->linear = HAMMER2_SEGSIZE;
	}

#if 0
	if (bmap->class) {
		kprintf("%016jx %04d.%04x (avail=%7d) "
			"%08x %08x %08x %08x %08x %08x %08x %08x\n",
			(intmax_t)data_off,
			(int)((data_off &
			       HAMMER2_FREEMAP_LEVEL1_MASK) >>
			      HAMMER2_FREEMAP_LEVEL0_RADIX),
			bmap->class,
			bmap->avail,
			bmap->bitmap[0], bmap->bitmap[1],
			bmap->bitmap[2], bmap->bitmap[3],
			bmap->bitmap[4], bmap->bitmap[5],
			bmap->bitmap[6], bmap->bitmap[7]);
	}
#endif
}

/*
 * BULKFREE DEDUP HEURISTIC
 *
 * WARNING! This code is SMP safe but the heuristic allows SMP collisions.
 *	    All fields must be loaded into locals and validated.
950 */ 951 static 952 int 953 h2_bulkfree_test(hammer2_bulkfree_info_t *cbinfo, hammer2_blockref_t *bref, 954 int pri) 955 { 956 hammer2_dedup_t *dedup; 957 int best; 958 int n; 959 int i; 960 961 n = hammer2_icrc32(&bref->data_off, sizeof(bref->data_off)); 962 dedup = cbinfo->dedup + (n & (HAMMER2_DEDUP_HEUR_MASK & ~7)); 963 964 for (i = best = 0; i < 8; ++i) { 965 if (dedup[i].data_off == bref->data_off) { 966 if (dedup[i].ticks < pri) 967 dedup[i].ticks = pri; 968 if (pri == 1) 969 cbinfo->count_dedup_factor += dedup[i].ticks; 970 return 1; 971 } 972 if (dedup[i].ticks < dedup[best].ticks) 973 best = i; 974 } 975 dedup[best].data_off = bref->data_off; 976 dedup[best].ticks = pri; 977 978 return 0; 979 } 980