1 /* 2 * Copyright (c) 2013-2018 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@dragonflybsd.org> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/fcntl.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/mountctl.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>

#include "hammer2.h"

/*
 * breadth-first search
 *
 * Deferred-scan list element.  hammer2_bulk_scan() queues one of these
 * (taking an extra reference on the chain) instead of recursing once its
 * recursion depth exceeds 16.  hammer2_bulkfree_pass() later pulls the
 * entries off the list, scans each chain, drops the reference and frees
 * the element.
 */
typedef struct hammer2_chain_save {
	TAILQ_ENTRY(hammer2_chain_save)	entry;	/* linkage on info->list */
	hammer2_chain_t	*chain;			/* referenced deferred chain */
	int		pri;			/* not assigned by the code in this
						 * file (elements are M_ZERO'd) */
} hammer2_chain_save_t;

TAILQ_HEAD(hammer2_chain_save_list, hammer2_chain_save);
typedef struct hammer2_chain_save_list hammer2_chain_save_list_t;

/*
 * Working state for one hammer2_bulkfree_pass() invocation.  A single
 * instance lives on the stack of hammer2_bulkfree_pass() and is handed to
 * the scan callback, the sync code and the dedup heuristic.
 */
typedef struct hammer2_bulkfree_info {
	hammer2_dev_t		*hmp;		/* device being processed */
	kmem_anon_desc_t	kp;		/* descriptor for the bmap buffer
						 * (kmem_alloc_swapbacked) */
	hammer2_off_t		sbase;		/* sub-loop iteration: start of the
						 * storage range being gathered */
	hammer2_off_t		sstop;		/* end (exclusive) of that range */
	hammer2_bmap_data_t	*bmap;		/* in-memory freemap for the range */
	int			depth;		/* hammer2_bulk_scan() recursion depth */
	long			count_10_00;	/* staged->free */
	long			count_11_10;	/* allocated->staged */
	long			count_00_11;	/* (should not happen) */
	long			count_01_11;	/* (should not happen) */
	long			count_10_11;	/* staged->allocated */
	long			count_l0cleans;	/* live bmap entries found completely
						 * empty and reset */
	long			count_linadjusts; /* live->linear adjustments made */
	long			count_inodes_scanned;
	long			count_dirents_scanned;
	long			count_dedup_factor; /* sum of ticks on pri==1 dedup hits */
	long			count_bytes_scanned;
	long			count_chains_scanned;
	long			count_chains_reported; /* count_chains_scanned at the time
						 * of the last progress kprintf */
	long			bulkfree_calls;	/* callback rate limiter: calls since
						 * the last throttle check */
	int			bulkfree_ticks;	/* callback rate limiter: ticks value
						 * at the last throttle check */
	hammer2_off_t		adj_free;	/* net adjustment later added to
						 * voldata.allocator_free */
	hammer2_tid_t		mtid;		/* passed to hammer2_chain_modify();
						 * 0 when no transaction is used */
	hammer2_tid_t		saved_mirror_tid; /* voldata.mirror_tid sampled when
						 * the pass was set up */
	time_t			save_time;	/* state for hammer2_signal_check() */
	hammer2_chain_save_list_t list;		/* chains deferred by the scan */
	hammer2_dedup_t		*dedup;		/* heuristic table used by
						 * h2_bulkfree_test() */
	int			pri;		/* running priority estimate fed to
						 * h2_bulkfree_test() */
} hammer2_bulkfree_info_t;

static int h2_bulkfree_test(hammer2_bulkfree_info_t *info,
			hammer2_blockref_t *bref, int pri);

/*
 * General bulk scan function with callback.  Called with a referenced
 * but UNLOCKED parent.  The parent is returned in the same state.
98 */ 99 static 100 int 101 hammer2_bulk_scan(hammer2_chain_t *parent, 102 int (*func)(hammer2_bulkfree_info_t *info, 103 hammer2_blockref_t *bref), 104 hammer2_bulkfree_info_t *info) 105 { 106 hammer2_blockref_t bref; 107 hammer2_chain_t *chain; 108 int first = 1; 109 int rup_error; 110 int error; 111 112 ++info->pri; 113 114 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS | 115 HAMMER2_RESOLVE_SHARED); 116 chain = NULL; 117 rup_error = 0; 118 error = 0; 119 120 /* 121 * Generally loop on the contents if we have not been flagged 122 * for abort. 123 * 124 * Remember that these chains are completely isolated from 125 * the frontend, so we can release locks temporarily without 126 * imploding. 127 */ 128 for (;;) { 129 error |= hammer2_chain_scan(parent, &chain, &bref, &first, 130 HAMMER2_LOOKUP_NODATA | 131 HAMMER2_LOOKUP_SHARED); 132 133 /* 134 * Handle EOF or other error at current level. This stops 135 * the bulkfree scan. 136 */ 137 if (error) 138 break; 139 140 /* 141 * Account for dirents before thre data_off test, since most 142 * dirents do not need a data reference. 143 */ 144 if (bref.type == HAMMER2_BREF_TYPE_DIRENT) 145 ++info->count_dirents_scanned; 146 147 /* 148 * Ignore brefs without data (typically dirents) 149 */ 150 if ((bref.data_off & ~HAMMER2_OFF_MASK_RADIX) == 0) 151 continue; 152 153 /* 154 * Process bref, chain is only non-NULL if the bref 155 * might be recursable (its possible that we sometimes get 156 * a non-NULL chain where the bref cannot be recursed). 157 */ 158 ++info->pri; 159 if (h2_bulkfree_test(info, &bref, 1)) 160 continue; 161 162 if (bref.type == HAMMER2_BREF_TYPE_INODE) 163 ++info->count_inodes_scanned; 164 165 error |= func(info, &bref); 166 if (error) 167 break; 168 169 /* 170 * A non-null chain is always returned if it is 171 * recursive, otherwise a non-null chain might be 172 * returned but usually is not when not recursive. 
173 */ 174 if (chain == NULL) 175 continue; 176 177 if (chain) { 178 info->count_bytes_scanned += chain->bytes; 179 ++info->count_chains_scanned; 180 181 if (info->count_chains_scanned >= 182 info->count_chains_reported + 1000000 || 183 (info->count_chains_scanned < 1000000 && 184 info->count_chains_scanned >= 185 info->count_chains_reported + 100000)) { 186 kprintf(" chains %-7ld inodes %-7ld " 187 "dirents %-7ld bytes %5ldMB\n", 188 info->count_chains_scanned, 189 info->count_inodes_scanned, 190 info->count_dirents_scanned, 191 info->count_bytes_scanned / 1000000); 192 info->count_chains_reported = 193 info->count_chains_scanned; 194 } 195 } 196 197 198 /* 199 * Else check type and setup depth-first scan. 200 * 201 * Account for bytes actually read. 202 */ 203 switch(chain->bref.type) { 204 case HAMMER2_BREF_TYPE_INODE: 205 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 206 case HAMMER2_BREF_TYPE_INDIRECT: 207 case HAMMER2_BREF_TYPE_VOLUME: 208 case HAMMER2_BREF_TYPE_FREEMAP: 209 ++info->depth; 210 if (info->depth > 16) { 211 hammer2_chain_save_t *save; 212 save = kmalloc(sizeof(*save), M_HAMMER2, 213 M_WAITOK | M_ZERO); 214 save->chain = chain; 215 hammer2_chain_ref(chain); 216 TAILQ_INSERT_TAIL(&info->list, save, entry); 217 218 /* guess */ 219 info->pri += 10; 220 } else { 221 int savepri = info->pri; 222 223 hammer2_chain_unlock(chain); 224 hammer2_chain_unlock(parent); 225 info->pri = 0; 226 rup_error |= 227 hammer2_bulk_scan(chain, func, info); 228 info->pri += savepri; 229 hammer2_chain_lock(parent, 230 HAMMER2_RESOLVE_ALWAYS | 231 HAMMER2_RESOLVE_SHARED); 232 hammer2_chain_lock(chain, 233 HAMMER2_RESOLVE_ALWAYS | 234 HAMMER2_RESOLVE_SHARED); 235 } 236 --info->depth; 237 break; 238 case HAMMER2_BREF_TYPE_DATA: 239 break; 240 default: 241 /* does not recurse */ 242 break; 243 } 244 if (rup_error & HAMMER2_ERROR_ABORTED) 245 break; 246 } 247 if (chain) { 248 hammer2_chain_unlock(chain); 249 hammer2_chain_drop(chain); 250 } 251 252 /* 253 * Save with higher pri now 
that we know what it is. 254 */ 255 h2_bulkfree_test(info, &parent->bref, info->pri + 1); 256 257 hammer2_chain_unlock(parent); 258 259 return ((error | rup_error) & ~HAMMER2_ERROR_EOF); 260 } 261 262 /* 263 * Bulkfree algorithm 264 * 265 * Repeat { 266 * Chain flush (partial synchronization) XXX removed 267 * Scan the whole topology - build in-memory freemap (mark 11) 268 * Reconcile the in-memory freemap against the on-disk freemap. 269 * ondisk xx -> ondisk 11 (if allocated) 270 * ondisk 11 -> ondisk 10 (if free in-memory) 271 * ondisk 10 -> ondisk 00 (if free in-memory) - on next pass 272 * } 273 * 274 * The topology scan may have to be performed multiple times to window 275 * freemaps which are too large to fit in kernel memory. 276 * 277 * Races are handled using a double-transition (11->10, 10->00). The bulkfree 278 * scan snapshots the volume root's blockset and thus can run concurrent with 279 * normal operations, as long as a full flush is made between each pass to 280 * synchronize any modified chains (otherwise their blocks might be improperly 281 * freed). 282 * 283 * Temporary memory in multiples of 64KB is required to reconstruct the leaf 284 * hammer2_bmap_data blocks so they can later be compared against the live 285 * freemap. Each 64KB block represents 128 x 16KB x 1024 = ~2 GB of storage. 286 * A 32MB save area thus represents around ~1 TB. The temporary memory 287 * allocated can be specified. If it is not sufficient multiple topology 288 * passes will be made. 
289 */ 290 291 /* 292 * Bulkfree callback info 293 */ 294 static void hammer2_bulkfree_thread(void *arg __unused); 295 static void cbinfo_bmap_init(hammer2_bulkfree_info_t *cbinfo, size_t size); 296 static int h2_bulkfree_callback(hammer2_bulkfree_info_t *cbinfo, 297 hammer2_blockref_t *bref); 298 static int h2_bulkfree_sync(hammer2_bulkfree_info_t *cbinfo); 299 static void h2_bulkfree_sync_adjust(hammer2_bulkfree_info_t *cbinfo, 300 hammer2_off_t data_off, hammer2_bmap_data_t *live, 301 hammer2_bmap_data_t *bmap, hammer2_key_t alloc_base); 302 303 void 304 hammer2_bulkfree_init(hammer2_dev_t *hmp) 305 { 306 hammer2_thr_create(&hmp->bfthr, NULL, hmp, 307 hmp->devrepname, -1, -1, 308 hammer2_bulkfree_thread); 309 } 310 311 void 312 hammer2_bulkfree_uninit(hammer2_dev_t *hmp) 313 { 314 hammer2_thr_delete(&hmp->bfthr); 315 } 316 317 static void 318 hammer2_bulkfree_thread(void *arg) 319 { 320 hammer2_thread_t *thr = arg; 321 hammer2_ioc_bulkfree_t bfi; 322 uint32_t flags; 323 324 for (;;) { 325 hammer2_thr_wait_any(thr, 326 HAMMER2_THREAD_STOP | 327 HAMMER2_THREAD_FREEZE | 328 HAMMER2_THREAD_UNFREEZE | 329 HAMMER2_THREAD_REMASTER, 330 hz * 60); 331 332 flags = thr->flags; 333 cpu_ccfence(); 334 if (flags & HAMMER2_THREAD_STOP) 335 break; 336 if (flags & HAMMER2_THREAD_FREEZE) { 337 hammer2_thr_signal2(thr, HAMMER2_THREAD_FROZEN, 338 HAMMER2_THREAD_FREEZE); 339 continue; 340 } 341 if (flags & HAMMER2_THREAD_UNFREEZE) { 342 hammer2_thr_signal2(thr, 0, 343 HAMMER2_THREAD_FROZEN | 344 HAMMER2_THREAD_UNFREEZE); 345 continue; 346 } 347 if (flags & HAMMER2_THREAD_FROZEN) 348 continue; 349 if (flags & HAMMER2_THREAD_REMASTER) { 350 hammer2_thr_signal2(thr, 0, HAMMER2_THREAD_REMASTER); 351 bzero(&bfi, sizeof(bfi)); 352 bfi.size = 8192 * 1024; 353 /* hammer2_bulkfree_pass(thr->hmp, &bfi); */ 354 } 355 } 356 thr->td = NULL; 357 hammer2_thr_signal(thr, HAMMER2_THREAD_STOPPED); 358 /* structure can go invalid at this point */ 359 } 360 361 int 362 
hammer2_bulkfree_pass(hammer2_dev_t *hmp, hammer2_chain_t *vchain, 363 hammer2_ioc_bulkfree_t *bfi) 364 { 365 hammer2_bulkfree_info_t cbinfo; 366 hammer2_chain_save_t *save; 367 hammer2_off_t incr; 368 size_t size; 369 int error; 370 371 /* 372 * We have to clear the live dedup cache as it might have entries 373 * that are freeable as of now. Any new entries in the dedup cache 374 * made after this point, even if they become freeable, will have 375 * previously been fully allocated and will be protected by the 376 * 2-stage bulkfree. 377 */ 378 hammer2_dedup_clear(hmp); 379 380 /* 381 * Setup for free pass using the buffer size specified by the 382 * hammer2 utility, 32K-aligned. 383 */ 384 bzero(&cbinfo, sizeof(cbinfo)); 385 size = (bfi->size + HAMMER2_FREEMAP_LEVELN_PSIZE - 1) & 386 ~(size_t)(HAMMER2_FREEMAP_LEVELN_PSIZE - 1); 387 388 /* 389 * Cap at 1/4 physical memory (hammer2 utility will not normally 390 * ever specify a buffer this big, but leave the option available). 391 */ 392 if (size > kmem_lim_size() * 1024 * 1024 / 4) { 393 size = kmem_lim_size() * 1024 * 1024 / 4; 394 kprintf("hammer2: Warning: capping bulkfree buffer at %jdM\n", 395 (intmax_t)size / (1024 * 1024)); 396 } 397 398 #define HAMMER2_FREEMAP_SIZEDIV \ 399 (HAMMER2_FREEMAP_LEVEL1_SIZE / HAMMER2_FREEMAP_LEVELN_PSIZE) 400 #define HAMMER2_FREEMAP_SIZEMASK (HAMMER2_FREEMAP_SIZEDIV - 1) 401 402 /* 403 * Cap at the size needed to cover the whole volume to avoid 404 * making an unnecessarily large allocation. 405 */ 406 if (size > hmp->voldata.volu_size / HAMMER2_FREEMAP_SIZEDIV) { 407 size = (hmp->voldata.volu_size + HAMMER2_FREEMAP_SIZEMASK) / 408 HAMMER2_FREEMAP_SIZEDIV; 409 } 410 411 /* 412 * Minimum bitmap buffer size, then align to a LEVELN_PSIZE (32K) 413 * boundary. 
414 */ 415 if (size < 1024 * 1024) 416 size = 1024 * 1024; 417 size = (size + HAMMER2_FREEMAP_LEVELN_PSIZE - 1) & 418 ~(size_t)(HAMMER2_FREEMAP_LEVELN_PSIZE - 1); 419 420 cbinfo.hmp = hmp; 421 cbinfo.bmap = kmem_alloc_swapbacked(&cbinfo.kp, size, VM_SUBSYS_HAMMER); 422 cbinfo.saved_mirror_tid = hmp->voldata.mirror_tid; 423 424 cbinfo.dedup = kmalloc(sizeof(*cbinfo.dedup) * HAMMER2_DEDUP_HEUR_SIZE, 425 M_HAMMER2, M_WAITOK | M_ZERO); 426 427 kprintf("hammer2: bulkfree buf=%jdM\n", 428 (intmax_t)size / (1024 * 1024)); 429 430 /* 431 * Normalize start point to a 2GB boundary. We operate on a 432 * 64KB leaf bitmap boundary which represents 2GB of storage. 433 */ 434 cbinfo.sbase = bfi->sbase; 435 if (cbinfo.sbase > hmp->voldata.volu_size) 436 cbinfo.sbase = hmp->voldata.volu_size; 437 cbinfo.sbase &= ~HAMMER2_FREEMAP_LEVEL1_MASK; 438 TAILQ_INIT(&cbinfo.list); 439 440 cbinfo.bulkfree_ticks = ticks; 441 442 /* 443 * Loop on a full meta-data scan as many times as required to 444 * get through all available storage. 445 */ 446 error = 0; 447 while (cbinfo.sbase < hmp->voldata.volu_size) { 448 /* 449 * We have enough ram to represent (incr) bytes of storage. 450 * Each 64KB of ram represents 2GB of storage. 451 * 452 * We must also clean out our de-duplication heuristic for 453 * each (incr) bytes of storage, otherwise we wind up not 454 * scanning meta-data for later areas of storage because 455 * they had already been scanned in earlier areas of storage. 456 * Since the ranging is different, we have to restart 457 * the dedup heuristic too. 
458 */ 459 int allmedia; 460 461 cbinfo_bmap_init(&cbinfo, size); 462 bzero(cbinfo.dedup, sizeof(*cbinfo.dedup) * 463 HAMMER2_DEDUP_HEUR_SIZE); 464 cbinfo.count_inodes_scanned = 0; 465 cbinfo.count_dirents_scanned = 0; 466 cbinfo.count_bytes_scanned = 0; 467 cbinfo.count_chains_scanned = 0; 468 cbinfo.count_chains_reported = 0; 469 470 incr = size / HAMMER2_FREEMAP_LEVELN_PSIZE * 471 HAMMER2_FREEMAP_LEVEL1_SIZE; 472 if (hmp->voldata.volu_size - cbinfo.sbase <= incr) { 473 cbinfo.sstop = hmp->voldata.volu_size; 474 allmedia = 1; 475 } else { 476 cbinfo.sstop = cbinfo.sbase + incr; 477 allmedia = 0; 478 } 479 kprintf("hammer2: pass %016jx-%016jx ", 480 (intmax_t)cbinfo.sbase, 481 (intmax_t)cbinfo.sstop); 482 if (allmedia && cbinfo.sbase == 0) 483 kprintf("(all media)\n"); 484 else if (allmedia) 485 kprintf("(remaining media)\n"); 486 else 487 kprintf("(%jdGB of media)\n", 488 (intmax_t)incr / (1024L*1024*1024)); 489 490 /* 491 * Scan topology for stuff inside this range. 492 * 493 * NOTE - By not using a transaction the operation can 494 * run concurrent with the frontend as well as 495 * with flushes. 496 * 497 * We cannot safely set a mtid without a transaction, 498 * and in fact we don't want to set one anyway. We 499 * want the bulkfree to be passive and no interfere 500 * with crash recovery. 
501 */ 502 #undef HAMMER2_BULKFREE_TRANS /* undef - don't use transaction */ 503 #ifdef HAMMER2_BULKFREE_TRANS 504 hammer2_trans_init(hmp->spmp, 0); 505 cbinfo.mtid = hammer2_trans_sub(hmp->spmp); 506 #else 507 cbinfo.mtid = 0; 508 #endif 509 cbinfo.pri = 0; 510 error |= hammer2_bulk_scan(vchain, h2_bulkfree_callback, 511 &cbinfo); 512 513 while ((save = TAILQ_FIRST(&cbinfo.list)) != NULL && 514 error == 0) { 515 TAILQ_REMOVE(&cbinfo.list, save, entry); 516 cbinfo.pri = 0; 517 error |= hammer2_bulk_scan(save->chain, 518 h2_bulkfree_callback, 519 &cbinfo); 520 hammer2_chain_drop(save->chain); 521 kfree(save, M_HAMMER2); 522 } 523 while (save) { 524 TAILQ_REMOVE(&cbinfo.list, save, entry); 525 hammer2_chain_drop(save->chain); 526 kfree(save, M_HAMMER2); 527 save = TAILQ_FIRST(&cbinfo.list); 528 } 529 530 /* 531 * If the complete scan succeeded we can synchronize our 532 * in-memory freemap against live storage. If an abort 533 * occured we cannot safely synchronize our partially 534 * filled-out in-memory freemap. 535 */ 536 if (error) { 537 kprintf("bulkfree lastdrop %d %d error=0x%04x\n", 538 vchain->refs, vchain->core.chain_count, error); 539 } else { 540 kprintf("bulkfree lastdrop %d %d\n", 541 vchain->refs, vchain->core.chain_count); 542 543 error = h2_bulkfree_sync(&cbinfo); 544 545 hammer2_voldata_lock(hmp); 546 hammer2_voldata_modify(hmp); 547 hmp->voldata.allocator_free += cbinfo.adj_free; 548 hammer2_voldata_unlock(hmp); 549 } 550 551 /* 552 * Cleanup for next loop. 
553 */ 554 #ifdef HAMMER2_BULKFREE_TRANS 555 hammer2_trans_done(hmp->spmp, 0); 556 #endif 557 if (error) 558 break; 559 cbinfo.sbase = cbinfo.sstop; 560 cbinfo.adj_free = 0; 561 } 562 kmem_free_swapbacked(&cbinfo.kp); 563 kfree(cbinfo.dedup, M_HAMMER2); 564 cbinfo.dedup = NULL; 565 566 bfi->sstop = cbinfo.sbase; 567 568 incr = bfi->sstop / (hmp->voldata.volu_size / 10000); 569 if (incr > 10000) 570 incr = 10000; 571 572 kprintf("bulkfree pass statistics (%d.%02d%% storage processed):\n", 573 (int)incr / 100, 574 (int)incr % 100); 575 576 if (error) { 577 kprintf(" bulkfree was aborted\n"); 578 } else { 579 kprintf(" transition->free %ld\n", cbinfo.count_10_00); 580 kprintf(" transition->staged %ld\n", cbinfo.count_11_10); 581 kprintf(" ERR(00)->allocated %ld\n", cbinfo.count_00_11); 582 kprintf(" ERR(01)->allocated %ld\n", cbinfo.count_01_11); 583 kprintf(" staged->allocated %ld\n", cbinfo.count_10_11); 584 kprintf(" ~2MB segs cleaned %ld\n", cbinfo.count_l0cleans); 585 kprintf(" linear adjusts %ld\n", 586 cbinfo.count_linadjusts); 587 kprintf(" dedup factor %ld\n", 588 cbinfo.count_dedup_factor); 589 } 590 591 return error; 592 } 593 594 static void 595 cbinfo_bmap_init(hammer2_bulkfree_info_t *cbinfo, size_t size) 596 { 597 hammer2_bmap_data_t *bmap = cbinfo->bmap; 598 hammer2_key_t key = cbinfo->sbase; 599 hammer2_key_t lokey; 600 hammer2_key_t hikey; 601 602 lokey = (cbinfo->hmp->voldata.allocator_beg + HAMMER2_SEGMASK64) & 603 ~HAMMER2_SEGMASK64; 604 hikey = cbinfo->hmp->voldata.volu_size & ~HAMMER2_SEGMASK64; 605 606 bzero(bmap, size); 607 while (size) { 608 bzero(bmap, sizeof(*bmap)); 609 if (lokey < H2FMBASE(key, HAMMER2_FREEMAP_LEVEL1_RADIX)) 610 lokey = H2FMBASE(key, HAMMER2_FREEMAP_LEVEL1_RADIX); 611 if (lokey < H2FMZONEBASE(key) + HAMMER2_ZONE_SEG64) 612 lokey = H2FMZONEBASE(key) + HAMMER2_ZONE_SEG64; 613 if (key < lokey || key >= hikey) { 614 memset(bmap->bitmapq, -1, 615 sizeof(bmap->bitmapq)); 616 bmap->avail = 0; 617 bmap->linear = HAMMER2_SEGSIZE; 
618 } else { 619 bmap->avail = HAMMER2_FREEMAP_LEVEL0_SIZE; 620 } 621 size -= sizeof(*bmap); 622 key += HAMMER2_FREEMAP_LEVEL0_SIZE; 623 ++bmap; 624 } 625 } 626 627 static int 628 h2_bulkfree_callback(hammer2_bulkfree_info_t *cbinfo, hammer2_blockref_t *bref) 629 { 630 hammer2_bmap_data_t *bmap; 631 hammer2_off_t data_off; 632 uint16_t class; 633 size_t bytes; 634 int radix; 635 636 /* 637 * Check for signal and allow yield to userland during scan. 638 */ 639 if (hammer2_signal_check(&cbinfo->save_time)) 640 return HAMMER2_ERROR_ABORTED; 641 642 /* 643 * Deal with kernel thread cpu or I/O hogging by limiting the 644 * number of chains scanned per second to hammer2_bulkfree_tps. 645 * Ignore leaf records (DIRENT and DATA), no per-record I/O is 646 * involved for those since we don't load their data. 647 */ 648 if (bref->type != HAMMER2_BREF_TYPE_DATA && 649 bref->type != HAMMER2_BREF_TYPE_DIRENT) { 650 ++cbinfo->bulkfree_calls; 651 if (cbinfo->bulkfree_calls > hammer2_bulkfree_tps) { 652 int dticks = ticks - cbinfo->bulkfree_ticks; 653 if (dticks < 0) 654 dticks = 0; 655 if (dticks < hz) { 656 tsleep(&cbinfo->bulkfree_ticks, 0, 657 "h2bw", hz - dticks); 658 } 659 cbinfo->bulkfree_calls = 0; 660 cbinfo->bulkfree_ticks = ticks; 661 } 662 } 663 664 /* 665 * Calculate the data offset and determine if it is within 666 * the current freemap range being gathered. 667 */ 668 data_off = bref->data_off & ~HAMMER2_OFF_MASK_RADIX; 669 if (data_off < cbinfo->sbase || data_off >= cbinfo->sstop) 670 return 0; 671 if (data_off < cbinfo->hmp->voldata.allocator_beg) 672 return 0; 673 if (data_off >= cbinfo->hmp->voldata.volu_size) 674 return 0; 675 676 /* 677 * Calculate the information needed to generate the in-memory 678 * freemap record. 679 * 680 * Hammer2 does not allow allocations to cross the L1 (2GB) boundary, 681 * it's a problem if it does. (Or L0 (2MB) for that matter). 
682 */ 683 radix = (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX); 684 KKASSERT(radix != 0); 685 bytes = (size_t)1 << radix; 686 class = (bref->type << 8) | hammer2_devblkradix(radix); 687 688 if (data_off + bytes > cbinfo->sstop) { 689 kprintf("hammer2_bulkfree_scan: illegal 2GB boundary " 690 "%016jx %016jx/%d\n", 691 (intmax_t)bref->data_off, 692 (intmax_t)bref->key, 693 bref->keybits); 694 bytes = cbinfo->sstop - data_off; /* XXX */ 695 } 696 697 /* 698 * Convert to a storage offset relative to the beginning of the 699 * storage range we are collecting. Then lookup the level0 bmap entry. 700 */ 701 data_off -= cbinfo->sbase; 702 bmap = cbinfo->bmap + (data_off >> HAMMER2_FREEMAP_LEVEL0_RADIX); 703 704 /* 705 * Convert data_off to a bmap-relative value (~4MB storage range). 706 * Adjust linear, class, and avail. 707 * 708 * Hammer2 does not allow allocations to cross the L0 (4MB) boundary, 709 */ 710 data_off &= HAMMER2_FREEMAP_LEVEL0_MASK; 711 if (data_off + bytes > HAMMER2_FREEMAP_LEVEL0_SIZE) { 712 kprintf("hammer2_bulkfree_scan: illegal 4MB boundary " 713 "%016jx %016jx/%d\n", 714 (intmax_t)bref->data_off, 715 (intmax_t)bref->key, 716 bref->keybits); 717 bytes = HAMMER2_FREEMAP_LEVEL0_SIZE - data_off; 718 } 719 720 if (bmap->class == 0) { 721 bmap->class = class; 722 bmap->avail = HAMMER2_FREEMAP_LEVEL0_SIZE; 723 } 724 725 /* 726 * NOTE: bmap->class does not have to match class. Classification 727 * is relaxed when free space is low, so some mixing can occur. 728 */ 729 #if 0 730 /* 731 * XXX removed 732 */ 733 if (bmap->class != class) { 734 kprintf("hammer2_bulkfree_scan: illegal mixed class " 735 "%016jx %016jx/%d (%04x vs %04x)\n", 736 (intmax_t)bref->data_off, 737 (intmax_t)bref->key, 738 bref->keybits, 739 class, bmap->class); 740 } 741 #endif 742 743 /* 744 * Just record the highest byte-granular offset for now. Do not 745 * match against allocations which are in multiples of whole blocks. 
746 * 747 * Make sure that any in-block linear offset at least covers the 748 * data range. This can cause bmap->linear to become block-aligned. 749 */ 750 if (bytes & HAMMER2_FREEMAP_BLOCK_MASK) { 751 if (bmap->linear < (int32_t)data_off + (int32_t)bytes) 752 bmap->linear = (int32_t)data_off + (int32_t)bytes; 753 } else if (bmap->linear >= (int32_t)data_off && 754 bmap->linear < (int32_t)data_off + (int32_t)bytes) { 755 bmap->linear = (int32_t)data_off + (int32_t)bytes; 756 } 757 758 /* 759 * Adjust the hammer2_bitmap_t bitmap[HAMMER2_BMAP_ELEMENTS]. 760 * 64-bit entries, 2 bits per entry, to code 11. 761 * 762 * NOTE: data_off mask to 524288, shift right by 14 (radix for 16384), 763 * and multiply shift amount by 2 for sets of 2 bits. 764 * 765 * NOTE: The allocation can be smaller than HAMMER2_FREEMAP_BLOCK_SIZE. 766 * also, data_off may not be FREEMAP_BLOCK_SIZE aligned. 767 */ 768 while (bytes > 0) { 769 hammer2_bitmap_t bmask; 770 int bindex; 771 772 bindex = (int)data_off >> (HAMMER2_FREEMAP_BLOCK_RADIX + 773 HAMMER2_BMAP_INDEX_RADIX); 774 bmask = (hammer2_bitmap_t)3 << 775 ((((int)data_off & HAMMER2_BMAP_INDEX_MASK) >> 776 HAMMER2_FREEMAP_BLOCK_RADIX) << 1); 777 778 /* 779 * NOTE! The (avail) calculation is bitmap-granular. Multiple 780 * sub-granular records can wind up at the same bitmap 781 * position. 782 */ 783 if ((bmap->bitmapq[bindex] & bmask) == 0) { 784 if (bytes < HAMMER2_FREEMAP_BLOCK_SIZE) { 785 bmap->avail -= HAMMER2_FREEMAP_BLOCK_SIZE; 786 } else { 787 bmap->avail -= bytes; 788 } 789 bmap->bitmapq[bindex] |= bmask; 790 } 791 data_off += HAMMER2_FREEMAP_BLOCK_SIZE; 792 if (bytes < HAMMER2_FREEMAP_BLOCK_SIZE) 793 bytes = 0; 794 else 795 bytes -= HAMMER2_FREEMAP_BLOCK_SIZE; 796 } 797 return 0; 798 } 799 800 /* 801 * Synchronize the in-memory bitmap with the live freemap. This is not a 802 * direct copy. 
Instead the bitmaps must be compared: 803 * 804 * In-memory Live-freemap 805 * 00 11 -> 10 (do nothing if live modified) 806 * 10 -> 00 (do nothing if live modified) 807 * 11 10 -> 11 handles race against live 808 * ** -> 11 nominally warn of corruption 809 * 810 * We must also fixup the hints in HAMMER2_BREF_TYPE_FREEMAP_LEAF. 811 */ 812 static int 813 h2_bulkfree_sync(hammer2_bulkfree_info_t *cbinfo) 814 { 815 hammer2_off_t data_off; 816 hammer2_key_t key; 817 hammer2_key_t key_dummy; 818 hammer2_bmap_data_t *bmap; 819 hammer2_bmap_data_t *live; 820 hammer2_chain_t *live_parent; 821 hammer2_chain_t *live_chain; 822 int bmapindex; 823 int error; 824 825 kprintf("hammer2_bulkfree - range "); 826 827 if (cbinfo->sbase < cbinfo->hmp->voldata.allocator_beg) 828 kprintf("%016jx-", 829 (intmax_t)cbinfo->hmp->voldata.allocator_beg); 830 else 831 kprintf("%016jx-", 832 (intmax_t)cbinfo->sbase); 833 834 if (cbinfo->sstop > cbinfo->hmp->voldata.volu_size) 835 kprintf("%016jx\n", 836 (intmax_t)cbinfo->hmp->voldata.volu_size); 837 else 838 kprintf("%016jx\n", 839 (intmax_t)cbinfo->sstop); 840 841 data_off = cbinfo->sbase; 842 bmap = cbinfo->bmap; 843 844 live_parent = &cbinfo->hmp->fchain; 845 hammer2_chain_ref(live_parent); 846 hammer2_chain_lock(live_parent, HAMMER2_RESOLVE_ALWAYS); 847 live_chain = NULL; 848 error = 0; 849 850 /* 851 * Iterate each hammer2_bmap_data_t line (128 bytes) managing 852 * 4MB of storage. 853 */ 854 while (data_off < cbinfo->sstop) { 855 /* 856 * The freemap is not used below allocator_beg or beyond 857 * volu_size. 
858 */ 859 860 if (data_off < cbinfo->hmp->voldata.allocator_beg) 861 goto next; 862 if (data_off >= cbinfo->hmp->voldata.volu_size) 863 goto next; 864 865 /* 866 * Locate the freemap leaf on the live filesystem 867 */ 868 key = (data_off & ~HAMMER2_FREEMAP_LEVEL1_MASK); 869 870 if (live_chain == NULL || live_chain->bref.key != key) { 871 if (live_chain) { 872 hammer2_chain_unlock(live_chain); 873 hammer2_chain_drop(live_chain); 874 } 875 live_chain = hammer2_chain_lookup( 876 &live_parent, 877 &key_dummy, 878 key, 879 key + HAMMER2_FREEMAP_LEVEL1_MASK, 880 &error, 881 HAMMER2_LOOKUP_ALWAYS); 882 if (error) { 883 kprintf("hammer2_bulkfree: freemap lookup " 884 "error near %016jx, error %s\n", 885 (intmax_t)data_off, 886 hammer2_error_str(live_chain->error)); 887 break; 888 } 889 } 890 if (live_chain == NULL) { 891 /* 892 * XXX if we implement a full recovery mode we need 893 * to create/recreate missing freemap chains if our 894 * bmap has any allocated blocks. 895 */ 896 if (bmap->class && 897 bmap->avail != HAMMER2_FREEMAP_LEVEL0_SIZE) { 898 kprintf("hammer2_bulkfree: cannot locate " 899 "live leaf for allocated data " 900 "near %016jx\n", 901 (intmax_t)data_off); 902 } 903 goto next; 904 } 905 if (live_chain->error) { 906 kprintf("hammer2_bulkfree: unable to access freemap " 907 "near %016jx, error %s\n", 908 (intmax_t)data_off, 909 hammer2_error_str(live_chain->error)); 910 hammer2_chain_unlock(live_chain); 911 hammer2_chain_drop(live_chain); 912 live_chain = NULL; 913 goto next; 914 } 915 916 bmapindex = (data_off & HAMMER2_FREEMAP_LEVEL1_MASK) >> 917 HAMMER2_FREEMAP_LEVEL0_RADIX; 918 live = &live_chain->data->bmdata[bmapindex]; 919 920 /* 921 * Shortcut if the bitmaps match and the live linear 922 * indicator is sane. We can't do a perfect check of 923 * live->linear because the only real requirement is that 924 * if it is not block-aligned, that it not cover the space 925 * within its current block which overlaps one of the data 926 * ranges we scan. 
We don't retain enough fine-grained 927 * data in our scan to be able to set it exactly. 928 * 929 * TODO - we could shortcut this by testing that both 930 * live->class and bmap->class are 0, and both avails are 931 * set to HAMMER2_FREEMAP_LEVEL0_SIZE (4MB). 932 */ 933 if (bcmp(live->bitmapq, bmap->bitmapq, 934 sizeof(bmap->bitmapq)) == 0 && 935 live->linear >= bmap->linear) { 936 goto next; 937 } 938 if (hammer2_debug & 1) { 939 kprintf("live %016jx %04d.%04x (avail=%d)\n", 940 data_off, bmapindex, live->class, live->avail); 941 } 942 943 hammer2_chain_modify(live_chain, cbinfo->mtid, 0, 0); 944 live_chain->bref.check.freemap.bigmask = -1; 945 cbinfo->hmp->freemap_relaxed = 0; /* reset heuristic */ 946 live = &live_chain->data->bmdata[bmapindex]; 947 948 h2_bulkfree_sync_adjust(cbinfo, data_off, live, bmap, 949 live_chain->bref.key + 950 bmapindex * 951 HAMMER2_FREEMAP_LEVEL0_SIZE); 952 next: 953 data_off += HAMMER2_FREEMAP_LEVEL0_SIZE; 954 ++bmap; 955 } 956 if (live_chain) { 957 hammer2_chain_unlock(live_chain); 958 hammer2_chain_drop(live_chain); 959 } 960 if (live_parent) { 961 hammer2_chain_unlock(live_parent); 962 hammer2_chain_drop(live_parent); 963 } 964 return error; 965 } 966 967 /* 968 * Merge the bulkfree bitmap against the existing bitmap. 
969 */ 970 static 971 void 972 h2_bulkfree_sync_adjust(hammer2_bulkfree_info_t *cbinfo, 973 hammer2_off_t data_off, hammer2_bmap_data_t *live, 974 hammer2_bmap_data_t *bmap, hammer2_key_t alloc_base) 975 { 976 int bindex; 977 int scount; 978 hammer2_off_t tmp_off; 979 hammer2_bitmap_t lmask; 980 hammer2_bitmap_t mmask; 981 982 tmp_off = data_off; 983 984 for (bindex = 0; bindex < HAMMER2_BMAP_ELEMENTS; ++bindex) { 985 lmask = live->bitmapq[bindex]; /* live */ 986 mmask = bmap->bitmapq[bindex]; /* snapshotted bulkfree */ 987 if (lmask == mmask) { 988 tmp_off += HAMMER2_BMAP_INDEX_SIZE; 989 continue; 990 } 991 992 for (scount = 0; 993 scount < HAMMER2_BMAP_BITS_PER_ELEMENT; 994 scount += 2) { 995 if ((mmask & 3) == 0) { 996 /* 997 * in-memory 00 live 11 -> 10 998 * live 10 -> 00 999 * 1000 * Storage might be marked allocated or 1001 * staged and must be remarked staged or 1002 * free. 1003 */ 1004 switch (lmask & 3) { 1005 case 0: /* 00 */ 1006 break; 1007 case 1: /* 01 */ 1008 kprintf("hammer2_bulkfree: cannot " 1009 "transition m=00/l=01\n"); 1010 break; 1011 case 2: /* 10 -> 00 */ 1012 live->bitmapq[bindex] &= 1013 ~((hammer2_bitmap_t)2 << scount); 1014 live->avail += 1015 HAMMER2_FREEMAP_BLOCK_SIZE; 1016 if (live->avail > 1017 HAMMER2_FREEMAP_LEVEL0_SIZE) { 1018 live->avail = 1019 HAMMER2_FREEMAP_LEVEL0_SIZE; 1020 } 1021 cbinfo->adj_free += 1022 HAMMER2_FREEMAP_BLOCK_SIZE; 1023 ++cbinfo->count_10_00; 1024 hammer2_io_dedup_assert( 1025 cbinfo->hmp, 1026 tmp_off | 1027 HAMMER2_FREEMAP_BLOCK_RADIX, 1028 HAMMER2_FREEMAP_BLOCK_SIZE); 1029 break; 1030 case 3: /* 11 -> 10 */ 1031 live->bitmapq[bindex] &= 1032 ~((hammer2_bitmap_t)1 << scount); 1033 ++cbinfo->count_11_10; 1034 hammer2_io_dedup_delete( 1035 cbinfo->hmp, 1036 HAMMER2_BREF_TYPE_DATA, 1037 tmp_off | 1038 HAMMER2_FREEMAP_BLOCK_RADIX, 1039 HAMMER2_FREEMAP_BLOCK_SIZE); 1040 break; 1041 } 1042 } else if ((mmask & 3) == 3) { 1043 /* 1044 * in-memory 11 live 10 -> 11 1045 * live ** -> 11 1046 * 1047 * Storage might 
be incorrectly marked free 1048 * or staged and must be remarked fully 1049 * allocated. 1050 */ 1051 switch (lmask & 3) { 1052 case 0: /* 00 */ 1053 ++cbinfo->count_00_11; 1054 cbinfo->adj_free -= 1055 HAMMER2_FREEMAP_BLOCK_SIZE; 1056 live->avail -= 1057 HAMMER2_FREEMAP_BLOCK_SIZE; 1058 if ((int32_t)live->avail < 0) 1059 live->avail = 0; 1060 break; 1061 case 1: /* 01 */ 1062 ++cbinfo->count_01_11; 1063 break; 1064 case 2: /* 10 -> 11 */ 1065 ++cbinfo->count_10_11; 1066 break; 1067 case 3: /* 11 */ 1068 break; 1069 } 1070 live->bitmapq[bindex] |= 1071 ((hammer2_bitmap_t)3 << scount); 1072 } 1073 mmask >>= 2; 1074 lmask >>= 2; 1075 tmp_off += HAMMER2_FREEMAP_BLOCK_SIZE; 1076 } 1077 } 1078 1079 /* 1080 * Determine if the live bitmap is completely free and reset its 1081 * fields if so. Otherwise check to see if we can reduce the linear 1082 * offset. 1083 */ 1084 for (bindex = HAMMER2_BMAP_ELEMENTS - 1; bindex >= 0; --bindex) { 1085 if (live->bitmapq[bindex] != 0) 1086 break; 1087 } 1088 if (bindex < 0) { 1089 /* 1090 * Completely empty, reset entire segment 1091 */ 1092 #if 0 1093 kprintf("hammer2: cleanseg %016jx.%04x (%d)\n", 1094 alloc_base, live->class, live->avail); 1095 #endif 1096 live->avail = HAMMER2_FREEMAP_LEVEL0_SIZE; 1097 live->class = 0; 1098 live->linear = 0; 1099 ++cbinfo->count_l0cleans; 1100 } else if (bindex < 7) { 1101 /* 1102 * Partially full, bitmapq[bindex] != 0. Our bulkfree pass 1103 * does not record enough information to set live->linear 1104 * exactly. 1105 * 1106 * NOTE: Setting live->linear to a sub-block (16K) boundary 1107 * forces the live code to iterate to the next fully 1108 * free block. It does NOT mean that all blocks above 1109 * live->linear are available. 1110 * 1111 * Setting live->linear to a fragmentary (less than 1112 * 16K) boundary allows allocations to iterate within 1113 * that sub-block. 
1114 */ 1115 if (live->linear < bmap->linear && 1116 ((live->linear ^ bmap->linear) & 1117 ~HAMMER2_FREEMAP_BLOCK_MASK) == 0) { 1118 /* 1119 * If greater than but still within the same 1120 * sub-block as live we can adjust linear upward. 1121 */ 1122 live->linear = bmap->linear; 1123 ++cbinfo->count_linadjusts; 1124 } else { 1125 /* 1126 * Otherwise adjust to the nearest higher or same 1127 * sub-block boundary. The live system may have 1128 * bounced live->linear around so we cannot make any 1129 * assumptions with regards to available fragmentary 1130 * allocations. 1131 */ 1132 live->linear = 1133 (bmap->linear + HAMMER2_FREEMAP_BLOCK_MASK) & 1134 ~HAMMER2_FREEMAP_BLOCK_MASK; 1135 ++cbinfo->count_linadjusts; 1136 } 1137 } else { 1138 /* 1139 * Completely full, effectively disable the linear iterator 1140 */ 1141 live->linear = HAMMER2_SEGSIZE; 1142 } 1143 1144 #if 0 1145 if (bmap->class) { 1146 kprintf("%016jx %04d.%04x (avail=%7d) " 1147 "%08x %08x %08x %08x %08x %08x %08x %08x\n", 1148 (intmax_t)data_off, 1149 (int)((data_off & 1150 HAMMER2_FREEMAP_LEVEL1_MASK) >> 1151 HAMMER2_FREEMAP_LEVEL0_RADIX), 1152 bmap->class, 1153 bmap->avail, 1154 bmap->bitmap[0], bmap->bitmap[1], 1155 bmap->bitmap[2], bmap->bitmap[3], 1156 bmap->bitmap[4], bmap->bitmap[5], 1157 bmap->bitmap[6], bmap->bitmap[7]); 1158 } 1159 #endif 1160 } 1161 1162 /* 1163 * BULKFREE DEDUP HEURISTIC 1164 * 1165 * WARNING! This code is SMP safe but the heuristic allows SMP collisions. 1166 * All fields must be loaded into locals and validated. 
1167 */ 1168 static 1169 int 1170 h2_bulkfree_test(hammer2_bulkfree_info_t *cbinfo, hammer2_blockref_t *bref, 1171 int pri) 1172 { 1173 hammer2_dedup_t *dedup; 1174 int best; 1175 int n; 1176 int i; 1177 1178 n = hammer2_icrc32(&bref->data_off, sizeof(bref->data_off)); 1179 dedup = cbinfo->dedup + (n & (HAMMER2_DEDUP_HEUR_MASK & ~7)); 1180 1181 for (i = best = 0; i < 8; ++i) { 1182 if (dedup[i].data_off == bref->data_off) { 1183 if (dedup[i].ticks < pri) 1184 dedup[i].ticks = pri; 1185 if (pri == 1) 1186 cbinfo->count_dedup_factor += dedup[i].ticks; 1187 return 1; 1188 } 1189 if (dedup[i].ticks < dedup[best].ticks) 1190 best = i; 1191 } 1192 dedup[best].data_off = bref->data_off; 1193 dedup[best].ticks = pri; 1194 1195 return 0; 1196 } 1197