1 /* 2 * Copyright (c) 2013-2018 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@dragonflybsd.org> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 #include <sys/param.h> 35 #include <sys/systm.h> 36 #include <sys/kernel.h> 37 #include <sys/fcntl.h> 38 #include <sys/buf.h> 39 #include <sys/proc.h> 40 #include <sys/namei.h> 41 #include <sys/mount.h> 42 #include <sys/vnode.h> 43 #include <sys/mountctl.h> 44 #include <vm/vm_kern.h> 45 #include <vm/vm_extern.h> 46 47 #include "hammer2.h" 48 49 /* 50 * XXX I made a mistake and made the reserved area begin at each LEVEL1 zone, 51 * which is on a 1GB demark. This will eat a little more space but for 52 * now we retain compatibility and make FMZONEBASE every 1GB 53 */ 54 #define H2FMZONEBASE(key) ((key) & ~HAMMER2_FREEMAP_LEVEL1_MASK) 55 #define H2FMBASE(key, radix) ((key) & ~(((hammer2_off_t)1 << (radix)) - 1)) 56 #define H2FMSHIFT(radix) ((hammer2_off_t)1 << (radix)) 57 58 /* 59 * breadth-first search 60 */ 61 typedef struct hammer2_chain_save { 62 TAILQ_ENTRY(hammer2_chain_save) entry; 63 hammer2_chain_t *chain; 64 int pri; 65 } hammer2_chain_save_t; 66 67 TAILQ_HEAD(hammer2_chain_save_list, hammer2_chain_save); 68 typedef struct hammer2_chain_save_list hammer2_chain_save_list_t; 69 70 typedef struct hammer2_bulkfree_info { 71 hammer2_dev_t *hmp; 72 kmem_anon_desc_t kp; 73 hammer2_off_t sbase; /* sub-loop iteration */ 74 hammer2_off_t sstop; 75 hammer2_bmap_data_t *bmap; 76 int depth; 77 long count_10_00; /* staged->free */ 78 long count_11_10; /* allocated->staged */ 79 long count_00_11; /* (should not happen) */ 80 long count_01_11; /* (should not happen) */ 81 long count_10_11; /* staged->allocated */ 82 long count_l0cleans; 83 long count_linadjusts; 84 long count_inodes_scanned; 85 long count_dirents_scanned; 86 long count_dedup_factor; 87 long count_bytes_scanned; 88 long count_chains_scanned; 89 long count_chains_reported; 90 long bulkfree_calls; 91 int bulkfree_ticks; 92 hammer2_off_t adj_free; 93 hammer2_tid_t mtid; 94 hammer2_tid_t saved_mirror_tid; 95 time_t save_time; 96 hammer2_chain_save_list_t list; 97 hammer2_dedup_t *dedup; 98 int pri; 99 } hammer2_bulkfree_info_t; 100 101 static int h2_bulkfree_test(hammer2_bulkfree_info_t *info, 102 hammer2_blockref_t *bref, int pri); 103 104 /* 105 * General bulk scan function with callback. Called with a referenced 106 * but UNLOCKED parent. The parent is returned in the same state. 107 */ 108 static 109 int 110 hammer2_bulk_scan(hammer2_chain_t *parent, 111 int (*func)(hammer2_bulkfree_info_t *info, 112 hammer2_blockref_t *bref), 113 hammer2_bulkfree_info_t *info) 114 { 115 hammer2_blockref_t bref; 116 hammer2_chain_t *chain; 117 int first = 1; 118 int rup_error; 119 int error; 120 121 ++info->pri; 122 123 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS | 124 HAMMER2_RESOLVE_SHARED); 125 chain = NULL; 126 rup_error = 0; 127 error = 0; 128 129 /* 130 * Generally loop on the contents if we have not been flagged 131 * for abort. 132 * 133 * Remember that these chains are completely isolated from 134 * the frontend, so we can release locks temporarily without 135 * imploding. 136 */ 137 for (;;) { 138 error |= hammer2_chain_scan(parent, &chain, &bref, &first, 139 HAMMER2_LOOKUP_NODATA | 140 HAMMER2_LOOKUP_SHARED); 141 142 /* 143 * Handle EOF or other error at current level. This stops 144 * the bulkfree scan. 145 */ 146 if (error) 147 break; 148 149 /* 150 * Account for dirents before thre data_off test, since most 151 * dirents do not need a data reference. 152 */ 153 if (bref.type == HAMMER2_BREF_TYPE_DIRENT) 154 ++info->count_dirents_scanned; 155 156 /* 157 * Ignore brefs without data (typically dirents) 158 */ 159 if ((bref.data_off & ~HAMMER2_OFF_MASK_RADIX) == 0) 160 continue; 161 162 /* 163 * Process bref, chain is only non-NULL if the bref 164 * might be recursable (its possible that we sometimes get 165 * a non-NULL chain where the bref cannot be recursed). 166 */ 167 ++info->pri; 168 if (h2_bulkfree_test(info, &bref, 1)) 169 continue; 170 171 if (bref.type == HAMMER2_BREF_TYPE_INODE) 172 ++info->count_inodes_scanned; 173 174 error |= func(info, &bref); 175 if (error) 176 break; 177 178 /* 179 * A non-null chain is always returned if it is 180 * recursive, otherwise a non-null chain might be 181 * returned but usually is not when not recursive. 182 */ 183 if (chain == NULL) 184 continue; 185 186 if (chain) { 187 info->count_bytes_scanned += chain->bytes; 188 ++info->count_chains_scanned; 189 190 if (info->count_chains_scanned >= 191 info->count_chains_reported + 1000000 || 192 (info->count_chains_scanned < 1000000 && 193 info->count_chains_scanned >= 194 info->count_chains_reported + 100000)) { 195 kprintf(" chains %-7ld inodes %-7ld " 196 "dirents %-7ld bytes %5ldMB\n", 197 info->count_chains_scanned, 198 info->count_inodes_scanned, 199 info->count_dirents_scanned, 200 info->count_bytes_scanned / 1000000); 201 info->count_chains_reported = 202 info->count_chains_scanned; 203 } 204 } 205 206 207 /* 208 * Else check type and setup depth-first scan. 209 * 210 * Account for bytes actually read. 211 */ 212 switch(chain->bref.type) { 213 case HAMMER2_BREF_TYPE_INODE: 214 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 215 case HAMMER2_BREF_TYPE_INDIRECT: 216 case HAMMER2_BREF_TYPE_VOLUME: 217 case HAMMER2_BREF_TYPE_FREEMAP: 218 ++info->depth; 219 if (info->depth > 16) { 220 hammer2_chain_save_t *save; 221 save = kmalloc(sizeof(*save), M_HAMMER2, 222 M_WAITOK | M_ZERO); 223 save->chain = chain; 224 hammer2_chain_ref(chain); 225 TAILQ_INSERT_TAIL(&info->list, save, entry); 226 227 /* guess */ 228 info->pri += 10; 229 } else { 230 int savepri = info->pri; 231 232 hammer2_chain_unlock(chain); 233 hammer2_chain_unlock(parent); 234 info->pri = 0; 235 rup_error |= 236 hammer2_bulk_scan(chain, func, info); 237 info->pri += savepri; 238 hammer2_chain_lock(parent, 239 HAMMER2_RESOLVE_ALWAYS | 240 HAMMER2_RESOLVE_SHARED); 241 hammer2_chain_lock(chain, 242 HAMMER2_RESOLVE_ALWAYS | 243 HAMMER2_RESOLVE_SHARED); 244 } 245 --info->depth; 246 break; 247 case HAMMER2_BREF_TYPE_DATA: 248 break; 249 default: 250 /* does not recurse */ 251 break; 252 } 253 if (rup_error & HAMMER2_ERROR_ABORTED) 254 break; 255 } 256 if (chain) { 257 hammer2_chain_unlock(chain); 258 hammer2_chain_drop(chain); 259 } 260 261 /* 262 * Save with higher pri now that we know what it is. 263 */ 264 h2_bulkfree_test(info, &parent->bref, info->pri + 1); 265 266 hammer2_chain_unlock(parent); 267 268 return ((error | rup_error) & ~HAMMER2_ERROR_EOF); 269 } 270 271 /* 272 * Bulkfree algorithm 273 * 274 * Repeat { 275 * Chain flush (partial synchronization) XXX removed 276 * Scan the whole topology - build in-memory freemap (mark 11) 277 * Reconcile the in-memory freemap against the on-disk freemap. 278 * ondisk xx -> ondisk 11 (if allocated) 279 * ondisk 11 -> ondisk 10 (if free in-memory) 280 * ondisk 10 -> ondisk 00 (if free in-memory) - on next pass 281 * } 282 * 283 * The topology scan may have to be performed multiple times to window 284 * freemaps which are too large to fit in kernel memory. 285 * 286 * Races are handled using a double-transition (11->10, 10->00). The bulkfree 287 * scan snapshots the volume root's blockset and thus can run concurrent with 288 * normal operations, as long as a full flush is made between each pass to 289 * synchronize any modified chains (otherwise their blocks might be improperly 290 * freed). 291 * 292 * Temporary memory in multiples of 64KB is required to reconstruct the leaf 293 * hammer2_bmap_data blocks so they can later be compared against the live 294 * freemap. Each 64KB block represents 128 x 16KB x 1024 = ~2 GB of storage. 295 * A 32MB save area thus represents around ~1 TB. The temporary memory 296 * allocated can be specified. If it is not sufficient multiple topology 297 * passes will be made. 298 */ 299 300 /* 301 * Bulkfree callback info 302 */ 303 static void hammer2_bulkfree_thread(void *arg __unused); 304 static void cbinfo_bmap_init(hammer2_bulkfree_info_t *cbinfo, size_t size); 305 static int h2_bulkfree_callback(hammer2_bulkfree_info_t *cbinfo, 306 hammer2_blockref_t *bref); 307 static int h2_bulkfree_sync(hammer2_bulkfree_info_t *cbinfo); 308 static void h2_bulkfree_sync_adjust(hammer2_bulkfree_info_t *cbinfo, 309 hammer2_off_t data_off, hammer2_bmap_data_t *live, 310 hammer2_bmap_data_t *bmap, hammer2_key_t alloc_base); 311 312 void 313 hammer2_bulkfree_init(hammer2_dev_t *hmp) 314 { 315 hammer2_thr_create(&hmp->bfthr, NULL, hmp, 316 hmp->devrepname, -1, -1, 317 hammer2_bulkfree_thread); 318 } 319 320 void 321 hammer2_bulkfree_uninit(hammer2_dev_t *hmp) 322 { 323 hammer2_thr_delete(&hmp->bfthr); 324 } 325 326 static void 327 hammer2_bulkfree_thread(void *arg) 328 { 329 hammer2_thread_t *thr = arg; 330 hammer2_ioc_bulkfree_t bfi; 331 uint32_t flags; 332 333 for (;;) { 334 hammer2_thr_wait_any(thr, 335 HAMMER2_THREAD_STOP | 336 HAMMER2_THREAD_FREEZE | 337 HAMMER2_THREAD_UNFREEZE | 338 HAMMER2_THREAD_REMASTER, 339 hz * 60); 340 341 flags = thr->flags; 342 cpu_ccfence(); 343 if (flags & HAMMER2_THREAD_STOP) 344 break; 345 if (flags & HAMMER2_THREAD_FREEZE) { 346 hammer2_thr_signal2(thr, HAMMER2_THREAD_FROZEN, 347 HAMMER2_THREAD_FREEZE); 348 continue; 349 } 350 if (flags & HAMMER2_THREAD_UNFREEZE) { 351 hammer2_thr_signal2(thr, 0, 352 HAMMER2_THREAD_FROZEN | 353 HAMMER2_THREAD_UNFREEZE); 354 continue; 355 } 356 if (flags & HAMMER2_THREAD_FROZEN) 357 continue; 358 if (flags & HAMMER2_THREAD_REMASTER) { 359 hammer2_thr_signal2(thr, 0, HAMMER2_THREAD_REMASTER); 360 bzero(&bfi, sizeof(bfi)); 361 bfi.size = 8192 * 1024; 362 /* hammer2_bulkfree_pass(thr->hmp, &bfi); */ 363 } 364 } 365 thr->td = NULL; 366 hammer2_thr_signal(thr, HAMMER2_THREAD_STOPPED); 367 /* structure can go invalid at this point */ 368 } 369 370 int 371 hammer2_bulkfree_pass(hammer2_dev_t *hmp, hammer2_chain_t *vchain, 372 hammer2_ioc_bulkfree_t *bfi) 373 { 374 hammer2_bulkfree_info_t cbinfo; 375 hammer2_chain_save_t *save; 376 hammer2_off_t incr; 377 size_t size; 378 int error; 379 380 /* 381 * We have to clear the live dedup cache as it might have entries 382 * that are freeable as of now. Any new entries in the dedup cache 383 * made after this point, even if they become freeable, will have 384 * previously been fully allocated and will be protected by the 385 * 2-stage bulkfree. 386 */ 387 hammer2_dedup_clear(hmp); 388 389 /* 390 * Setup for free pass using the buffer size specified by the 391 * hammer2 utility, 32K-aligned. 392 */ 393 bzero(&cbinfo, sizeof(cbinfo)); 394 size = (bfi->size + HAMMER2_FREEMAP_LEVELN_PSIZE - 1) & 395 ~(size_t)(HAMMER2_FREEMAP_LEVELN_PSIZE - 1); 396 397 /* 398 * Cap at 1/4 physical memory (hammer2 utility will not normally 399 * ever specify a buffer this big, but leave the option available). 400 */ 401 if (size > kmem_lim_size() * 1024 * 1024 / 4) { 402 size = kmem_lim_size() * 1024 * 1024 / 4; 403 kprintf("hammer2: Warning: capping bulkfree buffer at %jdM\n", 404 (intmax_t)size / (1024 * 1024)); 405 } 406 407 #define HAMMER2_FREEMAP_SIZEDIV \ 408 (HAMMER2_FREEMAP_LEVEL1_SIZE / HAMMER2_FREEMAP_LEVELN_PSIZE) 409 #define HAMMER2_FREEMAP_SIZEMASK (HAMMER2_FREEMAP_SIZEDIV - 1) 410 411 /* 412 * Cap at the size needed to cover the whole volume to avoid 413 * making an unnecessarily large allocation. 414 */ 415 if (size > hmp->voldata.volu_size / HAMMER2_FREEMAP_SIZEDIV) { 416 size = (hmp->voldata.volu_size + HAMMER2_FREEMAP_SIZEMASK) / 417 HAMMER2_FREEMAP_SIZEDIV; 418 } 419 420 /* 421 * Minimum bitmap buffer size, then align to a LEVELN_PSIZE (32K) 422 * boundary. 423 */ 424 if (size < 1024 * 1024) 425 size = 1024 * 1024; 426 size = (size + HAMMER2_FREEMAP_LEVELN_PSIZE - 1) & 427 ~(size_t)(HAMMER2_FREEMAP_LEVELN_PSIZE - 1); 428 429 cbinfo.hmp = hmp; 430 cbinfo.bmap = kmem_alloc_swapbacked(&cbinfo.kp, size, VM_SUBSYS_HAMMER); 431 cbinfo.saved_mirror_tid = hmp->voldata.mirror_tid; 432 433 cbinfo.dedup = kmalloc(sizeof(*cbinfo.dedup) * HAMMER2_DEDUP_HEUR_SIZE, 434 M_HAMMER2, M_WAITOK | M_ZERO); 435 436 kprintf("hammer2: bulkfree buf=%jdM\n", 437 (intmax_t)size / (1024 * 1024)); 438 439 /* 440 * Normalize start point to a 2GB boundary. We operate on a 441 * 64KB leaf bitmap boundary which represents 2GB of storage. 442 */ 443 cbinfo.sbase = bfi->sbase; 444 if (cbinfo.sbase > hmp->voldata.volu_size) 445 cbinfo.sbase = hmp->voldata.volu_size; 446 cbinfo.sbase &= ~HAMMER2_FREEMAP_LEVEL1_MASK; 447 TAILQ_INIT(&cbinfo.list); 448 449 cbinfo.bulkfree_ticks = ticks; 450 451 /* 452 * Loop on a full meta-data scan as many times as required to 453 * get through all available storage. 454 */ 455 error = 0; 456 while (cbinfo.sbase < hmp->voldata.volu_size) { 457 /* 458 * We have enough ram to represent (incr) bytes of storage. 459 * Each 64KB of ram represents 2GB of storage. 460 * 461 * We must also clean out our de-duplication heuristic for 462 * each (incr) bytes of storage, otherwise we wind up not 463 * scanning meta-data for later areas of storage because 464 * they had already been scanned in earlier areas of storage. 465 * Since the ranging is different, we have to restart 466 * the dedup heuristic too. 467 */ 468 int allmedia; 469 470 cbinfo_bmap_init(&cbinfo, size); 471 bzero(cbinfo.dedup, sizeof(*cbinfo.dedup) * 472 HAMMER2_DEDUP_HEUR_SIZE); 473 cbinfo.count_inodes_scanned = 0; 474 cbinfo.count_dirents_scanned = 0; 475 cbinfo.count_bytes_scanned = 0; 476 cbinfo.count_chains_scanned = 0; 477 cbinfo.count_chains_reported = 0; 478 479 incr = size / HAMMER2_FREEMAP_LEVELN_PSIZE * 480 HAMMER2_FREEMAP_LEVEL1_SIZE; 481 if (hmp->voldata.volu_size - cbinfo.sbase <= incr) { 482 cbinfo.sstop = hmp->voldata.volu_size; 483 allmedia = 1; 484 } else { 485 cbinfo.sstop = cbinfo.sbase + incr; 486 allmedia = 0; 487 } 488 kprintf("hammer2: pass %016jx-%016jx ", 489 (intmax_t)cbinfo.sbase, 490 (intmax_t)cbinfo.sstop); 491 if (allmedia && cbinfo.sbase == 0) 492 kprintf("(all media)\n"); 493 else if (allmedia) 494 kprintf("(remaining media)\n"); 495 else 496 kprintf("(%jdGB of media)\n", 497 (intmax_t)incr / (1024L*1024*1024)); 498 499 /* 500 * Scan topology for stuff inside this range. 501 * 502 * NOTE - By not using a transaction the operation can 503 * run concurrent with the frontend as well as 504 * with flushes. 505 * 506 * We cannot safely set a mtid without a transaction, 507 * and in fact we don't want to set one anyway. We 508 * want the bulkfree to be passive and no interfere 509 * with crash recovery. 510 */ 511 #undef HAMMER2_BULKFREE_TRANS /* undef - don't use transaction */ 512 #ifdef HAMMER2_BULKFREE_TRANS 513 hammer2_trans_init(hmp->spmp, 0); 514 cbinfo.mtid = hammer2_trans_sub(hmp->spmp); 515 #else 516 cbinfo.mtid = 0; 517 #endif 518 cbinfo.pri = 0; 519 error |= hammer2_bulk_scan(vchain, h2_bulkfree_callback, 520 &cbinfo); 521 522 while ((save = TAILQ_FIRST(&cbinfo.list)) != NULL && 523 error == 0) { 524 TAILQ_REMOVE(&cbinfo.list, save, entry); 525 cbinfo.pri = 0; 526 error |= hammer2_bulk_scan(save->chain, 527 h2_bulkfree_callback, 528 &cbinfo); 529 hammer2_chain_drop(save->chain); 530 kfree(save, M_HAMMER2); 531 } 532 while (save) { 533 TAILQ_REMOVE(&cbinfo.list, save, entry); 534 hammer2_chain_drop(save->chain); 535 kfree(save, M_HAMMER2); 536 save = TAILQ_FIRST(&cbinfo.list); 537 } 538 539 /* 540 * If the complete scan succeeded we can synchronize our 541 * in-memory freemap against live storage. If an abort 542 * occured we cannot safely synchronize our partially 543 * filled-out in-memory freemap. 544 */ 545 if (error) { 546 kprintf("bulkfree lastdrop %d %d error=0x%04x\n", 547 vchain->refs, vchain->core.chain_count, error); 548 } else { 549 kprintf("bulkfree lastdrop %d %d\n", 550 vchain->refs, vchain->core.chain_count); 551 552 error = h2_bulkfree_sync(&cbinfo); 553 554 hammer2_voldata_lock(hmp); 555 hammer2_voldata_modify(hmp); 556 hmp->voldata.allocator_free += cbinfo.adj_free; 557 hammer2_voldata_unlock(hmp); 558 } 559 560 /* 561 * Cleanup for next loop. 562 */ 563 #ifdef HAMMER2_BULKFREE_TRANS 564 hammer2_trans_done(hmp->spmp, 0); 565 #endif 566 if (error) 567 break; 568 cbinfo.sbase = cbinfo.sstop; 569 cbinfo.adj_free = 0; 570 } 571 kmem_free_swapbacked(&cbinfo.kp); 572 kfree(cbinfo.dedup, M_HAMMER2); 573 cbinfo.dedup = NULL; 574 575 bfi->sstop = cbinfo.sbase; 576 577 incr = bfi->sstop / (hmp->voldata.volu_size / 10000); 578 if (incr > 10000) 579 incr = 10000; 580 581 kprintf("bulkfree pass statistics (%d.%02d%% storage processed):\n", 582 (int)incr / 100, 583 (int)incr % 100); 584 585 if (error) { 586 kprintf(" bulkfree was aborted\n"); 587 } else { 588 kprintf(" transition->free %ld\n", cbinfo.count_10_00); 589 kprintf(" transition->staged %ld\n", cbinfo.count_11_10); 590 kprintf(" ERR(00)->allocated %ld\n", cbinfo.count_00_11); 591 kprintf(" ERR(01)->allocated %ld\n", cbinfo.count_01_11); 592 kprintf(" staged->allocated %ld\n", cbinfo.count_10_11); 593 kprintf(" ~2MB segs cleaned %ld\n", cbinfo.count_l0cleans); 594 kprintf(" linear adjusts %ld\n", 595 cbinfo.count_linadjusts); 596 kprintf(" dedup factor %ld\n", 597 cbinfo.count_dedup_factor); 598 } 599 600 return error; 601 } 602 603 static void 604 cbinfo_bmap_init(hammer2_bulkfree_info_t *cbinfo, size_t size) 605 { 606 hammer2_bmap_data_t *bmap = cbinfo->bmap; 607 hammer2_key_t key = cbinfo->sbase; 608 hammer2_key_t lokey; 609 hammer2_key_t hikey; 610 611 lokey = (cbinfo->hmp->voldata.allocator_beg + HAMMER2_SEGMASK64) & 612 ~HAMMER2_SEGMASK64; 613 hikey = cbinfo->hmp->voldata.volu_size & ~HAMMER2_SEGMASK64; 614 615 bzero(bmap, size); 616 while (size) { 617 bzero(bmap, sizeof(*bmap)); 618 if (lokey < H2FMBASE(key, HAMMER2_FREEMAP_LEVEL1_RADIX)) 619 lokey = H2FMBASE(key, HAMMER2_FREEMAP_LEVEL1_RADIX); 620 if (lokey < H2FMZONEBASE(key) + HAMMER2_ZONE_SEG64) 621 lokey = H2FMZONEBASE(key) + HAMMER2_ZONE_SEG64; 622 if (key < lokey || key >= hikey) { 623 memset(bmap->bitmapq, -1, 624 sizeof(bmap->bitmapq)); 625 bmap->avail = 0; 626 bmap->linear = HAMMER2_SEGSIZE; 627 } else { 628 bmap->avail = H2FMSHIFT(HAMMER2_FREEMAP_LEVEL0_RADIX); 629 } 630 size -= sizeof(*bmap); 631 key += HAMMER2_FREEMAP_LEVEL0_SIZE; 632 ++bmap; 633 } 634 } 635 636 static int 637 h2_bulkfree_callback(hammer2_bulkfree_info_t *cbinfo, hammer2_blockref_t *bref) 638 { 639 hammer2_bmap_data_t *bmap; 640 hammer2_off_t data_off; 641 uint16_t class; 642 size_t bytes; 643 int radix; 644 645 /* 646 * Check for signal and allow yield to userland during scan. 647 */ 648 if (hammer2_signal_check(&cbinfo->save_time)) 649 return HAMMER2_ERROR_ABORTED; 650 651 /* 652 * Deal with kernel thread cpu or I/O hogging by limiting the 653 * number of chains scanned per second to hammer2_bulkfree_tps. 654 * Ignore leaf records (DIRENT and DATA), no per-record I/O is 655 * involved for those since we don't load their data. 656 */ 657 if (bref->type != HAMMER2_BREF_TYPE_DATA && 658 bref->type != HAMMER2_BREF_TYPE_DIRENT) { 659 ++cbinfo->bulkfree_calls; 660 if (cbinfo->bulkfree_calls > hammer2_bulkfree_tps) { 661 int dticks = ticks - cbinfo->bulkfree_ticks; 662 if (dticks < 0) 663 dticks = 0; 664 if (dticks < hz) { 665 tsleep(&cbinfo->bulkfree_ticks, 0, 666 "h2bw", hz - dticks); 667 } 668 cbinfo->bulkfree_calls = 0; 669 cbinfo->bulkfree_ticks = ticks; 670 } 671 } 672 673 /* 674 * Calculate the data offset and determine if it is within 675 * the current freemap range being gathered. 676 */ 677 data_off = bref->data_off & ~HAMMER2_OFF_MASK_RADIX; 678 if (data_off < cbinfo->sbase || data_off >= cbinfo->sstop) 679 return 0; 680 if (data_off < cbinfo->hmp->voldata.allocator_beg) 681 return 0; 682 if (data_off >= cbinfo->hmp->voldata.volu_size) 683 return 0; 684 685 /* 686 * Calculate the information needed to generate the in-memory 687 * freemap record. 688 * 689 * Hammer2 does not allow allocations to cross the L1 (2GB) boundary, 690 * it's a problem if it does. (Or L0 (2MB) for that matter). 691 */ 692 radix = (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX); 693 KKASSERT(radix != 0); 694 bytes = (size_t)1 << radix; 695 class = (bref->type << 8) | hammer2_devblkradix(radix); 696 697 if (data_off + bytes > cbinfo->sstop) { 698 kprintf("hammer2_bulkfree_scan: illegal 2GB boundary " 699 "%016jx %016jx/%d\n", 700 (intmax_t)bref->data_off, 701 (intmax_t)bref->key, 702 bref->keybits); 703 bytes = cbinfo->sstop - data_off; /* XXX */ 704 } 705 706 /* 707 * Convert to a storage offset relative to the beginning of the 708 * storage range we are collecting. Then lookup the level0 bmap entry. 709 */ 710 data_off -= cbinfo->sbase; 711 bmap = cbinfo->bmap + (data_off >> HAMMER2_FREEMAP_LEVEL0_RADIX); 712 713 /* 714 * Convert data_off to a bmap-relative value (~4MB storage range). 715 * Adjust linear, class, and avail. 716 * 717 * Hammer2 does not allow allocations to cross the L0 (4MB) boundary, 718 */ 719 data_off &= HAMMER2_FREEMAP_LEVEL0_MASK; 720 if (data_off + bytes > HAMMER2_FREEMAP_LEVEL0_SIZE) { 721 kprintf("hammer2_bulkfree_scan: illegal 4MB boundary " 722 "%016jx %016jx/%d\n", 723 (intmax_t)bref->data_off, 724 (intmax_t)bref->key, 725 bref->keybits); 726 bytes = HAMMER2_FREEMAP_LEVEL0_SIZE - data_off; 727 } 728 729 if (bmap->class == 0) { 730 bmap->class = class; 731 bmap->avail = HAMMER2_FREEMAP_LEVEL0_SIZE; 732 } 733 734 /* 735 * NOTE: bmap->class does not have to match class. Classification 736 * is relaxed when free space is low, so some mixing can occur. 737 */ 738 #if 0 739 /* 740 * XXX removed 741 */ 742 if (bmap->class != class) { 743 kprintf("hammer2_bulkfree_scan: illegal mixed class " 744 "%016jx %016jx/%d (%04x vs %04x)\n", 745 (intmax_t)bref->data_off, 746 (intmax_t)bref->key, 747 bref->keybits, 748 class, bmap->class); 749 } 750 #endif 751 752 /* 753 * Just record the highest byte-granular offset for now. Do not 754 * match against allocations which are in multiples of whole blocks. 755 * 756 * Make sure that any in-block linear offset at least covers the 757 * data range. This can cause bmap->linear to become block-aligned. 758 */ 759 if (bytes & HAMMER2_FREEMAP_BLOCK_MASK) { 760 if (bmap->linear < (int32_t)data_off + (int32_t)bytes) 761 bmap->linear = (int32_t)data_off + (int32_t)bytes; 762 } else if (bmap->linear >= (int32_t)data_off && 763 bmap->linear < (int32_t)data_off + (int32_t)bytes) { 764 bmap->linear = (int32_t)data_off + (int32_t)bytes; 765 } 766 767 /* 768 * Adjust the hammer2_bitmap_t bitmap[HAMMER2_BMAP_ELEMENTS]. 769 * 64-bit entries, 2 bits per entry, to code 11. 770 * 771 * NOTE: data_off mask to 524288, shift right by 14 (radix for 16384), 772 * and multiply shift amount by 2 for sets of 2 bits. 773 * 774 * NOTE: The allocation can be smaller than HAMMER2_FREEMAP_BLOCK_SIZE. 775 * also, data_off may not be FREEMAP_BLOCK_SIZE aligned. 776 */ 777 while (bytes > 0) { 778 hammer2_bitmap_t bmask; 779 int bindex; 780 781 bindex = (int)data_off >> (HAMMER2_FREEMAP_BLOCK_RADIX + 782 HAMMER2_BMAP_INDEX_RADIX); 783 bmask = (hammer2_bitmap_t)3 << 784 ((((int)data_off & HAMMER2_BMAP_INDEX_MASK) >> 785 HAMMER2_FREEMAP_BLOCK_RADIX) << 1); 786 787 /* 788 * NOTE! The (avail) calculation is bitmap-granular. Multiple 789 * sub-granular records can wind up at the same bitmap 790 * position. 791 */ 792 if ((bmap->bitmapq[bindex] & bmask) == 0) { 793 if (bytes < HAMMER2_FREEMAP_BLOCK_SIZE) { 794 bmap->avail -= HAMMER2_FREEMAP_BLOCK_SIZE; 795 } else { 796 bmap->avail -= bytes; 797 } 798 bmap->bitmapq[bindex] |= bmask; 799 } 800 data_off += HAMMER2_FREEMAP_BLOCK_SIZE; 801 if (bytes < HAMMER2_FREEMAP_BLOCK_SIZE) 802 bytes = 0; 803 else 804 bytes -= HAMMER2_FREEMAP_BLOCK_SIZE; 805 } 806 return 0; 807 } 808 809 /* 810 * Synchronize the in-memory bitmap with the live freemap. This is not a 811 * direct copy. Instead the bitmaps must be compared: 812 * 813 * In-memory Live-freemap 814 * 00 11 -> 10 (do nothing if live modified) 815 * 10 -> 00 (do nothing if live modified) 816 * 11 10 -> 11 handles race against live 817 * ** -> 11 nominally warn of corruption 818 * 819 * We must also fixup the hints in HAMMER2_BREF_TYPE_FREEMAP_LEAF. 820 */ 821 static int 822 h2_bulkfree_sync(hammer2_bulkfree_info_t *cbinfo) 823 { 824 hammer2_off_t data_off; 825 hammer2_key_t key; 826 hammer2_key_t key_dummy; 827 hammer2_bmap_data_t *bmap; 828 hammer2_bmap_data_t *live; 829 hammer2_chain_t *live_parent; 830 hammer2_chain_t *live_chain; 831 int bmapindex; 832 int error; 833 834 kprintf("hammer2_bulkfree - range "); 835 836 if (cbinfo->sbase < cbinfo->hmp->voldata.allocator_beg) 837 kprintf("%016jx-", 838 (intmax_t)cbinfo->hmp->voldata.allocator_beg); 839 else 840 kprintf("%016jx-", 841 (intmax_t)cbinfo->sbase); 842 843 if (cbinfo->sstop > cbinfo->hmp->voldata.volu_size) 844 kprintf("%016jx\n", 845 (intmax_t)cbinfo->hmp->voldata.volu_size); 846 else 847 kprintf("%016jx\n", 848 (intmax_t)cbinfo->sstop); 849 850 data_off = cbinfo->sbase; 851 bmap = cbinfo->bmap; 852 853 live_parent = &cbinfo->hmp->fchain; 854 hammer2_chain_ref(live_parent); 855 hammer2_chain_lock(live_parent, HAMMER2_RESOLVE_ALWAYS); 856 live_chain = NULL; 857 error = 0; 858 859 /* 860 * Iterate each hammer2_bmap_data_t line (128 bytes) managing 861 * 4MB of storage. 862 */ 863 while (data_off < cbinfo->sstop) { 864 /* 865 * The freemap is not used below allocator_beg or beyond 866 * volu_size. 867 */ 868 869 if (data_off < cbinfo->hmp->voldata.allocator_beg) 870 goto next; 871 if (data_off >= cbinfo->hmp->voldata.volu_size) 872 goto next; 873 874 /* 875 * Locate the freemap leaf on the live filesystem 876 */ 877 key = (data_off & ~HAMMER2_FREEMAP_LEVEL1_MASK); 878 879 if (live_chain == NULL || live_chain->bref.key != key) { 880 if (live_chain) { 881 hammer2_chain_unlock(live_chain); 882 hammer2_chain_drop(live_chain); 883 } 884 live_chain = hammer2_chain_lookup( 885 &live_parent, 886 &key_dummy, 887 key, 888 key + HAMMER2_FREEMAP_LEVEL1_MASK, 889 &error, 890 HAMMER2_LOOKUP_ALWAYS); 891 if (error) { 892 kprintf("hammer2_bulkfree: freemap lookup " 893 "error near %016jx, error %s\n", 894 (intmax_t)data_off, 895 hammer2_error_str(live_chain->error)); 896 break; 897 } 898 } 899 if (live_chain == NULL) { 900 /* 901 * XXX if we implement a full recovery mode we need 902 * to create/recreate missing freemap chains if our 903 * bmap has any allocated blocks. 904 */ 905 if (bmap->class && 906 bmap->avail != HAMMER2_FREEMAP_LEVEL0_SIZE) { 907 kprintf("hammer2_bulkfree: cannot locate " 908 "live leaf for allocated data " 909 "near %016jx\n", 910 (intmax_t)data_off); 911 } 912 goto next; 913 } 914 if (live_chain->error) { 915 kprintf("hammer2_bulkfree: unable to access freemap " 916 "near %016jx, error %s\n", 917 (intmax_t)data_off, 918 hammer2_error_str(live_chain->error)); 919 hammer2_chain_unlock(live_chain); 920 hammer2_chain_drop(live_chain); 921 live_chain = NULL; 922 goto next; 923 } 924 925 bmapindex = (data_off & HAMMER2_FREEMAP_LEVEL1_MASK) >> 926 HAMMER2_FREEMAP_LEVEL0_RADIX; 927 live = &live_chain->data->bmdata[bmapindex]; 928 929 /* 930 * Shortcut if the bitmaps match and the live linear 931 * indicator is sane. We can't do a perfect check of 932 * live->linear because the only real requirement is that 933 * if it is not block-aligned, that it not cover the space 934 * within its current block which overlaps one of the data 935 * ranges we scan. We don't retain enough fine-grained 936 * data in our scan to be able to set it exactly. 937 * 938 * TODO - we could shortcut this by testing that both 939 * live->class and bmap->class are 0, and both avails are 940 * set to HAMMER2_FREEMAP_LEVEL0_SIZE (4MB). 941 */ 942 if (bcmp(live->bitmapq, bmap->bitmapq, 943 sizeof(bmap->bitmapq)) == 0 && 944 live->linear >= bmap->linear) { 945 goto next; 946 } 947 if (hammer2_debug & 1) { 948 kprintf("live %016jx %04d.%04x (avail=%d)\n", 949 data_off, bmapindex, live->class, live->avail); 950 } 951 952 hammer2_chain_modify(live_chain, cbinfo->mtid, 0, 0); 953 live_chain->bref.check.freemap.bigmask = -1; 954 cbinfo->hmp->freemap_relaxed = 0; /* reset heuristic */ 955 live = &live_chain->data->bmdata[bmapindex]; 956 957 h2_bulkfree_sync_adjust(cbinfo, data_off, live, bmap, 958 live_chain->bref.key + 959 bmapindex * 960 HAMMER2_FREEMAP_LEVEL0_SIZE); 961 next: 962 data_off += HAMMER2_FREEMAP_LEVEL0_SIZE; 963 ++bmap; 964 } 965 if (live_chain) { 966 hammer2_chain_unlock(live_chain); 967 hammer2_chain_drop(live_chain); 968 } 969 if (live_parent) { 970 hammer2_chain_unlock(live_parent); 971 hammer2_chain_drop(live_parent); 972 } 973 return error; 974 } 975 976 /* 977 * Merge the bulkfree bitmap against the existing bitmap. 978 */ 979 static 980 void 981 h2_bulkfree_sync_adjust(hammer2_bulkfree_info_t *cbinfo, 982 hammer2_off_t data_off, hammer2_bmap_data_t *live, 983 hammer2_bmap_data_t *bmap, hammer2_key_t alloc_base) 984 { 985 int bindex; 986 int scount; 987 hammer2_off_t tmp_off; 988 hammer2_bitmap_t lmask; 989 hammer2_bitmap_t mmask; 990 991 tmp_off = data_off; 992 993 for (bindex = 0; bindex < HAMMER2_BMAP_ELEMENTS; ++bindex) { 994 lmask = live->bitmapq[bindex]; /* live */ 995 mmask = bmap->bitmapq[bindex]; /* snapshotted bulkfree */ 996 if (lmask == mmask) { 997 tmp_off += HAMMER2_BMAP_INDEX_SIZE; 998 continue; 999 } 1000 1001 for (scount = 0; 1002 scount < HAMMER2_BMAP_BITS_PER_ELEMENT; 1003 scount += 2) { 1004 if ((mmask & 3) == 0) { 1005 /* 1006 * in-memory 00 live 11 -> 10 1007 * live 10 -> 00 1008 * 1009 * Storage might be marked allocated or 1010 * staged and must be remarked staged or 1011 * free. 1012 */ 1013 switch (lmask & 3) { 1014 case 0: /* 00 */ 1015 break; 1016 case 1: /* 01 */ 1017 kprintf("hammer2_bulkfree: cannot " 1018 "transition m=00/l=01\n"); 1019 break; 1020 case 2: /* 10 -> 00 */ 1021 live->bitmapq[bindex] &= 1022 ~((hammer2_bitmap_t)2 << scount); 1023 live->avail += 1024 HAMMER2_FREEMAP_BLOCK_SIZE; 1025 if (live->avail > 1026 HAMMER2_FREEMAP_LEVEL0_SIZE) { 1027 live->avail = 1028 HAMMER2_FREEMAP_LEVEL0_SIZE; 1029 } 1030 cbinfo->adj_free += 1031 HAMMER2_FREEMAP_BLOCK_SIZE; 1032 ++cbinfo->count_10_00; 1033 hammer2_io_dedup_assert( 1034 cbinfo->hmp, 1035 tmp_off | 1036 HAMMER2_FREEMAP_BLOCK_RADIX, 1037 HAMMER2_FREEMAP_BLOCK_SIZE); 1038 break; 1039 case 3: /* 11 -> 10 */ 1040 live->bitmapq[bindex] &= 1041 ~((hammer2_bitmap_t)1 << scount); 1042 ++cbinfo->count_11_10; 1043 hammer2_io_dedup_delete( 1044 cbinfo->hmp, 1045 HAMMER2_BREF_TYPE_DATA, 1046 tmp_off | 1047 HAMMER2_FREEMAP_BLOCK_RADIX, 1048 HAMMER2_FREEMAP_BLOCK_SIZE); 1049 break; 1050 } 1051 } else if ((mmask & 3) == 3) { 1052 /* 1053 * in-memory 11 live 10 -> 11 1054 * live ** -> 11 1055 * 1056 * Storage might be incorrectly marked free 1057 * or staged and must be remarked fully 1058 * allocated. 1059 */ 1060 switch (lmask & 3) { 1061 case 0: /* 00 */ 1062 ++cbinfo->count_00_11; 1063 cbinfo->adj_free -= 1064 HAMMER2_FREEMAP_BLOCK_SIZE; 1065 live->avail -= 1066 HAMMER2_FREEMAP_BLOCK_SIZE; 1067 if ((int32_t)live->avail < 0) 1068 live->avail = 0; 1069 break; 1070 case 1: /* 01 */ 1071 ++cbinfo->count_01_11; 1072 break; 1073 case 2: /* 10 -> 11 */ 1074 ++cbinfo->count_10_11; 1075 break; 1076 case 3: /* 11 */ 1077 break; 1078 } 1079 live->bitmapq[bindex] |= 1080 ((hammer2_bitmap_t)3 << scount); 1081 } 1082 mmask >>= 2; 1083 lmask >>= 2; 1084 tmp_off += HAMMER2_FREEMAP_BLOCK_SIZE; 1085 } 1086 } 1087 1088 /* 1089 * Determine if the live bitmap is completely free and reset its 1090 * fields if so. Otherwise check to see if we can reduce the linear 1091 * offset. 1092 */ 1093 for (bindex = HAMMER2_BMAP_ELEMENTS - 1; bindex >= 0; --bindex) { 1094 if (live->bitmapq[bindex] != 0) 1095 break; 1096 } 1097 if (bindex < 0) { 1098 /* 1099 * Completely empty, reset entire segment 1100 */ 1101 #if 0 1102 kprintf("hammer2: cleanseg %016jx.%04x (%d)\n", 1103 alloc_base, live->class, live->avail); 1104 #endif 1105 live->avail = HAMMER2_FREEMAP_LEVEL0_SIZE; 1106 live->class = 0; 1107 live->linear = 0; 1108 ++cbinfo->count_l0cleans; 1109 } else if (bindex < 7) { 1110 /* 1111 * Partially full, bitmapq[bindex] != 0. Our bulkfree pass 1112 * does not record enough information to set live->linear 1113 * exactly. 1114 * 1115 * NOTE: Setting live->linear to a sub-block (16K) boundary 1116 * forces the live code to iterate to the next fully 1117 * free block. It does NOT mean that all blocks above 1118 * live->linear are available. 1119 * 1120 * Setting live->linear to a fragmentary (less than 1121 * 16K) boundary allows allocations to iterate within 1122 * that sub-block. 1123 */ 1124 if (live->linear < bmap->linear && 1125 ((live->linear ^ bmap->linear) & 1126 ~HAMMER2_FREEMAP_BLOCK_MASK) == 0) { 1127 /* 1128 * If greater than but still within the same 1129 * sub-block as live we can adjust linear upward. 1130 */ 1131 live->linear = bmap->linear; 1132 ++cbinfo->count_linadjusts; 1133 } else { 1134 /* 1135 * Otherwise adjust to the nearest higher or same 1136 * sub-block boundary. The live system may have 1137 * bounced live->linear around so we cannot make any 1138 * assumptions with regards to available fragmentary 1139 * allocations. 1140 */ 1141 live->linear = 1142 (bmap->linear + HAMMER2_FREEMAP_BLOCK_MASK) & 1143 ~HAMMER2_FREEMAP_BLOCK_MASK; 1144 ++cbinfo->count_linadjusts; 1145 } 1146 } else { 1147 /* 1148 * Completely full, effectively disable the linear iterator 1149 */ 1150 live->linear = HAMMER2_SEGSIZE; 1151 } 1152 1153 #if 0 1154 if (bmap->class) { 1155 kprintf("%016jx %04d.%04x (avail=%7d) " 1156 "%08x %08x %08x %08x %08x %08x %08x %08x\n", 1157 (intmax_t)data_off, 1158 (int)((data_off & 1159 HAMMER2_FREEMAP_LEVEL1_MASK) >> 1160 HAMMER2_FREEMAP_LEVEL0_RADIX), 1161 bmap->class, 1162 bmap->avail, 1163 bmap->bitmap[0], bmap->bitmap[1], 1164 bmap->bitmap[2], bmap->bitmap[3], 1165 bmap->bitmap[4], bmap->bitmap[5], 1166 bmap->bitmap[6], bmap->bitmap[7]); 1167 } 1168 #endif 1169 } 1170 1171 /* 1172 * BULKFREE DEDUP HEURISTIC 1173 * 1174 * WARNING! This code is SMP safe but the heuristic allows SMP collisions. 1175 * All fields must be loaded into locals and validated. 1176 */ 1177 static 1178 int 1179 h2_bulkfree_test(hammer2_bulkfree_info_t *cbinfo, hammer2_blockref_t *bref, 1180 int pri) 1181 { 1182 hammer2_dedup_t *dedup; 1183 int best; 1184 int n; 1185 int i; 1186 1187 n = hammer2_icrc32(&bref->data_off, sizeof(bref->data_off)); 1188 dedup = cbinfo->dedup + (n & (HAMMER2_DEDUP_HEUR_MASK & ~7)); 1189 1190 for (i = best = 0; i < 8; ++i) { 1191 if (dedup[i].data_off == bref->data_off) { 1192 if (dedup[i].ticks < pri) 1193 dedup[i].ticks = pri; 1194 if (pri == 1) 1195 cbinfo->count_dedup_factor += dedup[i].ticks; 1196 return 1; 1197 } 1198 if (dedup[i].ticks < dedup[best].ticks) 1199 best = i; 1200 } 1201 dedup[best].data_off = bref->data_off; 1202 dedup[best].ticks = pri; 1203 1204 return 0; 1205 } 1206