1 /* 2 * Copyright (c) 2013-2018 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@dragonflybsd.org> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 #include <sys/param.h> 35 #include <sys/systm.h> 36 #include <sys/kernel.h> 37 #include <sys/fcntl.h> 38 #include <sys/buf.h> 39 #include <sys/proc.h> 40 #include <sys/namei.h> 41 #include <sys/mount.h> 42 #include <sys/vnode.h> 43 #include <sys/mountctl.h> 44 #include <vm/vm_kern.h> 45 #include <vm/vm_extern.h> 46 47 #include "hammer2.h" 48 49 /* 50 * XXX I made a mistake and made the reserved area begin at each LEVEL1 zone, 51 * which is on a 1GB demark. This will eat a little more space but for 52 * now we retain compatibility and make FMZONEBASE every 1GB 53 */ 54 #define H2FMZONEBASE(key) ((key) & ~HAMMER2_FREEMAP_LEVEL1_MASK) 55 #define H2FMBASE(key, radix) ((key) & ~(((hammer2_off_t)1 << (radix)) - 1)) 56 #define H2FMSHIFT(radix) ((hammer2_off_t)1 << (radix)) 57 58 /* 59 * breadth-first search 60 */ 61 typedef struct hammer2_chain_save { 62 TAILQ_ENTRY(hammer2_chain_save) entry; 63 hammer2_chain_t *chain; 64 int pri; 65 } hammer2_chain_save_t; 66 67 TAILQ_HEAD(hammer2_chain_save_list, hammer2_chain_save); 68 typedef struct hammer2_chain_save_list hammer2_chain_save_list_t; 69 70 typedef struct hammer2_bulkfree_info { 71 hammer2_dev_t *hmp; 72 kmem_anon_desc_t kp; 73 hammer2_off_t sbase; /* sub-loop iteration */ 74 hammer2_off_t sstop; 75 hammer2_bmap_data_t *bmap; 76 int depth; 77 long count_10_00; /* staged->free */ 78 long count_11_10; /* allocated->staged */ 79 long count_00_11; /* (should not happen) */ 80 long count_01_11; /* (should not happen) */ 81 long count_10_11; /* staged->allocated */ 82 long count_l0cleans; 83 long count_linadjusts; 84 long count_inodes_scanned; 85 long count_dirents_scanned; 86 long count_dedup_factor; 87 long count_bytes_scanned; 88 long count_chains_scanned; 89 long count_chains_reported; 90 long bulkfree_calls; 91 int bulkfree_ticks; 92 hammer2_off_t adj_free; 93 hammer2_tid_t mtid; 94 hammer2_tid_t saved_mirror_tid; 95 time_t save_time; 96 hammer2_chain_save_list_t list; 97 hammer2_dedup_t *dedup; 98 int pri; 99 } hammer2_bulkfree_info_t; 100 101 static int h2_bulkfree_test(hammer2_bulkfree_info_t *info, 102 hammer2_blockref_t *bref, int pri); 103 104 /* 105 * General bulk scan function with callback. Called with a referenced 106 * but UNLOCKED parent. The parent is returned in the same state. 107 */ 108 static 109 int 110 hammer2_bulk_scan(hammer2_chain_t *parent, 111 int (*func)(hammer2_bulkfree_info_t *info, 112 hammer2_blockref_t *bref), 113 hammer2_bulkfree_info_t *info) 114 { 115 hammer2_blockref_t bref; 116 hammer2_chain_t *chain; 117 int first = 1; 118 int rup_error; 119 int error; 120 121 ++info->pri; 122 123 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS | 124 HAMMER2_RESOLVE_SHARED); 125 chain = NULL; 126 rup_error = 0; 127 error = 0; 128 129 /* 130 * Generally loop on the contents if we have not been flagged 131 * for abort. 132 * 133 * Remember that these chains are completely isolated from 134 * the frontend, so we can release locks temporarily without 135 * imploding. 136 */ 137 for (;;) { 138 error |= hammer2_chain_scan(parent, &chain, &bref, &first, 139 HAMMER2_LOOKUP_NODATA | 140 HAMMER2_LOOKUP_SHARED); 141 142 /* 143 * Handle EOF or other error at current level. This stops 144 * the bulkfree scan. 145 */ 146 if (error) 147 break; 148 149 /* 150 * Account for dirents before thre data_off test, since most 151 * dirents do not need a data reference. 152 */ 153 if (bref.type == HAMMER2_BREF_TYPE_DIRENT) 154 ++info->count_dirents_scanned; 155 156 /* 157 * Ignore brefs without data (typically dirents) 158 */ 159 if ((bref.data_off & ~HAMMER2_OFF_MASK_RADIX) == 0) 160 continue; 161 162 /* 163 * Process bref, chain is only non-NULL if the bref 164 * might be recursable (its possible that we sometimes get 165 * a non-NULL chain where the bref cannot be recursed). 166 */ 167 ++info->pri; 168 if (h2_bulkfree_test(info, &bref, 1)) 169 continue; 170 171 if (bref.type == HAMMER2_BREF_TYPE_INODE) 172 ++info->count_inodes_scanned; 173 174 error |= func(info, &bref); 175 if (error) 176 break; 177 178 /* 179 * A non-null chain is always returned if it is 180 * recursive, otherwise a non-null chain might be 181 * returned but usually is not when not recursive. 182 */ 183 if (chain == NULL) 184 continue; 185 186 if (chain) { 187 info->count_bytes_scanned += chain->bytes; 188 ++info->count_chains_scanned; 189 190 if (info->count_chains_scanned >= 191 info->count_chains_reported + 1000000 || 192 (info->count_chains_scanned < 1000000 && 193 info->count_chains_scanned >= 194 info->count_chains_reported + 100000)) { 195 kprintf(" chains %-7ld inodes %-7ld " 196 "dirents %-7ld bytes %5ldMB\n", 197 info->count_chains_scanned, 198 info->count_inodes_scanned, 199 info->count_dirents_scanned, 200 info->count_bytes_scanned / 1000000); 201 info->count_chains_reported = 202 info->count_chains_scanned; 203 } 204 } 205 206 207 /* 208 * Else check type and setup depth-first scan. 209 * 210 * Account for bytes actually read. 211 */ 212 switch(chain->bref.type) { 213 case HAMMER2_BREF_TYPE_INODE: 214 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 215 case HAMMER2_BREF_TYPE_INDIRECT: 216 case HAMMER2_BREF_TYPE_VOLUME: 217 case HAMMER2_BREF_TYPE_FREEMAP: 218 ++info->depth; 219 if (info->depth > 16) { 220 hammer2_chain_save_t *save; 221 save = kmalloc(sizeof(*save), M_HAMMER2, 222 M_WAITOK | M_ZERO); 223 save->chain = chain; 224 hammer2_chain_ref(chain); 225 TAILQ_INSERT_TAIL(&info->list, save, entry); 226 227 /* guess */ 228 info->pri += 10; 229 } else { 230 int savepri = info->pri; 231 232 hammer2_chain_unlock(chain); 233 hammer2_chain_unlock(parent); 234 info->pri = 0; 235 rup_error |= 236 hammer2_bulk_scan(chain, func, info); 237 info->pri += savepri; 238 hammer2_chain_lock(parent, 239 HAMMER2_RESOLVE_ALWAYS | 240 HAMMER2_RESOLVE_SHARED); 241 hammer2_chain_lock(chain, 242 HAMMER2_RESOLVE_ALWAYS | 243 HAMMER2_RESOLVE_SHARED); 244 } 245 --info->depth; 246 break; 247 case HAMMER2_BREF_TYPE_DATA: 248 break; 249 default: 250 /* does not recurse */ 251 break; 252 } 253 if (rup_error & HAMMER2_ERROR_ABORTED) 254 break; 255 } 256 if (chain) { 257 hammer2_chain_unlock(chain); 258 hammer2_chain_drop(chain); 259 } 260 261 /* 262 * Save with higher pri now that we know what it is. 263 */ 264 h2_bulkfree_test(info, &parent->bref, info->pri + 1); 265 266 hammer2_chain_unlock(parent); 267 268 return ((error | rup_error) & ~HAMMER2_ERROR_EOF); 269 } 270 271 /* 272 * Bulkfree algorithm 273 * 274 * Repeat { 275 * Chain flush (partial synchronization) XXX removed 276 * Scan the whole topology - build in-memory freemap (mark 11) 277 * Reconcile the in-memory freemap against the on-disk freemap. 278 * ondisk xx -> ondisk 11 (if allocated) 279 * ondisk 11 -> ondisk 10 (if free in-memory) 280 * ondisk 10 -> ondisk 00 (if free in-memory) - on next pass 281 * } 282 * 283 * The topology scan may have to be performed multiple times to window 284 * freemaps which are too large to fit in kernel memory. 285 * 286 * Races are handled using a double-transition (11->10, 10->00). The bulkfree 287 * scan snapshots the volume root's blockset and thus can run concurrent with 288 * normal operations, as long as a full flush is made between each pass to 289 * synchronize any modified chains (otherwise their blocks might be improperly 290 * freed). 291 * 292 * Temporary memory in multiples of 64KB is required to reconstruct the leaf 293 * hammer2_bmap_data blocks so they can later be compared against the live 294 * freemap. Each 64KB block represents 128 x 16KB x 1024 = ~2 GB of storage. 295 * A 32MB save area thus represents around ~1 TB. The temporary memory 296 * allocated can be specified. If it is not sufficient multiple topology 297 * passes will be made. 298 */ 299 300 /* 301 * Bulkfree callback info 302 */ 303 static void hammer2_bulkfree_thread(void *arg __unused); 304 static void cbinfo_bmap_init(hammer2_bulkfree_info_t *cbinfo, size_t size); 305 static int h2_bulkfree_callback(hammer2_bulkfree_info_t *cbinfo, 306 hammer2_blockref_t *bref); 307 static int h2_bulkfree_sync(hammer2_bulkfree_info_t *cbinfo); 308 static void h2_bulkfree_sync_adjust(hammer2_bulkfree_info_t *cbinfo, 309 hammer2_off_t data_off, hammer2_bmap_data_t *live, 310 hammer2_bmap_data_t *bmap, hammer2_key_t alloc_base); 311 312 void 313 hammer2_bulkfree_init(hammer2_dev_t *hmp) 314 { 315 hammer2_thr_create(&hmp->bfthr, NULL, hmp, 316 hmp->devrepname, -1, -1, 317 hammer2_bulkfree_thread); 318 } 319 320 void 321 hammer2_bulkfree_uninit(hammer2_dev_t *hmp) 322 { 323 hammer2_thr_delete(&hmp->bfthr); 324 } 325 326 static void 327 hammer2_bulkfree_thread(void *arg) 328 { 329 hammer2_thread_t *thr = arg; 330 hammer2_ioc_bulkfree_t bfi; 331 uint32_t flags; 332 333 for (;;) { 334 hammer2_thr_wait_any(thr, 335 HAMMER2_THREAD_STOP | 336 HAMMER2_THREAD_FREEZE | 337 HAMMER2_THREAD_UNFREEZE | 338 HAMMER2_THREAD_REMASTER, 339 hz * 60); 340 341 flags = thr->flags; 342 cpu_ccfence(); 343 if (flags & HAMMER2_THREAD_STOP) 344 break; 345 if (flags & HAMMER2_THREAD_FREEZE) { 346 hammer2_thr_signal2(thr, HAMMER2_THREAD_FROZEN, 347 HAMMER2_THREAD_FREEZE); 348 continue; 349 } 350 if (flags & HAMMER2_THREAD_UNFREEZE) { 351 hammer2_thr_signal2(thr, 0, 352 HAMMER2_THREAD_FROZEN | 353 HAMMER2_THREAD_UNFREEZE); 354 continue; 355 } 356 if (flags & HAMMER2_THREAD_FROZEN) 357 continue; 358 if (flags & HAMMER2_THREAD_REMASTER) { 359 hammer2_thr_signal2(thr, 0, HAMMER2_THREAD_REMASTER); 360 bzero(&bfi, sizeof(bfi)); 361 bfi.size = 8192 * 1024; 362 /* hammer2_bulkfree_pass(thr->hmp, &bfi); */ 363 } 364 } 365 thr->td = NULL; 366 hammer2_thr_signal(thr, HAMMER2_THREAD_STOPPED); 367 /* structure can go invalid at this point */ 368 } 369 370 int 371 hammer2_bulkfree_pass(hammer2_dev_t *hmp, hammer2_chain_t *vchain, 372 hammer2_ioc_bulkfree_t *bfi) 373 { 374 hammer2_bulkfree_info_t cbinfo; 375 hammer2_chain_save_t *save; 376 hammer2_off_t incr; 377 size_t size; 378 int error; 379 380 /* 381 * We have to clear the live dedup cache as it might have entries 382 * that are freeable as of now. Any new entries in the dedup cache 383 * made after this point, even if they become freeable, will have 384 * previously been fully allocated and will be protected by the 385 * 2-stage bulkfree. 386 */ 387 hammer2_dedup_clear(hmp); 388 389 /* 390 * Setup for free pass. Maximum buffer memory is 1/4 physical 391 * memory. 392 */ 393 bzero(&cbinfo, sizeof(cbinfo)); 394 size = (bfi->size + HAMMER2_FREEMAP_LEVELN_PSIZE - 1) & 395 ~(size_t)(HAMMER2_FREEMAP_LEVELN_PSIZE - 1); 396 if (size < 1024 * 1024) 397 size = 1024 * 1024; 398 if (size > kmem_lim_size() * 1024 * 1024 / 4) { 399 size = kmem_lim_size() * 1024 * 1024 / 4; 400 kprintf("hammer2: Warning: capping bulkfree buffer at %jdM\n", 401 (intmax_t)size / (1024 * 1024)); 402 } 403 404 cbinfo.hmp = hmp; 405 cbinfo.bmap = kmem_alloc_swapbacked(&cbinfo.kp, size, VM_SUBSYS_HAMMER); 406 cbinfo.saved_mirror_tid = hmp->voldata.mirror_tid; 407 408 cbinfo.dedup = kmalloc(sizeof(*cbinfo.dedup) * HAMMER2_DEDUP_HEUR_SIZE, 409 M_HAMMER2, M_WAITOK | M_ZERO); 410 411 /* 412 * Normalize start point to a 2GB boundary. We operate on a 413 * 64KB leaf bitmap boundary which represents 2GB of storage. 414 */ 415 cbinfo.sbase = bfi->sbase; 416 if (cbinfo.sbase > hmp->voldata.volu_size) 417 cbinfo.sbase = hmp->voldata.volu_size; 418 cbinfo.sbase &= ~HAMMER2_FREEMAP_LEVEL1_MASK; 419 TAILQ_INIT(&cbinfo.list); 420 421 cbinfo.bulkfree_ticks = ticks; 422 423 /* 424 * Loop on a full meta-data scan as many times as required to 425 * get through all available storage. 426 */ 427 error = 0; 428 while (cbinfo.sbase < hmp->voldata.volu_size) { 429 /* 430 * We have enough ram to represent (incr) bytes of storage. 431 * Each 64KB of ram represents 2GB of storage. 432 * 433 * We must also clean out our de-duplication heuristic for 434 * each (incr) bytes of storage, otherwise we wind up not 435 * scanning meta-data for later areas of storage because 436 * they had already been scanned in earlier areas of storage. 437 * Since the ranging is different, we have to restart 438 * the dedup heuristic too. 439 */ 440 cbinfo_bmap_init(&cbinfo, size); 441 bzero(cbinfo.dedup, sizeof(*cbinfo.dedup) * 442 HAMMER2_DEDUP_HEUR_SIZE); 443 cbinfo.count_inodes_scanned = 0; 444 cbinfo.count_dirents_scanned = 0; 445 cbinfo.count_bytes_scanned = 0; 446 cbinfo.count_chains_scanned = 0; 447 cbinfo.count_chains_reported = 0; 448 449 incr = size / HAMMER2_FREEMAP_LEVELN_PSIZE * 450 HAMMER2_FREEMAP_LEVEL1_SIZE; 451 if (hmp->voldata.volu_size - cbinfo.sbase < incr) 452 cbinfo.sstop = hmp->voldata.volu_size; 453 else 454 cbinfo.sstop = cbinfo.sbase + incr; 455 kprintf("hammer2: bulkfree buf=%5jdM " 456 "pass %016jx-%016jx (%jdGB of media)\n", 457 (intmax_t)size / (1024 * 1024), 458 (intmax_t)cbinfo.sbase, 459 (intmax_t)cbinfo.sstop, 460 (intmax_t)incr / (1024L*1024*1024)); 461 462 /* 463 * Scan topology for stuff inside this range. 464 * 465 * NOTE - By not using a transaction the operation can 466 * run concurrent with the frontend as well as 467 * with flushes. 468 * 469 * We cannot safely set a mtid without a transaction, 470 * and in fact we don't want to set one anyway. We 471 * want the bulkfree to be passive and no interfere 472 * with crash recovery. 473 */ 474 #undef HAMMER2_BULKFREE_TRANS /* undef - don't use transaction */ 475 #ifdef HAMMER2_BULKFREE_TRANS 476 hammer2_trans_init(hmp->spmp, 0); 477 cbinfo.mtid = hammer2_trans_sub(hmp->spmp); 478 #else 479 cbinfo.mtid = 0; 480 #endif 481 cbinfo.pri = 0; 482 error |= hammer2_bulk_scan(vchain, h2_bulkfree_callback, 483 &cbinfo); 484 485 while ((save = TAILQ_FIRST(&cbinfo.list)) != NULL && 486 error == 0) { 487 TAILQ_REMOVE(&cbinfo.list, save, entry); 488 cbinfo.pri = 0; 489 error |= hammer2_bulk_scan(save->chain, 490 h2_bulkfree_callback, 491 &cbinfo); 492 hammer2_chain_drop(save->chain); 493 kfree(save, M_HAMMER2); 494 } 495 while (save) { 496 TAILQ_REMOVE(&cbinfo.list, save, entry); 497 hammer2_chain_drop(save->chain); 498 kfree(save, M_HAMMER2); 499 save = TAILQ_FIRST(&cbinfo.list); 500 } 501 502 /* 503 * If the complete scan succeeded we can synchronize our 504 * in-memory freemap against live storage. If an abort 505 * occured we cannot safely synchronize our partially 506 * filled-out in-memory freemap. 507 */ 508 if (error) { 509 kprintf("bulkfree lastdrop %d %d error=0x%04x\n", 510 vchain->refs, vchain->core.chain_count, error); 511 } else { 512 kprintf("bulkfree lastdrop %d %d\n", 513 vchain->refs, vchain->core.chain_count); 514 515 error = h2_bulkfree_sync(&cbinfo); 516 517 hammer2_voldata_lock(hmp); 518 hammer2_voldata_modify(hmp); 519 hmp->voldata.allocator_free += cbinfo.adj_free; 520 hammer2_voldata_unlock(hmp); 521 } 522 523 /* 524 * Cleanup for next loop. 525 */ 526 #ifdef HAMMER2_BULKFREE_TRANS 527 hammer2_trans_done(hmp->spmp, 0); 528 #endif 529 if (error) 530 break; 531 cbinfo.sbase = cbinfo.sstop; 532 cbinfo.adj_free = 0; 533 } 534 kmem_free_swapbacked(&cbinfo.kp); 535 kfree(cbinfo.dedup, M_HAMMER2); 536 cbinfo.dedup = NULL; 537 538 bfi->sstop = cbinfo.sbase; 539 540 incr = bfi->sstop / (hmp->voldata.volu_size / 10000); 541 if (incr > 10000) 542 incr = 10000; 543 544 kprintf("bulkfree pass statistics (%d.%02d%% storage processed):\n", 545 (int)incr / 100, 546 (int)incr % 100); 547 548 if (error) { 549 kprintf(" bulkfree was aborted\n"); 550 } else { 551 kprintf(" transition->free %ld\n", cbinfo.count_10_00); 552 kprintf(" transition->staged %ld\n", cbinfo.count_11_10); 553 kprintf(" ERR(00)->allocated %ld\n", cbinfo.count_00_11); 554 kprintf(" ERR(01)->allocated %ld\n", cbinfo.count_01_11); 555 kprintf(" staged->allocated %ld\n", cbinfo.count_10_11); 556 kprintf(" ~2MB segs cleaned %ld\n", cbinfo.count_l0cleans); 557 kprintf(" linear adjusts %ld\n", 558 cbinfo.count_linadjusts); 559 kprintf(" dedup factor %ld\n", 560 cbinfo.count_dedup_factor); 561 } 562 563 return error; 564 } 565 566 static void 567 cbinfo_bmap_init(hammer2_bulkfree_info_t *cbinfo, size_t size) 568 { 569 hammer2_bmap_data_t *bmap = cbinfo->bmap; 570 hammer2_key_t key = cbinfo->sbase; 571 hammer2_key_t lokey; 572 hammer2_key_t hikey; 573 574 lokey = (cbinfo->hmp->voldata.allocator_beg + HAMMER2_SEGMASK64) & 575 ~HAMMER2_SEGMASK64; 576 hikey = cbinfo->hmp->voldata.volu_size & ~HAMMER2_SEGMASK64; 577 578 bzero(bmap, size); 579 while (size) { 580 bzero(bmap, sizeof(*bmap)); 581 if (lokey < H2FMBASE(key, HAMMER2_FREEMAP_LEVEL1_RADIX)) 582 lokey = H2FMBASE(key, HAMMER2_FREEMAP_LEVEL1_RADIX); 583 if (lokey < H2FMZONEBASE(key) + HAMMER2_ZONE_SEG64) 584 lokey = H2FMZONEBASE(key) + HAMMER2_ZONE_SEG64; 585 if (key < lokey || key >= hikey) { 586 memset(bmap->bitmapq, -1, 587 sizeof(bmap->bitmapq)); 588 bmap->avail = 0; 589 bmap->linear = HAMMER2_SEGSIZE; 590 } else { 591 bmap->avail = H2FMSHIFT(HAMMER2_FREEMAP_LEVEL0_RADIX); 592 } 593 size -= sizeof(*bmap); 594 key += HAMMER2_FREEMAP_LEVEL0_SIZE; 595 ++bmap; 596 } 597 } 598 599 static int 600 h2_bulkfree_callback(hammer2_bulkfree_info_t *cbinfo, hammer2_blockref_t *bref) 601 { 602 hammer2_bmap_data_t *bmap; 603 hammer2_off_t data_off; 604 uint16_t class; 605 size_t bytes; 606 int radix; 607 608 /* 609 * Check for signal and allow yield to userland during scan. 610 */ 611 if (hammer2_signal_check(&cbinfo->save_time)) 612 return HAMMER2_ERROR_ABORTED; 613 614 /* 615 * Deal with kernel thread cpu or I/O hogging by limiting the 616 * number of chains scanned per second to hammer2_bulkfree_tps. 617 * Ignore leaf records (DIRENT and DATA), no per-record I/O is 618 * involved for those since we don't load their data. 619 */ 620 if (bref->type != HAMMER2_BREF_TYPE_DATA && 621 bref->type != HAMMER2_BREF_TYPE_DIRENT) { 622 ++cbinfo->bulkfree_calls; 623 if (cbinfo->bulkfree_calls > hammer2_bulkfree_tps) { 624 int dticks = ticks - cbinfo->bulkfree_ticks; 625 if (dticks < 0) 626 dticks = 0; 627 if (dticks < hz) { 628 tsleep(&cbinfo->bulkfree_ticks, 0, 629 "h2bw", hz - dticks); 630 } 631 cbinfo->bulkfree_calls = 0; 632 cbinfo->bulkfree_ticks = ticks; 633 } 634 } 635 636 /* 637 * Calculate the data offset and determine if it is within 638 * the current freemap range being gathered. 639 */ 640 data_off = bref->data_off & ~HAMMER2_OFF_MASK_RADIX; 641 if (data_off < cbinfo->sbase || data_off >= cbinfo->sstop) 642 return 0; 643 if (data_off < cbinfo->hmp->voldata.allocator_beg) 644 return 0; 645 if (data_off >= cbinfo->hmp->voldata.volu_size) 646 return 0; 647 648 /* 649 * Calculate the information needed to generate the in-memory 650 * freemap record. 651 * 652 * Hammer2 does not allow allocations to cross the L1 (2GB) boundary, 653 * it's a problem if it does. (Or L0 (2MB) for that matter). 654 */ 655 radix = (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX); 656 KKASSERT(radix != 0); 657 bytes = (size_t)1 << radix; 658 class = (bref->type << 8) | hammer2_devblkradix(radix); 659 660 if (data_off + bytes > cbinfo->sstop) { 661 kprintf("hammer2_bulkfree_scan: illegal 2GB boundary " 662 "%016jx %016jx/%d\n", 663 (intmax_t)bref->data_off, 664 (intmax_t)bref->key, 665 bref->keybits); 666 bytes = cbinfo->sstop - data_off; /* XXX */ 667 } 668 669 /* 670 * Convert to a storage offset relative to the beginning of the 671 * storage range we are collecting. Then lookup the level0 bmap entry. 672 */ 673 data_off -= cbinfo->sbase; 674 bmap = cbinfo->bmap + (data_off >> HAMMER2_FREEMAP_LEVEL0_RADIX); 675 676 /* 677 * Convert data_off to a bmap-relative value (~4MB storage range). 678 * Adjust linear, class, and avail. 679 * 680 * Hammer2 does not allow allocations to cross the L0 (4MB) boundary, 681 */ 682 data_off &= HAMMER2_FREEMAP_LEVEL0_MASK; 683 if (data_off + bytes > HAMMER2_FREEMAP_LEVEL0_SIZE) { 684 kprintf("hammer2_bulkfree_scan: illegal 4MB boundary " 685 "%016jx %016jx/%d\n", 686 (intmax_t)bref->data_off, 687 (intmax_t)bref->key, 688 bref->keybits); 689 bytes = HAMMER2_FREEMAP_LEVEL0_SIZE - data_off; 690 } 691 692 if (bmap->class == 0) { 693 bmap->class = class; 694 bmap->avail = HAMMER2_FREEMAP_LEVEL0_SIZE; 695 } 696 697 /* 698 * NOTE: bmap->class does not have to match class. Classification 699 * is relaxed when free space is low, so some mixing can occur. 700 */ 701 #if 0 702 /* 703 * XXX removed 704 */ 705 if (bmap->class != class) { 706 kprintf("hammer2_bulkfree_scan: illegal mixed class " 707 "%016jx %016jx/%d (%04x vs %04x)\n", 708 (intmax_t)bref->data_off, 709 (intmax_t)bref->key, 710 bref->keybits, 711 class, bmap->class); 712 } 713 #endif 714 715 /* 716 * Just record the highest byte-granular offset for now. Do not 717 * match against allocations which are in multiples of whole blocks. 718 * 719 * Make sure that any in-block linear offset at least covers the 720 * data range. This can cause bmap->linear to become block-aligned. 721 */ 722 if (bytes & HAMMER2_FREEMAP_BLOCK_MASK) { 723 if (bmap->linear < (int32_t)data_off + (int32_t)bytes) 724 bmap->linear = (int32_t)data_off + (int32_t)bytes; 725 } else if (bmap->linear >= (int32_t)data_off && 726 bmap->linear < (int32_t)data_off + (int32_t)bytes) { 727 bmap->linear = (int32_t)data_off + (int32_t)bytes; 728 } 729 730 /* 731 * Adjust the hammer2_bitmap_t bitmap[HAMMER2_BMAP_ELEMENTS]. 732 * 64-bit entries, 2 bits per entry, to code 11. 733 * 734 * NOTE: data_off mask to 524288, shift right by 14 (radix for 16384), 735 * and multiply shift amount by 2 for sets of 2 bits. 736 * 737 * NOTE: The allocation can be smaller than HAMMER2_FREEMAP_BLOCK_SIZE. 738 * also, data_off may not be FREEMAP_BLOCK_SIZE aligned. 739 */ 740 while (bytes > 0) { 741 hammer2_bitmap_t bmask; 742 int bindex; 743 744 bindex = (int)data_off >> (HAMMER2_FREEMAP_BLOCK_RADIX + 745 HAMMER2_BMAP_INDEX_RADIX); 746 bmask = (hammer2_bitmap_t)3 << 747 ((((int)data_off & HAMMER2_BMAP_INDEX_MASK) >> 748 HAMMER2_FREEMAP_BLOCK_RADIX) << 1); 749 750 /* 751 * NOTE! The (avail) calculation is bitmap-granular. Multiple 752 * sub-granular records can wind up at the same bitmap 753 * position. 754 */ 755 if ((bmap->bitmapq[bindex] & bmask) == 0) { 756 if (bytes < HAMMER2_FREEMAP_BLOCK_SIZE) { 757 bmap->avail -= HAMMER2_FREEMAP_BLOCK_SIZE; 758 } else { 759 bmap->avail -= bytes; 760 } 761 bmap->bitmapq[bindex] |= bmask; 762 } 763 data_off += HAMMER2_FREEMAP_BLOCK_SIZE; 764 if (bytes < HAMMER2_FREEMAP_BLOCK_SIZE) 765 bytes = 0; 766 else 767 bytes -= HAMMER2_FREEMAP_BLOCK_SIZE; 768 } 769 return 0; 770 } 771 772 /* 773 * Synchronize the in-memory bitmap with the live freemap. This is not a 774 * direct copy. Instead the bitmaps must be compared: 775 * 776 * In-memory Live-freemap 777 * 00 11 -> 10 (do nothing if live modified) 778 * 10 -> 00 (do nothing if live modified) 779 * 11 10 -> 11 handles race against live 780 * ** -> 11 nominally warn of corruption 781 * 782 * We must also fixup the hints in HAMMER2_BREF_TYPE_FREEMAP_LEAF. 783 */ 784 static int 785 h2_bulkfree_sync(hammer2_bulkfree_info_t *cbinfo) 786 { 787 hammer2_off_t data_off; 788 hammer2_key_t key; 789 hammer2_key_t key_dummy; 790 hammer2_bmap_data_t *bmap; 791 hammer2_bmap_data_t *live; 792 hammer2_chain_t *live_parent; 793 hammer2_chain_t *live_chain; 794 int bmapindex; 795 int error; 796 797 kprintf("hammer2_bulkfree - range "); 798 799 if (cbinfo->sbase < cbinfo->hmp->voldata.allocator_beg) 800 kprintf("%016jx-", 801 (intmax_t)cbinfo->hmp->voldata.allocator_beg); 802 else 803 kprintf("%016jx-", 804 (intmax_t)cbinfo->sbase); 805 806 if (cbinfo->sstop > cbinfo->hmp->voldata.volu_size) 807 kprintf("%016jx\n", 808 (intmax_t)cbinfo->hmp->voldata.volu_size); 809 else 810 kprintf("%016jx\n", 811 (intmax_t)cbinfo->sstop); 812 813 data_off = cbinfo->sbase; 814 bmap = cbinfo->bmap; 815 816 live_parent = &cbinfo->hmp->fchain; 817 hammer2_chain_ref(live_parent); 818 hammer2_chain_lock(live_parent, HAMMER2_RESOLVE_ALWAYS); 819 live_chain = NULL; 820 error = 0; 821 822 /* 823 * Iterate each hammer2_bmap_data_t line (128 bytes) managing 824 * 4MB of storage. 825 */ 826 while (data_off < cbinfo->sstop) { 827 /* 828 * The freemap is not used below allocator_beg or beyond 829 * volu_size. 830 */ 831 832 if (data_off < cbinfo->hmp->voldata.allocator_beg) 833 goto next; 834 if (data_off >= cbinfo->hmp->voldata.volu_size) 835 goto next; 836 837 /* 838 * Locate the freemap leaf on the live filesystem 839 */ 840 key = (data_off & ~HAMMER2_FREEMAP_LEVEL1_MASK); 841 842 if (live_chain == NULL || live_chain->bref.key != key) { 843 if (live_chain) { 844 hammer2_chain_unlock(live_chain); 845 hammer2_chain_drop(live_chain); 846 } 847 live_chain = hammer2_chain_lookup( 848 &live_parent, 849 &key_dummy, 850 key, 851 key + HAMMER2_FREEMAP_LEVEL1_MASK, 852 &error, 853 HAMMER2_LOOKUP_ALWAYS); 854 if (error) { 855 kprintf("hammer2_bulkfree: freemap lookup " 856 "error near %016jx, error %s\n", 857 (intmax_t)data_off, 858 hammer2_error_str(live_chain->error)); 859 break; 860 } 861 } 862 if (live_chain == NULL) { 863 /* 864 * XXX if we implement a full recovery mode we need 865 * to create/recreate missing freemap chains if our 866 * bmap has any allocated blocks. 867 */ 868 if (bmap->class && 869 bmap->avail != HAMMER2_FREEMAP_LEVEL0_SIZE) { 870 kprintf("hammer2_bulkfree: cannot locate " 871 "live leaf for allocated data " 872 "near %016jx\n", 873 (intmax_t)data_off); 874 } 875 goto next; 876 } 877 if (live_chain->error) { 878 kprintf("hammer2_bulkfree: unable to access freemap " 879 "near %016jx, error %s\n", 880 (intmax_t)data_off, 881 hammer2_error_str(live_chain->error)); 882 hammer2_chain_unlock(live_chain); 883 hammer2_chain_drop(live_chain); 884 live_chain = NULL; 885 goto next; 886 } 887 888 bmapindex = (data_off & HAMMER2_FREEMAP_LEVEL1_MASK) >> 889 HAMMER2_FREEMAP_LEVEL0_RADIX; 890 live = &live_chain->data->bmdata[bmapindex]; 891 892 /* 893 * Shortcut if the bitmaps match and the live linear 894 * indicator is sane. We can't do a perfect check of 895 * live->linear because the only real requirement is that 896 * if it is not block-aligned, that it not cover the space 897 * within its current block which overlaps one of the data 898 * ranges we scan. We don't retain enough fine-grained 899 * data in our scan to be able to set it exactly. 900 * 901 * TODO - we could shortcut this by testing that both 902 * live->class and bmap->class are 0, and both avails are 903 * set to HAMMER2_FREEMAP_LEVEL0_SIZE (4MB). 904 */ 905 if (bcmp(live->bitmapq, bmap->bitmapq, 906 sizeof(bmap->bitmapq)) == 0 && 907 live->linear >= bmap->linear) { 908 goto next; 909 } 910 if (hammer2_debug & 1) { 911 kprintf("live %016jx %04d.%04x (avail=%d)\n", 912 data_off, bmapindex, live->class, live->avail); 913 } 914 915 hammer2_chain_modify(live_chain, cbinfo->mtid, 0, 0); 916 live_chain->bref.check.freemap.bigmask = -1; 917 cbinfo->hmp->freemap_relaxed = 0; /* reset heuristic */ 918 live = &live_chain->data->bmdata[bmapindex]; 919 920 h2_bulkfree_sync_adjust(cbinfo, data_off, live, bmap, 921 live_chain->bref.key + 922 bmapindex * 923 HAMMER2_FREEMAP_LEVEL0_SIZE); 924 next: 925 data_off += HAMMER2_FREEMAP_LEVEL0_SIZE; 926 ++bmap; 927 } 928 if (live_chain) { 929 hammer2_chain_unlock(live_chain); 930 hammer2_chain_drop(live_chain); 931 } 932 if (live_parent) { 933 hammer2_chain_unlock(live_parent); 934 hammer2_chain_drop(live_parent); 935 } 936 return error; 937 } 938 939 /* 940 * Merge the bulkfree bitmap against the existing bitmap. 941 */ 942 static 943 void 944 h2_bulkfree_sync_adjust(hammer2_bulkfree_info_t *cbinfo, 945 hammer2_off_t data_off, hammer2_bmap_data_t *live, 946 hammer2_bmap_data_t *bmap, hammer2_key_t alloc_base) 947 { 948 int bindex; 949 int scount; 950 hammer2_off_t tmp_off; 951 hammer2_bitmap_t lmask; 952 hammer2_bitmap_t mmask; 953 954 tmp_off = data_off; 955 956 for (bindex = 0; bindex < HAMMER2_BMAP_ELEMENTS; ++bindex) { 957 lmask = live->bitmapq[bindex]; /* live */ 958 mmask = bmap->bitmapq[bindex]; /* snapshotted bulkfree */ 959 if (lmask == mmask) { 960 tmp_off += HAMMER2_BMAP_INDEX_SIZE; 961 continue; 962 } 963 964 for (scount = 0; 965 scount < HAMMER2_BMAP_BITS_PER_ELEMENT; 966 scount += 2) { 967 if ((mmask & 3) == 0) { 968 /* 969 * in-memory 00 live 11 -> 10 970 * live 10 -> 00 971 * 972 * Storage might be marked allocated or 973 * staged and must be remarked staged or 974 * free. 975 */ 976 switch (lmask & 3) { 977 case 0: /* 00 */ 978 break; 979 case 1: /* 01 */ 980 kprintf("hammer2_bulkfree: cannot " 981 "transition m=00/l=01\n"); 982 break; 983 case 2: /* 10 -> 00 */ 984 live->bitmapq[bindex] &= 985 ~((hammer2_bitmap_t)2 << scount); 986 live->avail += 987 HAMMER2_FREEMAP_BLOCK_SIZE; 988 if (live->avail > 989 HAMMER2_FREEMAP_LEVEL0_SIZE) { 990 live->avail = 991 HAMMER2_FREEMAP_LEVEL0_SIZE; 992 } 993 cbinfo->adj_free += 994 HAMMER2_FREEMAP_BLOCK_SIZE; 995 ++cbinfo->count_10_00; 996 hammer2_io_dedup_assert( 997 cbinfo->hmp, 998 tmp_off | 999 HAMMER2_FREEMAP_BLOCK_RADIX, 1000 HAMMER2_FREEMAP_BLOCK_SIZE); 1001 break; 1002 case 3: /* 11 -> 10 */ 1003 live->bitmapq[bindex] &= 1004 ~((hammer2_bitmap_t)1 << scount); 1005 ++cbinfo->count_11_10; 1006 hammer2_io_dedup_delete( 1007 cbinfo->hmp, 1008 HAMMER2_BREF_TYPE_DATA, 1009 tmp_off | 1010 HAMMER2_FREEMAP_BLOCK_RADIX, 1011 HAMMER2_FREEMAP_BLOCK_SIZE); 1012 break; 1013 } 1014 } else if ((mmask & 3) == 3) { 1015 /* 1016 * in-memory 11 live 10 -> 11 1017 * live ** -> 11 1018 * 1019 * Storage might be incorrectly marked free 1020 * or staged and must be remarked fully 1021 * allocated. 1022 */ 1023 switch (lmask & 3) { 1024 case 0: /* 00 */ 1025 ++cbinfo->count_00_11; 1026 cbinfo->adj_free -= 1027 HAMMER2_FREEMAP_BLOCK_SIZE; 1028 live->avail -= 1029 HAMMER2_FREEMAP_BLOCK_SIZE; 1030 if ((int32_t)live->avail < 0) 1031 live->avail = 0; 1032 break; 1033 case 1: /* 01 */ 1034 ++cbinfo->count_01_11; 1035 break; 1036 case 2: /* 10 -> 11 */ 1037 ++cbinfo->count_10_11; 1038 break; 1039 case 3: /* 11 */ 1040 break; 1041 } 1042 live->bitmapq[bindex] |= 1043 ((hammer2_bitmap_t)3 << scount); 1044 } 1045 mmask >>= 2; 1046 lmask >>= 2; 1047 tmp_off += HAMMER2_FREEMAP_BLOCK_SIZE; 1048 } 1049 } 1050 1051 /* 1052 * Determine if the live bitmap is completely free and reset its 1053 * fields if so. Otherwise check to see if we can reduce the linear 1054 * offset. 1055 */ 1056 for (bindex = HAMMER2_BMAP_ELEMENTS - 1; bindex >= 0; --bindex) { 1057 if (live->bitmapq[bindex] != 0) 1058 break; 1059 } 1060 if (bindex < 0) { 1061 /* 1062 * Completely empty, reset entire segment 1063 */ 1064 #if 0 1065 kprintf("hammer2: cleanseg %016jx.%04x (%d)\n", 1066 alloc_base, live->class, live->avail); 1067 #endif 1068 live->avail = HAMMER2_FREEMAP_LEVEL0_SIZE; 1069 live->class = 0; 1070 live->linear = 0; 1071 ++cbinfo->count_l0cleans; 1072 } else if (bindex < 7) { 1073 /* 1074 * Partially full, bitmapq[bindex] != 0. The live->linear 1075 * offset can legitimately be just about anything, but 1076 * our bulkfree pass doesn't record enough information to 1077 * set it exactly. Just make sure that it is set to a 1078 * safe value that also works in our match code above (the 1079 * bcmp and linear test). 1080 * 1081 * We cannot safely leave live->linear at a sub-block offset 1082 * unless it is already in the same block as bmap->linear. 1083 * 1084 * If it is not in the same block, we cannot assume that 1085 * we can set it to bmap->linear on a sub-block boundary, 1086 * because the live system could have bounced it around. 1087 * In that situation we satisfy our bcmp/skip requirement 1088 * above by setting it to the nearest higher block boundary. 1089 * This alignment effectively kills any partial allocation it 1090 * might have been tracking before. 1091 */ 1092 if (live->linear < bmap->linear && 1093 ((live->linear ^ bmap->linear) & 1094 ~HAMMER2_FREEMAP_BLOCK_MASK) == 0) { 1095 live->linear = bmap->linear; 1096 ++cbinfo->count_linadjusts; 1097 } else { 1098 live->linear = 1099 (bmap->linear + HAMMER2_FREEMAP_BLOCK_MASK) & 1100 ~HAMMER2_FREEMAP_BLOCK_MASK; 1101 ++cbinfo->count_linadjusts; 1102 } 1103 } else { 1104 /* 1105 * Completely full, effectively disable the linear iterator 1106 */ 1107 live->linear = HAMMER2_SEGSIZE; 1108 } 1109 1110 #if 0 1111 if (bmap->class) { 1112 kprintf("%016jx %04d.%04x (avail=%7d) " 1113 "%08x %08x %08x %08x %08x %08x %08x %08x\n", 1114 (intmax_t)data_off, 1115 (int)((data_off & 1116 HAMMER2_FREEMAP_LEVEL1_MASK) >> 1117 HAMMER2_FREEMAP_LEVEL0_RADIX), 1118 bmap->class, 1119 bmap->avail, 1120 bmap->bitmap[0], bmap->bitmap[1], 1121 bmap->bitmap[2], bmap->bitmap[3], 1122 bmap->bitmap[4], bmap->bitmap[5], 1123 bmap->bitmap[6], bmap->bitmap[7]); 1124 } 1125 #endif 1126 } 1127 1128 /* 1129 * BULKFREE DEDUP HEURISTIC 1130 * 1131 * WARNING! This code is SMP safe but the heuristic allows SMP collisions. 1132 * All fields must be loaded into locals and validated. 1133 */ 1134 static 1135 int 1136 h2_bulkfree_test(hammer2_bulkfree_info_t *cbinfo, hammer2_blockref_t *bref, 1137 int pri) 1138 { 1139 hammer2_dedup_t *dedup; 1140 int best; 1141 int n; 1142 int i; 1143 1144 n = hammer2_icrc32(&bref->data_off, sizeof(bref->data_off)); 1145 dedup = cbinfo->dedup + (n & (HAMMER2_DEDUP_HEUR_MASK & ~7)); 1146 1147 for (i = best = 0; i < 8; ++i) { 1148 if (dedup[i].data_off == bref->data_off) { 1149 if (dedup[i].ticks < pri) 1150 dedup[i].ticks = pri; 1151 if (pri == 1) 1152 cbinfo->count_dedup_factor += dedup[i].ticks; 1153 return 1; 1154 } 1155 if (dedup[i].ticks < dedup[best].ticks) 1156 best = i; 1157 } 1158 dedup[best].data_off = bref->data_off; 1159 dedup[best].ticks = pri; 1160 1161 return 0; 1162 } 1163