1 /* 2 * Copyright (c) 2013-2018 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@dragonflybsd.org> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 #include <sys/param.h> 35 #include <sys/systm.h> 36 #include <sys/kernel.h> 37 #include <sys/fcntl.h> 38 #include <sys/buf.h> 39 #include <sys/proc.h> 40 #include <sys/namei.h> 41 #include <sys/mount.h> 42 #include <sys/vnode.h> 43 #include <sys/mountctl.h> 44 #include <vm/vm_kern.h> 45 #include <vm/vm_extern.h> 46 47 #include "hammer2.h" 48 49 /* 50 * XXX I made a mistake and made the reserved area begin at each LEVEL1 zone, 51 * which is on a 1GB demark. This will eat a little more space but for 52 * now we retain compatibility and make FMZONEBASE every 1GB 53 */ 54 #define H2FMZONEBASE(key) ((key) & ~HAMMER2_FREEMAP_LEVEL1_MASK) 55 #define H2FMBASE(key, radix) ((key) & ~(((hammer2_off_t)1 << (radix)) - 1)) 56 #define H2FMSHIFT(radix) ((hammer2_off_t)1 << (radix)) 57 58 /* 59 * breadth-first search 60 */ 61 typedef struct hammer2_chain_save { 62 TAILQ_ENTRY(hammer2_chain_save) entry; 63 hammer2_chain_t *chain; 64 int pri; 65 } hammer2_chain_save_t; 66 67 TAILQ_HEAD(hammer2_chain_save_list, hammer2_chain_save); 68 typedef struct hammer2_chain_save_list hammer2_chain_save_list_t; 69 70 typedef struct hammer2_bulkfree_info { 71 hammer2_dev_t *hmp; 72 kmem_anon_desc_t kp; 73 hammer2_off_t sbase; /* sub-loop iteration */ 74 hammer2_off_t sstop; 75 hammer2_bmap_data_t *bmap; 76 int depth; 77 long count_10_00; /* staged->free */ 78 long count_11_10; /* allocated->staged */ 79 long count_00_11; /* (should not happen) */ 80 long count_01_11; /* (should not happen) */ 81 long count_10_11; /* staged->allocated */ 82 long count_l0cleans; 83 long count_linadjusts; 84 long count_inodes_scanned; 85 long count_dirents_scanned; 86 long count_dedup_factor; 87 long count_bytes_scanned; 88 long count_chains_scanned; 89 long count_chains_reported; 90 long bulkfree_calls; 91 int bulkfree_ticks; 92 hammer2_off_t adj_free; 93 hammer2_tid_t mtid; 94 hammer2_tid_t saved_mirror_tid; 95 time_t save_time; 96 hammer2_chain_save_list_t list; 97 hammer2_dedup_t *dedup; 98 int pri; 99 } hammer2_bulkfree_info_t; 100 101 static int h2_bulkfree_test(hammer2_bulkfree_info_t *info, 102 hammer2_blockref_t *bref, int pri); 103 104 /* 105 * General bulk scan function with callback. Called with a referenced 106 * but UNLOCKED parent. The parent is returned in the same state. 107 */ 108 static 109 int 110 hammer2_bulk_scan(hammer2_chain_t *parent, 111 int (*func)(hammer2_bulkfree_info_t *info, 112 hammer2_blockref_t *bref), 113 hammer2_bulkfree_info_t *info) 114 { 115 hammer2_blockref_t bref; 116 hammer2_chain_t *chain; 117 int first = 1; 118 int rup_error; 119 int error; 120 121 ++info->pri; 122 123 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS | 124 HAMMER2_RESOLVE_SHARED); 125 chain = NULL; 126 rup_error = 0; 127 error = 0; 128 129 /* 130 * Generally loop on the contents if we have not been flagged 131 * for abort. 132 * 133 * Remember that these chains are completely isolated from 134 * the frontend, so we can release locks temporarily without 135 * imploding. 136 */ 137 for (;;) { 138 error |= hammer2_chain_scan(parent, &chain, &bref, &first, 139 HAMMER2_LOOKUP_NODATA | 140 HAMMER2_LOOKUP_SHARED); 141 142 /* 143 * Handle EOF or other error at current level. This stops 144 * the bulkfree scan. 145 */ 146 if (error) 147 break; 148 149 /* 150 * Account for dirents before thre data_off test, since most 151 * dirents do not need a data reference. 152 */ 153 if (bref.type == HAMMER2_BREF_TYPE_DIRENT) 154 ++info->count_dirents_scanned; 155 156 /* 157 * Ignore brefs without data (typically dirents) 158 */ 159 if ((bref.data_off & ~HAMMER2_OFF_MASK_RADIX) == 0) 160 continue; 161 162 /* 163 * Process bref, chain is only non-NULL if the bref 164 * might be recursable (its possible that we sometimes get 165 * a non-NULL chain where the bref cannot be recursed). 166 */ 167 ++info->pri; 168 if (h2_bulkfree_test(info, &bref, 1)) 169 continue; 170 171 if (bref.type == HAMMER2_BREF_TYPE_INODE) 172 ++info->count_inodes_scanned; 173 174 error |= func(info, &bref); 175 if (error) 176 break; 177 178 /* 179 * A non-null chain is always returned if it is 180 * recursive, otherwise a non-null chain might be 181 * returned but usually is not when not recursive. 182 */ 183 if (chain == NULL) 184 continue; 185 186 if (chain) { 187 info->count_bytes_scanned += chain->bytes; 188 ++info->count_chains_scanned; 189 190 if (info->count_chains_scanned >= 191 info->count_chains_reported + 50000) { 192 kprintf(" chains %-7ld inodes %-7ld " 193 "dirents %-7ld bytes %5ldMB\n", 194 info->count_chains_scanned, 195 info->count_inodes_scanned, 196 info->count_dirents_scanned, 197 info->count_bytes_scanned / 1000000); 198 info->count_chains_reported += 50000; 199 } 200 } 201 202 203 /* 204 * Else check type and setup depth-first scan. 205 * 206 * Account for bytes actually read. 207 */ 208 switch(chain->bref.type) { 209 case HAMMER2_BREF_TYPE_INODE: 210 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 211 case HAMMER2_BREF_TYPE_INDIRECT: 212 case HAMMER2_BREF_TYPE_VOLUME: 213 case HAMMER2_BREF_TYPE_FREEMAP: 214 ++info->depth; 215 if (info->depth > 16) { 216 hammer2_chain_save_t *save; 217 save = kmalloc(sizeof(*save), M_HAMMER2, 218 M_WAITOK | M_ZERO); 219 save->chain = chain; 220 hammer2_chain_ref(chain); 221 TAILQ_INSERT_TAIL(&info->list, save, entry); 222 223 /* guess */ 224 info->pri += 10; 225 } else { 226 int savepri = info->pri; 227 228 hammer2_chain_unlock(chain); 229 hammer2_chain_unlock(parent); 230 info->pri = 0; 231 rup_error |= 232 hammer2_bulk_scan(chain, func, info); 233 info->pri += savepri; 234 hammer2_chain_lock(parent, 235 HAMMER2_RESOLVE_ALWAYS | 236 HAMMER2_RESOLVE_SHARED); 237 hammer2_chain_lock(chain, 238 HAMMER2_RESOLVE_ALWAYS | 239 HAMMER2_RESOLVE_SHARED); 240 } 241 --info->depth; 242 break; 243 case HAMMER2_BREF_TYPE_DATA: 244 break; 245 default: 246 /* does not recurse */ 247 break; 248 } 249 if (rup_error & HAMMER2_ERROR_ABORTED) 250 break; 251 } 252 if (chain) { 253 hammer2_chain_unlock(chain); 254 hammer2_chain_drop(chain); 255 } 256 257 /* 258 * Save with higher pri now that we know what it is. 259 */ 260 h2_bulkfree_test(info, &parent->bref, info->pri + 1); 261 262 hammer2_chain_unlock(parent); 263 264 return ((error | rup_error) & ~HAMMER2_ERROR_EOF); 265 } 266 267 /* 268 * Bulkfree algorithm 269 * 270 * Repeat { 271 * Chain flush (partial synchronization) XXX removed 272 * Scan the whole topology - build in-memory freemap (mark 11) 273 * Reconcile the in-memory freemap against the on-disk freemap. 274 * ondisk xx -> ondisk 11 (if allocated) 275 * ondisk 11 -> ondisk 10 (if free in-memory) 276 * ondisk 10 -> ondisk 00 (if free in-memory) - on next pass 277 * } 278 * 279 * The topology scan may have to be performed multiple times to window 280 * freemaps which are too large to fit in kernel memory. 281 * 282 * Races are handled using a double-transition (11->10, 10->00). The bulkfree 283 * scan snapshots the volume root's blockset and thus can run concurrent with 284 * normal operations, as long as a full flush is made between each pass to 285 * synchronize any modified chains (otherwise their blocks might be improperly 286 * freed). 287 * 288 * Temporary memory in multiples of 64KB is required to reconstruct the leaf 289 * hammer2_bmap_data blocks so they can later be compared against the live 290 * freemap. Each 64KB block represents 128 x 16KB x 1024 = ~2 GB of storage. 291 * A 32MB save area thus represents around ~1 TB. The temporary memory 292 * allocated can be specified. If it is not sufficient multiple topology 293 * passes will be made. 294 */ 295 296 /* 297 * Bulkfree callback info 298 */ 299 static void hammer2_bulkfree_thread(void *arg __unused); 300 static void cbinfo_bmap_init(hammer2_bulkfree_info_t *cbinfo, size_t size); 301 static int h2_bulkfree_callback(hammer2_bulkfree_info_t *cbinfo, 302 hammer2_blockref_t *bref); 303 static int h2_bulkfree_sync(hammer2_bulkfree_info_t *cbinfo); 304 static void h2_bulkfree_sync_adjust(hammer2_bulkfree_info_t *cbinfo, 305 hammer2_off_t data_off, hammer2_bmap_data_t *live, 306 hammer2_bmap_data_t *bmap, hammer2_key_t alloc_base); 307 308 void 309 hammer2_bulkfree_init(hammer2_dev_t *hmp) 310 { 311 hammer2_thr_create(&hmp->bfthr, NULL, hmp, 312 hmp->devrepname, -1, -1, 313 hammer2_bulkfree_thread); 314 } 315 316 void 317 hammer2_bulkfree_uninit(hammer2_dev_t *hmp) 318 { 319 hammer2_thr_delete(&hmp->bfthr); 320 } 321 322 static void 323 hammer2_bulkfree_thread(void *arg) 324 { 325 hammer2_thread_t *thr = arg; 326 hammer2_ioc_bulkfree_t bfi; 327 uint32_t flags; 328 329 for (;;) { 330 hammer2_thr_wait_any(thr, 331 HAMMER2_THREAD_STOP | 332 HAMMER2_THREAD_FREEZE | 333 HAMMER2_THREAD_UNFREEZE | 334 HAMMER2_THREAD_REMASTER, 335 hz * 60); 336 337 flags = thr->flags; 338 cpu_ccfence(); 339 if (flags & HAMMER2_THREAD_STOP) 340 break; 341 if (flags & HAMMER2_THREAD_FREEZE) { 342 hammer2_thr_signal2(thr, HAMMER2_THREAD_FROZEN, 343 HAMMER2_THREAD_FREEZE); 344 continue; 345 } 346 if (flags & HAMMER2_THREAD_UNFREEZE) { 347 hammer2_thr_signal2(thr, 0, 348 HAMMER2_THREAD_FROZEN | 349 HAMMER2_THREAD_UNFREEZE); 350 continue; 351 } 352 if (flags & HAMMER2_THREAD_FROZEN) 353 continue; 354 if (flags & HAMMER2_THREAD_REMASTER) { 355 hammer2_thr_signal2(thr, 0, HAMMER2_THREAD_REMASTER); 356 bzero(&bfi, sizeof(bfi)); 357 bfi.size = 8192 * 1024; 358 /* hammer2_bulkfree_pass(thr->hmp, &bfi); */ 359 } 360 } 361 thr->td = NULL; 362 hammer2_thr_signal(thr, HAMMER2_THREAD_STOPPED); 363 /* structure can go invalid at this point */ 364 } 365 366 int 367 hammer2_bulkfree_pass(hammer2_dev_t *hmp, hammer2_chain_t *vchain, 368 hammer2_ioc_bulkfree_t *bfi) 369 { 370 hammer2_bulkfree_info_t cbinfo; 371 hammer2_chain_save_t *save; 372 hammer2_off_t incr; 373 size_t size; 374 int error; 375 376 /* 377 * We have to clear the live dedup cache as it might have entries 378 * that are freeable as of now. Any new entries in the dedup cache 379 * made after this point, even if they become freeable, will have 380 * previously been fully allocated and will be protected by the 381 * 2-stage bulkfree. 382 */ 383 hammer2_dedup_clear(hmp); 384 385 /* 386 * Setup for free pass 387 */ 388 bzero(&cbinfo, sizeof(cbinfo)); 389 size = (bfi->size + HAMMER2_FREEMAP_LEVELN_PSIZE - 1) & 390 ~(size_t)(HAMMER2_FREEMAP_LEVELN_PSIZE - 1); 391 if (size < 1024 * 1024) 392 size = 1024 * 1024; 393 if (size > kmem_lim_size() * 1024 * 1024 / 16) 394 size = kmem_lim_size() * 1024 * 1024 / 16; 395 396 cbinfo.hmp = hmp; 397 cbinfo.bmap = kmem_alloc_swapbacked(&cbinfo.kp, size, VM_SUBSYS_HAMMER); 398 cbinfo.saved_mirror_tid = hmp->voldata.mirror_tid; 399 400 cbinfo.dedup = kmalloc(sizeof(*cbinfo.dedup) * HAMMER2_DEDUP_HEUR_SIZE, 401 M_HAMMER2, M_WAITOK | M_ZERO); 402 403 /* 404 * Normalize start point to a 2GB boundary. We operate on a 405 * 64KB leaf bitmap boundary which represents 2GB of storage. 406 */ 407 cbinfo.sbase = bfi->sbase; 408 if (cbinfo.sbase > hmp->voldata.volu_size) 409 cbinfo.sbase = hmp->voldata.volu_size; 410 cbinfo.sbase &= ~HAMMER2_FREEMAP_LEVEL1_MASK; 411 TAILQ_INIT(&cbinfo.list); 412 413 cbinfo.bulkfree_ticks = ticks; 414 415 /* 416 * Loop on a full meta-data scan as many times as required to 417 * get through all available storage. 418 */ 419 error = 0; 420 while (cbinfo.sbase < hmp->voldata.volu_size) { 421 /* 422 * We have enough ram to represent (incr) bytes of storage. 423 * Each 64KB of ram represents 2GB of storage. 424 * 425 * We must also clean out our de-duplication heuristic for 426 * each (incr) bytes of storage, otherwise we wind up not 427 * scanning meta-data for later areas of storage because 428 * they had already been scanned in earlier areas of storage. 429 * Since the ranging is different, we have to restart 430 * the dedup heuristic too. 431 */ 432 cbinfo_bmap_init(&cbinfo, size); 433 bzero(cbinfo.dedup, sizeof(*cbinfo.dedup) * 434 HAMMER2_DEDUP_HEUR_SIZE); 435 cbinfo.count_inodes_scanned = 0; 436 cbinfo.count_dirents_scanned = 0; 437 cbinfo.count_bytes_scanned = 0; 438 cbinfo.count_chains_scanned = 0; 439 cbinfo.count_chains_reported = 0; 440 441 incr = size / HAMMER2_FREEMAP_LEVELN_PSIZE * 442 HAMMER2_FREEMAP_LEVEL1_SIZE; 443 if (hmp->voldata.volu_size - cbinfo.sbase < incr) 444 cbinfo.sstop = hmp->voldata.volu_size; 445 else 446 cbinfo.sstop = cbinfo.sbase + incr; 447 if (hammer2_debug & 1) { 448 kprintf("bulkfree pass %016jx/%jdGB\n", 449 (intmax_t)cbinfo.sbase, 450 (intmax_t)incr / HAMMER2_FREEMAP_LEVEL1_SIZE); 451 } 452 453 /* 454 * Scan topology for stuff inside this range. 455 * 456 * NOTE - By not using a transaction the operation can 457 * run concurrent with the frontend as well as 458 * with flushes. 459 * 460 * We cannot safely set a mtid without a transaction, 461 * and in fact we don't want to set one anyway. We 462 * want the bulkfree to be passive and no interfere 463 * with crash recovery. 464 */ 465 #undef HAMMER2_BULKFREE_TRANS /* undef - don't use transaction */ 466 #ifdef HAMMER2_BULKFREE_TRANS 467 hammer2_trans_init(hmp->spmp, 0); 468 cbinfo.mtid = hammer2_trans_sub(hmp->spmp); 469 #else 470 cbinfo.mtid = 0; 471 #endif 472 cbinfo.pri = 0; 473 error |= hammer2_bulk_scan(vchain, h2_bulkfree_callback, 474 &cbinfo); 475 476 while ((save = TAILQ_FIRST(&cbinfo.list)) != NULL && 477 error == 0) { 478 TAILQ_REMOVE(&cbinfo.list, save, entry); 479 cbinfo.pri = 0; 480 error |= hammer2_bulk_scan(save->chain, 481 h2_bulkfree_callback, 482 &cbinfo); 483 hammer2_chain_drop(save->chain); 484 kfree(save, M_HAMMER2); 485 } 486 while (save) { 487 TAILQ_REMOVE(&cbinfo.list, save, entry); 488 hammer2_chain_drop(save->chain); 489 kfree(save, M_HAMMER2); 490 save = TAILQ_FIRST(&cbinfo.list); 491 } 492 493 kprintf("bulkfree lastdrop %d %d error=0x%04x\n", 494 vchain->refs, vchain->core.chain_count, error); 495 496 /* 497 * If the complete scan succeeded we can synchronize our 498 * in-memory freemap against live storage. If an abort 499 * occured we cannot safely synchronize our partially 500 * filled-out in-memory freemap. 501 */ 502 if (error == 0) { 503 error = h2_bulkfree_sync(&cbinfo); 504 505 hammer2_voldata_lock(hmp); 506 hammer2_voldata_modify(hmp); 507 hmp->voldata.allocator_free += cbinfo.adj_free; 508 hammer2_voldata_unlock(hmp); 509 } 510 511 /* 512 * Cleanup for next loop. 513 */ 514 #ifdef HAMMER2_BULKFREE_TRANS 515 hammer2_trans_done(hmp->spmp); 516 #endif 517 if (error) 518 break; 519 cbinfo.sbase = cbinfo.sstop; 520 cbinfo.adj_free = 0; 521 } 522 kmem_free_swapbacked(&cbinfo.kp); 523 kfree(cbinfo.dedup, M_HAMMER2); 524 cbinfo.dedup = NULL; 525 526 bfi->sstop = cbinfo.sbase; 527 528 incr = bfi->sstop / (hmp->voldata.volu_size / 10000); 529 if (incr > 10000) 530 incr = 10000; 531 532 kprintf("bulkfree pass statistics (%d.%02d%% storage processed):\n", 533 (int)incr / 100, 534 (int)incr % 100); 535 536 if (error) { 537 kprintf(" bulkfree was aborted\n"); 538 } else { 539 kprintf(" transition->free %ld\n", cbinfo.count_10_00); 540 kprintf(" transition->staged %ld\n", cbinfo.count_11_10); 541 kprintf(" ERR(00)->allocated %ld\n", cbinfo.count_00_11); 542 kprintf(" ERR(01)->allocated %ld\n", cbinfo.count_01_11); 543 kprintf(" staged->allocated %ld\n", cbinfo.count_10_11); 544 kprintf(" ~2MB segs cleaned %ld\n", cbinfo.count_l0cleans); 545 kprintf(" linear adjusts %ld\n", 546 cbinfo.count_linadjusts); 547 kprintf(" dedup factor %ld\n", 548 cbinfo.count_dedup_factor); 549 } 550 551 return error; 552 } 553 554 static void 555 cbinfo_bmap_init(hammer2_bulkfree_info_t *cbinfo, size_t size) 556 { 557 hammer2_bmap_data_t *bmap = cbinfo->bmap; 558 hammer2_key_t key = cbinfo->sbase; 559 hammer2_key_t lokey; 560 hammer2_key_t hikey; 561 562 lokey = (cbinfo->hmp->voldata.allocator_beg + HAMMER2_SEGMASK64) & 563 ~HAMMER2_SEGMASK64; 564 hikey = cbinfo->hmp->voldata.volu_size & ~HAMMER2_SEGMASK64; 565 566 bzero(bmap, size); 567 while (size) { 568 bzero(bmap, sizeof(*bmap)); 569 if (lokey < H2FMBASE(key, HAMMER2_FREEMAP_LEVEL1_RADIX)) 570 lokey = H2FMBASE(key, HAMMER2_FREEMAP_LEVEL1_RADIX); 571 if (lokey < H2FMZONEBASE(key) + HAMMER2_ZONE_SEG64) 572 lokey = H2FMZONEBASE(key) + HAMMER2_ZONE_SEG64; 573 if (key < lokey || key >= hikey) { 574 memset(bmap->bitmapq, -1, 575 sizeof(bmap->bitmapq)); 576 bmap->avail = 0; 577 bmap->linear = HAMMER2_SEGSIZE; 578 } else { 579 bmap->avail = H2FMSHIFT(HAMMER2_FREEMAP_LEVEL0_RADIX); 580 } 581 size -= sizeof(*bmap); 582 key += HAMMER2_FREEMAP_LEVEL0_SIZE; 583 ++bmap; 584 } 585 } 586 587 static int 588 h2_bulkfree_callback(hammer2_bulkfree_info_t *cbinfo, hammer2_blockref_t *bref) 589 { 590 hammer2_bmap_data_t *bmap; 591 hammer2_off_t data_off; 592 uint16_t class; 593 size_t bytes; 594 int radix; 595 596 /* 597 * Check for signal and allow yield to userland during scan. 598 */ 599 if (hammer2_signal_check(&cbinfo->save_time)) 600 return HAMMER2_ERROR_ABORTED; 601 602 /* 603 * Deal with kernel thread cpu or I/O hogging by limiting the 604 * number of chains scanned per second to hammer2_bulkfree_tps. 605 * Ignore leaf records (DIRENT and DATA), no per-record I/O is 606 * involved for those since we don't load their data. 607 */ 608 if (bref->type != HAMMER2_BREF_TYPE_DATA && 609 bref->type != HAMMER2_BREF_TYPE_DIRENT) { 610 ++cbinfo->bulkfree_calls; 611 if (cbinfo->bulkfree_calls > hammer2_bulkfree_tps) { 612 int dticks = ticks - cbinfo->bulkfree_ticks; 613 if (dticks < 0) 614 dticks = 0; 615 if (dticks < hz) { 616 tsleep(&cbinfo->bulkfree_ticks, 0, 617 "h2bw", hz - dticks); 618 } 619 cbinfo->bulkfree_calls = 0; 620 cbinfo->bulkfree_ticks = ticks; 621 } 622 } 623 624 /* 625 * Calculate the data offset and determine if it is within 626 * the current freemap range being gathered. 627 */ 628 data_off = bref->data_off & ~HAMMER2_OFF_MASK_RADIX; 629 if (data_off < cbinfo->sbase || data_off >= cbinfo->sstop) 630 return 0; 631 if (data_off < cbinfo->hmp->voldata.allocator_beg) 632 return 0; 633 if (data_off >= cbinfo->hmp->voldata.volu_size) 634 return 0; 635 636 /* 637 * Calculate the information needed to generate the in-memory 638 * freemap record. 639 * 640 * Hammer2 does not allow allocations to cross the L1 (2GB) boundary, 641 * it's a problem if it does. (Or L0 (2MB) for that matter). 642 */ 643 radix = (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX); 644 KKASSERT(radix != 0); 645 bytes = (size_t)1 << radix; 646 class = (bref->type << 8) | hammer2_devblkradix(radix); 647 648 if (data_off + bytes > cbinfo->sstop) { 649 kprintf("hammer2_bulkfree_scan: illegal 2GB boundary " 650 "%016jx %016jx/%d\n", 651 (intmax_t)bref->data_off, 652 (intmax_t)bref->key, 653 bref->keybits); 654 bytes = cbinfo->sstop - data_off; /* XXX */ 655 } 656 657 /* 658 * Convert to a storage offset relative to the beginning of the 659 * storage range we are collecting. Then lookup the level0 bmap entry. 660 */ 661 data_off -= cbinfo->sbase; 662 bmap = cbinfo->bmap + (data_off >> HAMMER2_FREEMAP_LEVEL0_RADIX); 663 664 /* 665 * Convert data_off to a bmap-relative value (~4MB storage range). 666 * Adjust linear, class, and avail. 667 * 668 * Hammer2 does not allow allocations to cross the L0 (4MB) boundary, 669 */ 670 data_off &= HAMMER2_FREEMAP_LEVEL0_MASK; 671 if (data_off + bytes > HAMMER2_FREEMAP_LEVEL0_SIZE) { 672 kprintf("hammer2_bulkfree_scan: illegal 4MB boundary " 673 "%016jx %016jx/%d\n", 674 (intmax_t)bref->data_off, 675 (intmax_t)bref->key, 676 bref->keybits); 677 bytes = HAMMER2_FREEMAP_LEVEL0_SIZE - data_off; 678 } 679 680 if (bmap->class == 0) { 681 bmap->class = class; 682 bmap->avail = HAMMER2_FREEMAP_LEVEL0_SIZE; 683 } 684 685 /* 686 * NOTE: bmap->class does not have to match class. Classification 687 * is relaxed when free space is low, so some mixing can occur. 688 */ 689 #if 0 690 /* 691 * XXX removed 692 */ 693 if (bmap->class != class) { 694 kprintf("hammer2_bulkfree_scan: illegal mixed class " 695 "%016jx %016jx/%d (%04x vs %04x)\n", 696 (intmax_t)bref->data_off, 697 (intmax_t)bref->key, 698 bref->keybits, 699 class, bmap->class); 700 } 701 #endif 702 703 /* 704 * Just record the highest byte-granular offset for now. Do not 705 * match against allocations which are in multiples of whole blocks. 706 * 707 * Make sure that any in-block linear offset at least covers the 708 * data range. This can cause bmap->linear to become block-aligned. 709 */ 710 if (bytes & HAMMER2_FREEMAP_BLOCK_MASK) { 711 if (bmap->linear < (int32_t)data_off + (int32_t)bytes) 712 bmap->linear = (int32_t)data_off + (int32_t)bytes; 713 } else if (bmap->linear >= (int32_t)data_off && 714 bmap->linear < (int32_t)data_off + (int32_t)bytes) { 715 bmap->linear = (int32_t)data_off + (int32_t)bytes; 716 } 717 718 /* 719 * Adjust the hammer2_bitmap_t bitmap[HAMMER2_BMAP_ELEMENTS]. 720 * 64-bit entries, 2 bits per entry, to code 11. 721 * 722 * NOTE: data_off mask to 524288, shift right by 14 (radix for 16384), 723 * and multiply shift amount by 2 for sets of 2 bits. 724 * 725 * NOTE: The allocation can be smaller than HAMMER2_FREEMAP_BLOCK_SIZE. 726 * also, data_off may not be FREEMAP_BLOCK_SIZE aligned. 727 */ 728 while (bytes > 0) { 729 hammer2_bitmap_t bmask; 730 int bindex; 731 732 bindex = (int)data_off >> (HAMMER2_FREEMAP_BLOCK_RADIX + 733 HAMMER2_BMAP_INDEX_RADIX); 734 bmask = (hammer2_bitmap_t)3 << 735 ((((int)data_off & HAMMER2_BMAP_INDEX_MASK) >> 736 HAMMER2_FREEMAP_BLOCK_RADIX) << 1); 737 738 /* 739 * NOTE! The (avail) calculation is bitmap-granular. Multiple 740 * sub-granular records can wind up at the same bitmap 741 * position. 742 */ 743 if ((bmap->bitmapq[bindex] & bmask) == 0) { 744 if (bytes < HAMMER2_FREEMAP_BLOCK_SIZE) { 745 bmap->avail -= HAMMER2_FREEMAP_BLOCK_SIZE; 746 } else { 747 bmap->avail -= bytes; 748 } 749 bmap->bitmapq[bindex] |= bmask; 750 } 751 data_off += HAMMER2_FREEMAP_BLOCK_SIZE; 752 if (bytes < HAMMER2_FREEMAP_BLOCK_SIZE) 753 bytes = 0; 754 else 755 bytes -= HAMMER2_FREEMAP_BLOCK_SIZE; 756 } 757 return 0; 758 } 759 760 /* 761 * Synchronize the in-memory bitmap with the live freemap. This is not a 762 * direct copy. Instead the bitmaps must be compared: 763 * 764 * In-memory Live-freemap 765 * 00 11 -> 10 (do nothing if live modified) 766 * 10 -> 00 (do nothing if live modified) 767 * 11 10 -> 11 handles race against live 768 * ** -> 11 nominally warn of corruption 769 * 770 * We must also fixup the hints in HAMMER2_BREF_TYPE_FREEMAP_LEAF. 771 */ 772 static int 773 h2_bulkfree_sync(hammer2_bulkfree_info_t *cbinfo) 774 { 775 hammer2_off_t data_off; 776 hammer2_key_t key; 777 hammer2_key_t key_dummy; 778 hammer2_bmap_data_t *bmap; 779 hammer2_bmap_data_t *live; 780 hammer2_chain_t *live_parent; 781 hammer2_chain_t *live_chain; 782 int bmapindex; 783 int error; 784 785 kprintf("hammer2_bulkfree - range "); 786 787 if (cbinfo->sbase < cbinfo->hmp->voldata.allocator_beg) 788 kprintf("%016jx-", 789 (intmax_t)cbinfo->hmp->voldata.allocator_beg); 790 else 791 kprintf("%016jx-", 792 (intmax_t)cbinfo->sbase); 793 794 if (cbinfo->sstop > cbinfo->hmp->voldata.volu_size) 795 kprintf("%016jx\n", 796 (intmax_t)cbinfo->hmp->voldata.volu_size); 797 else 798 kprintf("%016jx\n", 799 (intmax_t)cbinfo->sstop); 800 801 data_off = cbinfo->sbase; 802 bmap = cbinfo->bmap; 803 804 live_parent = &cbinfo->hmp->fchain; 805 hammer2_chain_ref(live_parent); 806 hammer2_chain_lock(live_parent, HAMMER2_RESOLVE_ALWAYS); 807 live_chain = NULL; 808 error = 0; 809 810 /* 811 * Iterate each hammer2_bmap_data_t line (128 bytes) managing 812 * 4MB of storage. 813 */ 814 while (data_off < cbinfo->sstop) { 815 /* 816 * The freemap is not used below allocator_beg or beyond 817 * volu_size. 818 */ 819 820 if (data_off < cbinfo->hmp->voldata.allocator_beg) 821 goto next; 822 if (data_off >= cbinfo->hmp->voldata.volu_size) 823 goto next; 824 825 /* 826 * Locate the freemap leaf on the live filesystem 827 */ 828 key = (data_off & ~HAMMER2_FREEMAP_LEVEL1_MASK); 829 830 if (live_chain == NULL || live_chain->bref.key != key) { 831 if (live_chain) { 832 hammer2_chain_unlock(live_chain); 833 hammer2_chain_drop(live_chain); 834 } 835 live_chain = hammer2_chain_lookup( 836 &live_parent, 837 &key_dummy, 838 key, 839 key + HAMMER2_FREEMAP_LEVEL1_MASK, 840 &error, 841 HAMMER2_LOOKUP_ALWAYS); 842 if (error) { 843 kprintf("hammer2_bulkfree: freemap lookup " 844 "error near %016jx, error %s\n", 845 (intmax_t)data_off, 846 hammer2_error_str(live_chain->error)); 847 break; 848 } 849 } 850 if (live_chain == NULL) { 851 /* 852 * XXX if we implement a full recovery mode we need 853 * to create/recreate missing freemap chains if our 854 * bmap has any allocated blocks. 855 */ 856 if (bmap->class && 857 bmap->avail != HAMMER2_FREEMAP_LEVEL0_SIZE) { 858 kprintf("hammer2_bulkfree: cannot locate " 859 "live leaf for allocated data " 860 "near %016jx\n", 861 (intmax_t)data_off); 862 } 863 goto next; 864 } 865 if (live_chain->error) { 866 kprintf("hammer2_bulkfree: unable to access freemap " 867 "near %016jx, error %s\n", 868 (intmax_t)data_off, 869 hammer2_error_str(live_chain->error)); 870 hammer2_chain_unlock(live_chain); 871 hammer2_chain_drop(live_chain); 872 live_chain = NULL; 873 goto next; 874 } 875 876 bmapindex = (data_off & HAMMER2_FREEMAP_LEVEL1_MASK) >> 877 HAMMER2_FREEMAP_LEVEL0_RADIX; 878 live = &live_chain->data->bmdata[bmapindex]; 879 880 /* 881 * Shortcut if the bitmaps match and the live linear 882 * indicator is sane. We can't do a perfect check of 883 * live->linear because the only real requirement is that 884 * if it is not block-aligned, that it not cover the space 885 * within its current block which overlaps one of the data 886 * ranges we scan. We don't retain enough fine-grained 887 * data in our scan to be able to set it exactly. 888 * 889 * TODO - we could shortcut this by testing that both 890 * live->class and bmap->class are 0, and both avails are 891 * set to HAMMER2_FREEMAP_LEVEL0_SIZE (4MB). 892 */ 893 if (bcmp(live->bitmapq, bmap->bitmapq, 894 sizeof(bmap->bitmapq)) == 0 && 895 live->linear >= bmap->linear) { 896 goto next; 897 } 898 if (hammer2_debug & 1) { 899 kprintf("live %016jx %04d.%04x (avail=%d)\n", 900 data_off, bmapindex, live->class, live->avail); 901 } 902 903 hammer2_chain_modify(live_chain, cbinfo->mtid, 0, 0); 904 live_chain->bref.check.freemap.bigmask = -1; 905 cbinfo->hmp->freemap_relaxed = 0; /* reset heuristic */ 906 live = &live_chain->data->bmdata[bmapindex]; 907 908 h2_bulkfree_sync_adjust(cbinfo, data_off, live, bmap, 909 live_chain->bref.key + 910 bmapindex * 911 HAMMER2_FREEMAP_LEVEL0_SIZE); 912 next: 913 data_off += HAMMER2_FREEMAP_LEVEL0_SIZE; 914 ++bmap; 915 } 916 if (live_chain) { 917 hammer2_chain_unlock(live_chain); 918 hammer2_chain_drop(live_chain); 919 } 920 if (live_parent) { 921 hammer2_chain_unlock(live_parent); 922 hammer2_chain_drop(live_parent); 923 } 924 return error; 925 } 926 927 /* 928 * Merge the bulkfree bitmap against the existing bitmap. 929 */ 930 static 931 void 932 h2_bulkfree_sync_adjust(hammer2_bulkfree_info_t *cbinfo, 933 hammer2_off_t data_off, hammer2_bmap_data_t *live, 934 hammer2_bmap_data_t *bmap, hammer2_key_t alloc_base) 935 { 936 int bindex; 937 int scount; 938 hammer2_off_t tmp_off; 939 hammer2_bitmap_t lmask; 940 hammer2_bitmap_t mmask; 941 942 tmp_off = data_off; 943 944 for (bindex = 0; bindex < HAMMER2_BMAP_ELEMENTS; ++bindex) { 945 lmask = live->bitmapq[bindex]; /* live */ 946 mmask = bmap->bitmapq[bindex]; /* snapshotted bulkfree */ 947 if (lmask == mmask) { 948 tmp_off += HAMMER2_BMAP_INDEX_SIZE; 949 continue; 950 } 951 952 for (scount = 0; 953 scount < HAMMER2_BMAP_BITS_PER_ELEMENT; 954 scount += 2) { 955 if ((mmask & 3) == 0) { 956 /* 957 * in-memory 00 live 11 -> 10 958 * live 10 -> 00 959 * 960 * Storage might be marked allocated or 961 * staged and must be remarked staged or 962 * free. 963 */ 964 switch (lmask & 3) { 965 case 0: /* 00 */ 966 break; 967 case 1: /* 01 */ 968 kprintf("hammer2_bulkfree: cannot " 969 "transition m=00/l=01\n"); 970 break; 971 case 2: /* 10 -> 00 */ 972 live->bitmapq[bindex] &= 973 ~((hammer2_bitmap_t)2 << scount); 974 live->avail += 975 HAMMER2_FREEMAP_BLOCK_SIZE; 976 if (live->avail > 977 HAMMER2_FREEMAP_LEVEL0_SIZE) { 978 live->avail = 979 HAMMER2_FREEMAP_LEVEL0_SIZE; 980 } 981 cbinfo->adj_free += 982 HAMMER2_FREEMAP_BLOCK_SIZE; 983 ++cbinfo->count_10_00; 984 hammer2_io_dedup_assert( 985 cbinfo->hmp, 986 tmp_off | 987 HAMMER2_FREEMAP_BLOCK_RADIX, 988 HAMMER2_FREEMAP_BLOCK_SIZE); 989 break; 990 case 3: /* 11 -> 10 */ 991 live->bitmapq[bindex] &= 992 ~((hammer2_bitmap_t)1 << scount); 993 ++cbinfo->count_11_10; 994 hammer2_io_dedup_delete( 995 cbinfo->hmp, 996 HAMMER2_BREF_TYPE_DATA, 997 tmp_off | 998 HAMMER2_FREEMAP_BLOCK_RADIX, 999 HAMMER2_FREEMAP_BLOCK_SIZE); 1000 break; 1001 } 1002 } else if ((mmask & 3) == 3) { 1003 /* 1004 * in-memory 11 live 10 -> 11 1005 * live ** -> 11 1006 * 1007 * Storage might be incorrectly marked free 1008 * or staged and must be remarked fully 1009 * allocated. 1010 */ 1011 switch (lmask & 3) { 1012 case 0: /* 00 */ 1013 ++cbinfo->count_00_11; 1014 cbinfo->adj_free -= 1015 HAMMER2_FREEMAP_BLOCK_SIZE; 1016 live->avail -= 1017 HAMMER2_FREEMAP_BLOCK_SIZE; 1018 if ((int32_t)live->avail < 0) 1019 live->avail = 0; 1020 break; 1021 case 1: /* 01 */ 1022 ++cbinfo->count_01_11; 1023 break; 1024 case 2: /* 10 -> 11 */ 1025 ++cbinfo->count_10_11; 1026 break; 1027 case 3: /* 11 */ 1028 break; 1029 } 1030 live->bitmapq[bindex] |= 1031 ((hammer2_bitmap_t)3 << scount); 1032 } 1033 mmask >>= 2; 1034 lmask >>= 2; 1035 tmp_off += HAMMER2_FREEMAP_BLOCK_SIZE; 1036 } 1037 } 1038 1039 /* 1040 * Determine if the live bitmap is completely free and reset its 1041 * fields if so. Otherwise check to see if we can reduce the linear 1042 * offset. 1043 */ 1044 for (bindex = HAMMER2_BMAP_ELEMENTS - 1; bindex >= 0; --bindex) { 1045 if (live->bitmapq[bindex] != 0) 1046 break; 1047 } 1048 if (bindex < 0) { 1049 /* 1050 * Completely empty, reset entire segment 1051 */ 1052 #if 0 1053 kprintf("hammer2: cleanseg %016jx.%04x (%d)\n", 1054 alloc_base, live->class, live->avail); 1055 #endif 1056 live->avail = HAMMER2_FREEMAP_LEVEL0_SIZE; 1057 live->class = 0; 1058 live->linear = 0; 1059 ++cbinfo->count_l0cleans; 1060 } else if (bindex < 7) { 1061 /* 1062 * Partially full, bitmapq[bindex] != 0. The live->linear 1063 * offset can legitimately be just about anything, but 1064 * our bulkfree pass doesn't record enough information to 1065 * set it exactly. Just make sure that it is set to a 1066 * safe value that also works in our match code above (the 1067 * bcmp and linear test). 1068 * 1069 * We cannot safely leave live->linear at a sub-block offset 1070 * unless it is already in the same block as bmap->linear. 1071 * 1072 * If it is not in the same block, we cannot assume that 1073 * we can set it to bmap->linear on a sub-block boundary, 1074 * because the live system could have bounced it around. 1075 * In that situation we satisfy our bcmp/skip requirement 1076 * above by setting it to the nearest higher block boundary. 1077 * This alignment effectively kills any partial allocation it 1078 * might have been tracking before. 1079 */ 1080 if (live->linear < bmap->linear && 1081 ((live->linear ^ bmap->linear) & 1082 ~HAMMER2_FREEMAP_BLOCK_MASK) == 0) { 1083 live->linear = bmap->linear; 1084 ++cbinfo->count_linadjusts; 1085 } else { 1086 live->linear = 1087 (bmap->linear + HAMMER2_FREEMAP_BLOCK_MASK) & 1088 ~HAMMER2_FREEMAP_BLOCK_MASK; 1089 ++cbinfo->count_linadjusts; 1090 } 1091 } else { 1092 /* 1093 * Completely full, effectively disable the linear iterator 1094 */ 1095 live->linear = HAMMER2_SEGSIZE; 1096 } 1097 1098 #if 0 1099 if (bmap->class) { 1100 kprintf("%016jx %04d.%04x (avail=%7d) " 1101 "%08x %08x %08x %08x %08x %08x %08x %08x\n", 1102 (intmax_t)data_off, 1103 (int)((data_off & 1104 HAMMER2_FREEMAP_LEVEL1_MASK) >> 1105 HAMMER2_FREEMAP_LEVEL0_RADIX), 1106 bmap->class, 1107 bmap->avail, 1108 bmap->bitmap[0], bmap->bitmap[1], 1109 bmap->bitmap[2], bmap->bitmap[3], 1110 bmap->bitmap[4], bmap->bitmap[5], 1111 bmap->bitmap[6], bmap->bitmap[7]); 1112 } 1113 #endif 1114 } 1115 1116 /* 1117 * BULKFREE DEDUP HEURISTIC 1118 * 1119 * WARNING! This code is SMP safe but the heuristic allows SMP collisions. 1120 * All fields must be loaded into locals and validated. 1121 */ 1122 static 1123 int 1124 h2_bulkfree_test(hammer2_bulkfree_info_t *cbinfo, hammer2_blockref_t *bref, 1125 int pri) 1126 { 1127 hammer2_dedup_t *dedup; 1128 int best; 1129 int n; 1130 int i; 1131 1132 n = hammer2_icrc32(&bref->data_off, sizeof(bref->data_off)); 1133 dedup = cbinfo->dedup + (n & (HAMMER2_DEDUP_HEUR_MASK & ~7)); 1134 1135 for (i = best = 0; i < 8; ++i) { 1136 if (dedup[i].data_off == bref->data_off) { 1137 if (dedup[i].ticks < pri) 1138 dedup[i].ticks = pri; 1139 if (pri == 1) 1140 cbinfo->count_dedup_factor += dedup[i].ticks; 1141 return 1; 1142 } 1143 if (dedup[i].ticks < dedup[best].ticks) 1144 best = i; 1145 } 1146 dedup[best].data_off = bref->data_off; 1147 dedup[best].ticks = pri; 1148 1149 return 0; 1150 } 1151