1 /* 2 * Copyright (c) 2010 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Ilya Dryomov <idryomov@gmail.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 35 #include "hammer.h" 36 37 #include <sys/tree.h> 38 #include <libutil.h> 39 #include <openssl/sha.h> 40 41 #define DEDUP_BUF (64 * 1024) 42 43 /* Sorted list of block CRCs - light version for dedup-simulate */ 44 struct sim_dedup_entry_rb_tree; 45 RB_HEAD(sim_dedup_entry_rb_tree, sim_dedup_entry) sim_dedup_tree = 46 RB_INITIALIZER(&sim_dedup_tree); 47 RB_PROTOTYPE2(sim_dedup_entry_rb_tree, sim_dedup_entry, rb_entry, 48 rb_sim_dedup_entry_compare, hammer_crc_t); 49 50 struct sim_dedup_entry { 51 hammer_crc_t crc; 52 uint64_t ref_blks; /* number of blocks referenced */ 53 uint64_t ref_size; /* size of data referenced */ 54 RB_ENTRY(sim_dedup_entry) rb_entry; 55 }; 56 57 struct dedup_entry { 58 struct hammer_btree_leaf_elm leaf; 59 union { 60 struct { 61 uint64_t ref_blks; 62 uint64_t ref_size; 63 } de; 64 RB_HEAD(sha_dedup_entry_rb_tree, sha_dedup_entry) fict_root; 65 } u; 66 uint8_t flags; 67 RB_ENTRY(dedup_entry) rb_entry; 68 }; 69 70 #define HAMMER_DEDUP_ENTRY_FICTITIOUS 0x0001 71 72 struct sha_dedup_entry { 73 struct hammer_btree_leaf_elm leaf; 74 uint64_t ref_blks; 75 uint64_t ref_size; 76 uint8_t sha_hash[SHA256_DIGEST_LENGTH]; 77 RB_ENTRY(sha_dedup_entry) fict_entry; 78 }; 79 80 /* Sorted list of HAMMER B-Tree keys */ 81 struct dedup_entry_rb_tree; 82 struct sha_dedup_entry_rb_tree; 83 84 RB_HEAD(dedup_entry_rb_tree, dedup_entry) dedup_tree = 85 RB_INITIALIZER(&dedup_tree); 86 RB_PROTOTYPE2(dedup_entry_rb_tree, dedup_entry, rb_entry, 87 rb_dedup_entry_compare, hammer_crc_t); 88 89 RB_PROTOTYPE(sha_dedup_entry_rb_tree, sha_dedup_entry, fict_entry, 90 rb_sha_dedup_entry_compare); 91 92 /* 93 * Pass2 list - contains entries that were not dedup'ed because ioctl failed 94 */ 95 STAILQ_HEAD(, pass2_dedup_entry) pass2_dedup_queue = 96 STAILQ_HEAD_INITIALIZER(pass2_dedup_queue); 97 98 struct pass2_dedup_entry { 99 struct hammer_btree_leaf_elm leaf; 100 STAILQ_ENTRY(pass2_dedup_entry) sq_entry; 101 }; 102 103 #define DEDUP_PASS2 0x0001 /* process_btree_elm() mode */ 104 105 static int SigInfoFlag; 106 static int SigAlrmFlag; 107 static int64_t DedupDataReads; 108 static int64_t DedupCurrentRecords; 109 static int64_t DedupTotalRecords; 110 static uint32_t DedupCrcStart; 111 static uint32_t DedupCrcEnd; 112 static uint64_t MemoryUse; 113 114 /* PFS global ids - we deal with just one PFS at a run */ 115 static int glob_fd; 116 static struct hammer_ioc_pseudofs_rw glob_pfs; 117 118 /* 119 * Global accounting variables 120 * 121 * Last three don't have to be 64-bit, just to be safe.. 122 */ 123 static uint64_t dedup_alloc_size; 124 static uint64_t dedup_ref_size; 125 static uint64_t dedup_skipped_size; 126 static uint64_t dedup_crc_failures; 127 static uint64_t dedup_sha_failures; 128 static uint64_t dedup_underflows; 129 static uint64_t dedup_successes_count; 130 static uint64_t dedup_successes_bytes; 131 132 static int rb_sim_dedup_entry_compare(struct sim_dedup_entry *sim_de1, 133 struct sim_dedup_entry *sim_de2); 134 static int rb_dedup_entry_compare(struct dedup_entry *de1, 135 struct dedup_entry *de2); 136 static int rb_sha_dedup_entry_compare(struct sha_dedup_entry *sha_de1, 137 struct sha_dedup_entry *sha_de2); 138 typedef int (*scan_pfs_cb_t)(hammer_btree_leaf_elm_t scan_leaf, int flags); 139 static void scan_pfs(char *filesystem, scan_pfs_cb_t func, const char *id); 140 static int collect_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags); 141 static int count_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags); 142 static int process_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags); 143 static int upgrade_chksum(hammer_btree_leaf_elm_t leaf, uint8_t *sha_hash); 144 static void dump_simulated_dedup(void); 145 static void dump_real_dedup(void); 146 static void dedup_usage(int code); 147 148 RB_GENERATE2(sim_dedup_entry_rb_tree, sim_dedup_entry, rb_entry, 149 rb_sim_dedup_entry_compare, hammer_crc_t, crc); 150 RB_GENERATE2(dedup_entry_rb_tree, dedup_entry, rb_entry, 151 rb_dedup_entry_compare, hammer_crc_t, leaf.data_crc); 152 RB_GENERATE(sha_dedup_entry_rb_tree, sha_dedup_entry, fict_entry, 153 rb_sha_dedup_entry_compare); 154 155 static 156 int 157 rb_sim_dedup_entry_compare(struct sim_dedup_entry *sim_de1, 158 struct sim_dedup_entry *sim_de2) 159 { 160 if (sim_de1->crc < sim_de2->crc) 161 return (-1); 162 if (sim_de1->crc > sim_de2->crc) 163 return (1); 164 165 return (0); 166 } 167 168 static 169 int 170 rb_dedup_entry_compare(struct dedup_entry *de1, struct dedup_entry *de2) 171 { 172 if (de1->leaf.data_crc < de2->leaf.data_crc) 173 return (-1); 174 if (de1->leaf.data_crc > de2->leaf.data_crc) 175 return (1); 176 177 return (0); 178 } 179 180 static 181 int 182 rb_sha_dedup_entry_compare(struct sha_dedup_entry *sha_de1, 183 struct sha_dedup_entry *sha_de2) 184 { 185 unsigned long *h1 = (unsigned long *)&sha_de1->sha_hash; 186 unsigned long *h2 = (unsigned long *)&sha_de2->sha_hash; 187 int i; 188 189 for (i = 0; i < SHA256_DIGEST_LENGTH / (int)sizeof(unsigned long); ++i) { 190 if (h1[i] < h2[i]) 191 return (-1); 192 if (h1[i] > h2[i]) 193 return (1); 194 } 195 196 return (0); 197 } 198 199 /* 200 * dedup-simulate <filesystem> 201 */ 202 void 203 hammer_cmd_dedup_simulate(char **av, int ac) 204 { 205 struct sim_dedup_entry *sim_de; 206 207 if (ac != 1) { 208 dedup_usage(1); 209 /* not reached */ 210 } 211 212 glob_fd = getpfs(&glob_pfs, av[0]); 213 214 /* 215 * Collection passes (memory limited) 216 */ 217 printf("Dedup-simulate running\n"); 218 do { 219 DedupCrcStart = DedupCrcEnd; 220 DedupCrcEnd = 0; 221 MemoryUse = 0; 222 223 if (VerboseOpt) { 224 printf("B-Tree pass crc-range %08x-max\n", 225 DedupCrcStart); 226 fflush(stdout); 227 } 228 scan_pfs(av[0], collect_btree_elm, "simu-pass"); 229 230 if (VerboseOpt >= 2) 231 dump_simulated_dedup(); 232 233 /* 234 * Calculate simulated dedup ratio and get rid of the tree 235 */ 236 while ((sim_de = RB_ROOT(&sim_dedup_tree)) != NULL) { 237 assert(sim_de->ref_blks != 0); 238 dedup_ref_size += sim_de->ref_size; 239 dedup_alloc_size += sim_de->ref_size / sim_de->ref_blks; 240 241 RB_REMOVE(sim_dedup_entry_rb_tree, &sim_dedup_tree, sim_de); 242 free(sim_de); 243 } 244 if (DedupCrcEnd && VerboseOpt == 0) 245 printf("."); 246 } while (DedupCrcEnd); 247 248 printf("Dedup-simulate %s succeeded\n", av[0]); 249 relpfs(glob_fd, &glob_pfs); 250 251 printf("Simulated dedup ratio = %.2f\n", 252 (dedup_alloc_size != 0) ? 253 (double)dedup_ref_size / dedup_alloc_size : 0); 254 } 255 256 /* 257 * dedup <filesystem> 258 */ 259 void 260 hammer_cmd_dedup(char **av, int ac) 261 { 262 struct dedup_entry *de; 263 struct sha_dedup_entry *sha_de; 264 struct pass2_dedup_entry *pass2_de; 265 char *tmp; 266 char buf[8]; 267 int needfree = 0; 268 269 if (TimeoutOpt > 0) 270 alarm(TimeoutOpt); 271 272 if (ac != 1) { 273 dedup_usage(1); 274 /* not reached */ 275 } 276 277 STAILQ_INIT(&pass2_dedup_queue); 278 279 glob_fd = getpfs(&glob_pfs, av[0]); 280 281 /* 282 * A cycle file is _required_ for resuming dedup after the timeout 283 * specified with -t has expired. If no -c option, then place a 284 * .dedup.cycle file either in the PFS snapshots directory or in 285 * the default snapshots directory. 286 */ 287 if (!CyclePath) { 288 if (glob_pfs.ondisk->snapshots[0] != '/') { 289 asprintf(&tmp, "%s/%s/.dedup.cycle", 290 SNAPSHOTS_BASE, av[0]); 291 } else { 292 asprintf(&tmp, "%s/.dedup.cycle", 293 glob_pfs.ondisk->snapshots); 294 } 295 CyclePath = tmp; 296 needfree = 1; 297 } 298 299 /* 300 * Pre-pass to cache the btree 301 */ 302 scan_pfs(av[0], count_btree_elm, "pre-pass "); 303 DedupTotalRecords = DedupCurrentRecords; 304 305 /* 306 * Collection passes (memory limited) 307 */ 308 printf("Dedup running\n"); 309 do { 310 DedupCrcStart = DedupCrcEnd; 311 DedupCrcEnd = 0; 312 MemoryUse = 0; 313 314 if (VerboseOpt) { 315 printf("B-Tree pass crc-range %08x-max\n", 316 DedupCrcStart); 317 fflush(stdout); 318 } 319 scan_pfs(av[0], process_btree_elm, "main-pass"); 320 321 while ((pass2_de = STAILQ_FIRST(&pass2_dedup_queue)) != NULL) { 322 if (process_btree_elm(&pass2_de->leaf, DEDUP_PASS2)) 323 dedup_skipped_size -= pass2_de->leaf.data_len; 324 325 STAILQ_REMOVE_HEAD(&pass2_dedup_queue, sq_entry); 326 free(pass2_de); 327 } 328 assert(STAILQ_EMPTY(&pass2_dedup_queue)); 329 330 if (VerboseOpt >= 2) 331 dump_real_dedup(); 332 333 /* 334 * Calculate dedup ratio and get rid of the trees 335 */ 336 while ((de = RB_ROOT(&dedup_tree)) != NULL) { 337 if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) { 338 while ((sha_de = RB_ROOT(&de->u.fict_root)) != NULL) { 339 assert(sha_de->ref_blks != 0); 340 dedup_ref_size += sha_de->ref_size; 341 dedup_alloc_size += sha_de->ref_size / sha_de->ref_blks; 342 343 RB_REMOVE(sha_dedup_entry_rb_tree, 344 &de->u.fict_root, sha_de); 345 free(sha_de); 346 } 347 assert(RB_EMPTY(&de->u.fict_root)); 348 } else { 349 assert(de->u.de.ref_blks != 0); 350 dedup_ref_size += de->u.de.ref_size; 351 dedup_alloc_size += de->u.de.ref_size / de->u.de.ref_blks; 352 } 353 354 RB_REMOVE(dedup_entry_rb_tree, &dedup_tree, de); 355 free(de); 356 } 357 assert(RB_EMPTY(&dedup_tree)); 358 if (DedupCrcEnd && VerboseOpt == 0) 359 printf("."); 360 } while (DedupCrcEnd); 361 362 printf("Dedup %s succeeded\n", av[0]); 363 relpfs(glob_fd, &glob_pfs); 364 365 humanize_unsigned(buf, sizeof(buf), dedup_ref_size, "B", 1024); 366 printf("Dedup ratio = %.2f (in this run)\n" 367 " %8s referenced\n", 368 ((dedup_alloc_size != 0) ? 369 (double)dedup_ref_size / dedup_alloc_size : 0), 370 buf 371 ); 372 humanize_unsigned(buf, sizeof(buf), dedup_alloc_size, "B", 1024); 373 printf(" %8s allocated\n", buf); 374 humanize_unsigned(buf, sizeof(buf), dedup_skipped_size, "B", 1024); 375 printf(" %8s skipped\n", buf); 376 printf(" %8jd CRC collisions\n" 377 " %8jd SHA collisions\n" 378 " %8jd big-block underflows\n" 379 " %8jd new dedup records\n" 380 " %8jd new dedup bytes\n", 381 (intmax_t)dedup_crc_failures, 382 (intmax_t)dedup_sha_failures, 383 (intmax_t)dedup_underflows, 384 (intmax_t)dedup_successes_count, 385 (intmax_t)dedup_successes_bytes 386 ); 387 388 /* Once completed remove cycle file */ 389 hammer_reset_cycle(); 390 391 /* We don't want to mess up with other directives */ 392 if (needfree) { 393 free(tmp); 394 CyclePath = NULL; 395 } 396 } 397 398 static 399 int 400 count_btree_elm(hammer_btree_leaf_elm_t scan_leaf __unused, int flags __unused) 401 { 402 return(1); 403 } 404 405 static 406 int 407 collect_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags __unused) 408 { 409 struct sim_dedup_entry *sim_de; 410 411 /* 412 * If we are using too much memory we have to clean some out, which 413 * will cause the run to use multiple passes. Be careful of integer 414 * overflows! 415 */ 416 if (MemoryUse > MemoryLimit) { 417 DedupCrcEnd = DedupCrcStart + 418 (uint32_t)(DedupCrcEnd - DedupCrcStart - 1) / 2; 419 if (VerboseOpt) { 420 printf("memory limit crc-range %08x-%08x\n", 421 DedupCrcStart, DedupCrcEnd); 422 fflush(stdout); 423 } 424 for (;;) { 425 sim_de = RB_MAX(sim_dedup_entry_rb_tree, 426 &sim_dedup_tree); 427 if (sim_de == NULL || sim_de->crc < DedupCrcEnd) 428 break; 429 RB_REMOVE(sim_dedup_entry_rb_tree, 430 &sim_dedup_tree, sim_de); 431 MemoryUse -= sizeof(*sim_de); 432 free(sim_de); 433 } 434 } 435 436 /* 437 * Collect statistics based on the CRC only, do not try to read 438 * any data blocks or run SHA hashes. 439 */ 440 sim_de = RB_LOOKUP(sim_dedup_entry_rb_tree, &sim_dedup_tree, 441 scan_leaf->data_crc); 442 443 if (sim_de == NULL) { 444 sim_de = calloc(1, sizeof(*sim_de)); 445 sim_de->crc = scan_leaf->data_crc; 446 RB_INSERT(sim_dedup_entry_rb_tree, &sim_dedup_tree, sim_de); 447 MemoryUse += sizeof(*sim_de); 448 } 449 450 sim_de->ref_blks += 1; 451 sim_de->ref_size += scan_leaf->data_len; 452 return (1); 453 } 454 455 static __inline 456 int 457 validate_dedup_pair(hammer_btree_leaf_elm_t p, hammer_btree_leaf_elm_t q) 458 { 459 if (HAMMER_ZONE(p->data_offset) != HAMMER_ZONE(q->data_offset)) 460 return (1); 461 if (p->data_len != q->data_len) 462 return (1); 463 464 return (0); 465 } 466 467 #define DEDUP_TECH_FAILURE 1 468 #define DEDUP_CMP_FAILURE 2 469 #define DEDUP_INVALID_ZONE 3 470 #define DEDUP_UNDERFLOW 4 471 #define DEDUP_VERS_FAILURE 5 472 473 static __inline 474 int 475 deduplicate(hammer_btree_leaf_elm_t p, hammer_btree_leaf_elm_t q) 476 { 477 struct hammer_ioc_dedup dedup; 478 479 bzero(&dedup, sizeof(dedup)); 480 481 /* 482 * If data_offset fields are the same there is no need to run ioctl, 483 * candidate is already dedup'ed. 484 */ 485 if (p->data_offset == q->data_offset) 486 return (0); 487 488 dedup.elm1 = p->base; 489 dedup.elm2 = q->base; 490 RunningIoctl = 1; 491 if (ioctl(glob_fd, HAMMERIOC_DEDUP, &dedup) < 0) { 492 if (errno == EOPNOTSUPP) 493 return (DEDUP_VERS_FAILURE); /* must be at least version 5 */ 494 /* Technical failure - locking or w/e */ 495 return (DEDUP_TECH_FAILURE); 496 } 497 if (dedup.head.flags & HAMMER_IOC_DEDUP_CMP_FAILURE) 498 return (DEDUP_CMP_FAILURE); 499 if (dedup.head.flags & HAMMER_IOC_DEDUP_INVALID_ZONE) 500 return (DEDUP_INVALID_ZONE); 501 if (dedup.head.flags & HAMMER_IOC_DEDUP_UNDERFLOW) 502 return (DEDUP_UNDERFLOW); 503 RunningIoctl = 0; 504 ++dedup_successes_count; 505 dedup_successes_bytes += p->data_len; 506 return (0); 507 } 508 509 static 510 int 511 process_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags) 512 { 513 struct dedup_entry *de; 514 struct sha_dedup_entry *sha_de, temp; 515 struct pass2_dedup_entry *pass2_de; 516 int error; 517 518 /* 519 * If we are using too much memory we have to clean some out, which 520 * will cause the run to use multiple passes. Be careful of integer 521 * overflows! 522 */ 523 while (MemoryUse > MemoryLimit) { 524 DedupCrcEnd = DedupCrcStart + 525 (uint32_t)(DedupCrcEnd - DedupCrcStart - 1) / 2; 526 if (VerboseOpt) { 527 printf("memory limit crc-range %08x-%08x\n", 528 DedupCrcStart, DedupCrcEnd); 529 fflush(stdout); 530 } 531 532 for (;;) { 533 de = RB_MAX(dedup_entry_rb_tree, &dedup_tree); 534 if (de == NULL || de->leaf.data_crc < DedupCrcEnd) 535 break; 536 if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) { 537 while ((sha_de = RB_ROOT(&de->u.fict_root)) != 538 NULL) { 539 RB_REMOVE(sha_dedup_entry_rb_tree, 540 &de->u.fict_root, sha_de); 541 MemoryUse -= sizeof(*sha_de); 542 free(sha_de); 543 } 544 } 545 RB_REMOVE(dedup_entry_rb_tree, &dedup_tree, de); 546 MemoryUse -= sizeof(*de); 547 free(de); 548 } 549 } 550 551 /* 552 * Collect statistics based on the CRC. Colliding CRCs usually 553 * cause a SHA sub-tree to be created under the de. 554 * 555 * Trivial case if de not found. 556 */ 557 de = RB_LOOKUP(dedup_entry_rb_tree, &dedup_tree, scan_leaf->data_crc); 558 if (de == NULL) { 559 de = calloc(1, sizeof(*de)); 560 de->leaf = *scan_leaf; 561 RB_INSERT(dedup_entry_rb_tree, &dedup_tree, de); 562 MemoryUse += sizeof(*de); 563 goto upgrade_stats; 564 } 565 566 /* 567 * Found entry in CRC tree 568 */ 569 if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) { 570 /* 571 * Optimize the case where a CRC failure results in multiple 572 * SHA entries. If we unconditionally issue a data-read a 573 * degenerate situation where a colliding CRC's second SHA 574 * entry contains the lion's share of the deduplication 575 * candidates will result in excessive data block reads. 576 * 577 * Deal with the degenerate case by looking for a matching 578 * data_offset/data_len in the SHA elements we already have 579 * before reading the data block and generating a new SHA. 580 */ 581 RB_FOREACH(sha_de, sha_dedup_entry_rb_tree, &de->u.fict_root) { 582 if (sha_de->leaf.data_offset == 583 scan_leaf->data_offset && 584 sha_de->leaf.data_len == scan_leaf->data_len) { 585 memcpy(temp.sha_hash, sha_de->sha_hash, 586 SHA256_DIGEST_LENGTH); 587 break; 588 } 589 } 590 591 /* 592 * Entry in CRC tree is fictitious, so we already had problems 593 * with this CRC. Upgrade (compute SHA) the candidate and 594 * dive into SHA subtree. If upgrade fails insert the candidate 595 * into Pass2 list (it will be processed later). 596 */ 597 if (sha_de == NULL) { 598 if (upgrade_chksum(scan_leaf, temp.sha_hash)) 599 goto pass2_insert; 600 601 sha_de = RB_FIND(sha_dedup_entry_rb_tree, 602 &de->u.fict_root, &temp); 603 } 604 605 /* 606 * Nothing in SHA subtree so far, so this is a new 607 * 'dataset'. Insert new entry into SHA subtree. 608 */ 609 if (sha_de == NULL) { 610 sha_de = calloc(1, sizeof(*sha_de)); 611 sha_de->leaf = *scan_leaf; 612 memcpy(sha_de->sha_hash, temp.sha_hash, 613 SHA256_DIGEST_LENGTH); 614 RB_INSERT(sha_dedup_entry_rb_tree, &de->u.fict_root, 615 sha_de); 616 MemoryUse += sizeof(*sha_de); 617 goto upgrade_stats_sha; 618 } 619 620 /* 621 * Found entry in SHA subtree, it means we have a potential 622 * dedup pair. Validate it (zones have to match and data_len 623 * field have to be the same too. If validation fails, treat 624 * it as a SHA collision (jump to sha256_failure). 625 */ 626 if (validate_dedup_pair(&sha_de->leaf, scan_leaf)) 627 goto sha256_failure; 628 629 /* 630 * We have a valid dedup pair (SHA match, validated). 631 * 632 * In case of technical failure (dedup pair was good, but 633 * ioctl failed anyways) insert the candidate into Pass2 list 634 * (we will try to dedup it after we are done with the rest of 635 * the tree). 636 * 637 * If ioctl fails because either of blocks is in the non-dedup 638 * zone (we can dedup only in LARGE_DATA and SMALL_DATA) don't 639 * bother with the candidate and terminate early. 640 * 641 * If ioctl fails because of big-block underflow replace the 642 * leaf node that found dedup entry represents with scan_leaf. 643 */ 644 error = deduplicate(&sha_de->leaf, scan_leaf); 645 switch(error) { 646 case 0: 647 goto upgrade_stats_sha; 648 case DEDUP_TECH_FAILURE: 649 goto pass2_insert; 650 case DEDUP_CMP_FAILURE: 651 goto sha256_failure; 652 case DEDUP_INVALID_ZONE: 653 goto terminate_early; 654 case DEDUP_UNDERFLOW: 655 ++dedup_underflows; 656 sha_de->leaf = *scan_leaf; 657 memcpy(sha_de->sha_hash, temp.sha_hash, 658 SHA256_DIGEST_LENGTH); 659 goto upgrade_stats_sha; 660 case DEDUP_VERS_FAILURE: 661 errx(1, "HAMMER filesystem must be at least " 662 "version 5 to dedup"); 663 /* not reached */ 664 default: 665 fprintf(stderr, "Unknown error\n"); 666 goto terminate_early; 667 } 668 669 /* 670 * Ooh la la.. SHA-256 collision. Terminate early, there's 671 * nothing we can do here. 672 */ 673 sha256_failure: 674 ++dedup_sha_failures; 675 goto terminate_early; 676 } else { 677 /* 678 * Candidate CRC is good for now (we found an entry in CRC 679 * tree and it's not fictitious). This means we have a 680 * potential dedup pair. 681 */ 682 if (validate_dedup_pair(&de->leaf, scan_leaf)) 683 goto crc_failure; 684 685 /* 686 * We have a valid dedup pair (CRC match, validated) 687 */ 688 error = deduplicate(&de->leaf, scan_leaf); 689 switch(error) { 690 case 0: 691 goto upgrade_stats; 692 case DEDUP_TECH_FAILURE: 693 goto pass2_insert; 694 case DEDUP_CMP_FAILURE: 695 goto crc_failure; 696 case DEDUP_INVALID_ZONE: 697 goto terminate_early; 698 case DEDUP_UNDERFLOW: 699 ++dedup_underflows; 700 de->leaf = *scan_leaf; 701 goto upgrade_stats; 702 case DEDUP_VERS_FAILURE: 703 errx(1, "HAMMER filesystem must be at least " 704 "version 5 to dedup"); 705 /* not reached */ 706 default: 707 fprintf(stderr, "Unknown error\n"); 708 goto terminate_early; 709 } 710 711 crc_failure: 712 /* 713 * We got a CRC collision - either ioctl failed because of 714 * the comparison failure or validation of the potential 715 * dedup pair went bad. In all cases insert both blocks 716 * into SHA subtree (this requires checksum upgrade) and mark 717 * entry that corresponds to this CRC in the CRC tree 718 * fictitious, so that all futher operations with this CRC go 719 * through SHA subtree. 720 */ 721 ++dedup_crc_failures; 722 723 /* 724 * Insert block that was represented by now fictitious dedup 725 * entry (create a new SHA entry and preserve stats of the 726 * old CRC one). If checksum upgrade fails insert the 727 * candidate into Pass2 list and return - keep both trees 728 * unmodified. 729 */ 730 sha_de = calloc(1, sizeof(*sha_de)); 731 sha_de->leaf = de->leaf; 732 sha_de->ref_blks = de->u.de.ref_blks; 733 sha_de->ref_size = de->u.de.ref_size; 734 if (upgrade_chksum(&sha_de->leaf, sha_de->sha_hash)) { 735 free(sha_de); 736 goto pass2_insert; 737 } 738 MemoryUse += sizeof(*sha_de); 739 740 RB_INIT(&de->u.fict_root); 741 /* 742 * Here we can insert without prior checking because the tree 743 * is empty at this point 744 */ 745 RB_INSERT(sha_dedup_entry_rb_tree, &de->u.fict_root, sha_de); 746 747 /* 748 * Mark entry in CRC tree fictitious 749 */ 750 de->flags |= HAMMER_DEDUP_ENTRY_FICTITIOUS; 751 752 /* 753 * Upgrade checksum of the candidate and insert it into 754 * SHA subtree. If upgrade fails insert the candidate into 755 * Pass2 list. 756 */ 757 if (upgrade_chksum(scan_leaf, temp.sha_hash)) 758 goto pass2_insert; 759 sha_de = RB_FIND(sha_dedup_entry_rb_tree, &de->u.fict_root, 760 &temp); 761 if (sha_de != NULL) { 762 /* There is an entry with this SHA already, but the only 763 * RB-tree element at this point is that entry we just 764 * added. We know for sure these blocks are different 765 * (this is crc_failure branch) so treat it as SHA 766 * collision. 767 */ 768 goto sha256_failure; 769 } 770 771 sha_de = calloc(1, sizeof(*sha_de)); 772 sha_de->leaf = *scan_leaf; 773 memcpy(sha_de->sha_hash, temp.sha_hash, SHA256_DIGEST_LENGTH); 774 RB_INSERT(sha_dedup_entry_rb_tree, &de->u.fict_root, sha_de); 775 MemoryUse += sizeof(*sha_de); 776 goto upgrade_stats_sha; 777 } 778 779 upgrade_stats: 780 de->u.de.ref_blks += 1; 781 de->u.de.ref_size += scan_leaf->data_len; 782 return (1); 783 784 upgrade_stats_sha: 785 sha_de->ref_blks += 1; 786 sha_de->ref_size += scan_leaf->data_len; 787 return (1); 788 789 pass2_insert: 790 /* 791 * If in pass2 mode don't insert anything, fall through to 792 * terminate_early 793 */ 794 if ((flags & DEDUP_PASS2) == 0) { 795 pass2_de = calloc(1, sizeof(*pass2_de)); 796 pass2_de->leaf = *scan_leaf; 797 STAILQ_INSERT_TAIL(&pass2_dedup_queue, pass2_de, sq_entry); 798 dedup_skipped_size += scan_leaf->data_len; 799 return (1); 800 } 801 802 terminate_early: 803 /* 804 * Early termination path. Fixup stats. 805 */ 806 dedup_alloc_size += scan_leaf->data_len; 807 dedup_ref_size += scan_leaf->data_len; 808 return (0); 809 } 810 811 static 812 int 813 upgrade_chksum(hammer_btree_leaf_elm_t leaf, uint8_t *sha_hash) 814 { 815 struct hammer_ioc_data data; 816 char *buf = malloc(DEDUP_BUF); 817 SHA256_CTX ctx; 818 int error; 819 820 bzero(&data, sizeof(data)); 821 data.elm = leaf->base; 822 data.ubuf = buf; 823 data.size = DEDUP_BUF; 824 825 error = 0; 826 if (ioctl(glob_fd, HAMMERIOC_GET_DATA, &data) < 0) { 827 fprintf(stderr, "Get-data failed: %s\n", strerror(errno)); 828 error = 1; 829 goto done; 830 } 831 DedupDataReads += leaf->data_len; 832 833 if (data.leaf.data_len != leaf->data_len) { 834 error = 1; 835 goto done; 836 } 837 838 if (data.leaf.base.btype == HAMMER_BTREE_TYPE_RECORD && 839 data.leaf.base.rec_type == HAMMER_RECTYPE_DATA) { 840 SHA256_Init(&ctx); 841 SHA256_Update(&ctx, (void *)buf, data.leaf.data_len); 842 SHA256_Final(sha_hash, &ctx); 843 } 844 845 done: 846 free(buf); 847 return (error); 848 } 849 850 static 851 void 852 sigAlrm(int signo __unused) 853 { 854 SigAlrmFlag = 1; 855 } 856 857 static 858 void 859 sigInfo(int signo __unused) 860 { 861 SigInfoFlag = 1; 862 } 863 864 static 865 void 866 scan_pfs(char *filesystem, scan_pfs_cb_t func, const char *id) 867 { 868 struct hammer_ioc_mirror_rw mirror; 869 hammer_ioc_mrecord_any_t mrec; 870 struct hammer_btree_leaf_elm elm; 871 char *buf = malloc(DEDUP_BUF); 872 char buf1[8]; 873 char buf2[8]; 874 int offset, bytes; 875 876 SigInfoFlag = 0; 877 DedupDataReads = 0; 878 DedupCurrentRecords = 0; 879 signal(SIGINFO, sigInfo); 880 signal(SIGALRM, sigAlrm); 881 882 /* 883 * Deduplication happens per element so hammer(8) is in full 884 * control of the ioctl()s to actually perform it. SIGALRM 885 * needs to be handled within hammer(8) but a checkpoint 886 * is needed for resuming. Use cycle file for that. 887 * 888 * Try to obtain the previous obj_id from the cycle file and 889 * if not available just start from the beginning. 890 */ 891 bzero(&mirror, sizeof(mirror)); 892 hammer_key_beg_init(&mirror.key_beg); 893 hammer_get_cycle(&mirror.key_beg, &mirror.tid_beg); 894 895 if (mirror.key_beg.obj_id != (int64_t)HAMMER_MIN_OBJID) { 896 if (VerboseOpt) { 897 fprintf(stderr, "%s: mirror-read: Resuming at object %016jx\n", 898 id, (uintmax_t)mirror.key_beg.obj_id); 899 } 900 } 901 902 hammer_key_end_init(&mirror.key_end); 903 904 mirror.tid_beg = glob_pfs.ondisk->sync_beg_tid; 905 mirror.tid_end = glob_pfs.ondisk->sync_end_tid; 906 mirror.head.flags |= HAMMER_IOC_MIRROR_NODATA; /* we want only keys */ 907 mirror.ubuf = buf; 908 mirror.size = DEDUP_BUF; 909 mirror.pfs_id = glob_pfs.pfs_id; 910 mirror.shared_uuid = glob_pfs.ondisk->shared_uuid; 911 912 if (VerboseOpt && DedupCrcStart == 0) { 913 printf("%s %s: objspace %016jx:%04x %016jx:%04x\n", 914 id, filesystem, 915 (uintmax_t)mirror.key_beg.obj_id, 916 mirror.key_beg.localization, 917 (uintmax_t)mirror.key_end.obj_id, 918 mirror.key_end.localization); 919 printf("%s %s: pfs_id %d\n", 920 id, filesystem, glob_pfs.pfs_id); 921 } 922 fflush(stdout); 923 fflush(stderr); 924 925 do { 926 mirror.count = 0; 927 mirror.pfs_id = glob_pfs.pfs_id; 928 mirror.shared_uuid = glob_pfs.ondisk->shared_uuid; 929 if (ioctl(glob_fd, HAMMERIOC_MIRROR_READ, &mirror) < 0) { 930 err(1, "Mirror-read %s failed", filesystem); 931 /* not reached */ 932 } 933 if (mirror.head.flags & HAMMER_IOC_HEAD_ERROR) { 934 errx(1, "Mirror-read %s fatal error %d", 935 filesystem, mirror.head.error); 936 /* not reached */ 937 } 938 if (mirror.count) { 939 offset = 0; 940 while (offset < mirror.count) { 941 mrec = (void *)((char *)buf + offset); 942 bytes = HAMMER_HEAD_DOALIGN(mrec->head.rec_size); 943 if (offset + bytes > mirror.count) { 944 errx(1, "Misaligned record"); 945 /* not reached */ 946 } 947 assert((mrec->head.type & 948 HAMMER_MRECF_TYPE_MASK) == 949 HAMMER_MREC_TYPE_REC); 950 offset += bytes; 951 elm = mrec->rec.leaf; 952 if (elm.base.btype != HAMMER_BTREE_TYPE_RECORD) 953 continue; 954 if (elm.base.rec_type != HAMMER_RECTYPE_DATA) 955 continue; 956 ++DedupCurrentRecords; 957 if (DedupCrcStart != DedupCrcEnd) { 958 if (elm.data_crc < DedupCrcStart) 959 continue; 960 if (DedupCrcEnd && 961 elm.data_crc >= DedupCrcEnd) { 962 continue; 963 } 964 } 965 func(&elm, 0); 966 } 967 } 968 mirror.key_beg = mirror.key_cur; 969 if (DidInterrupt || SigAlrmFlag) { 970 if (VerboseOpt) { 971 fprintf(stderr, "%s\n", 972 (DidInterrupt ? "Interrupted" : "Timeout")); 973 } 974 hammer_set_cycle(&mirror.key_cur, mirror.tid_beg); 975 if (VerboseOpt) { 976 fprintf(stderr, "Cyclefile %s updated for " 977 "continuation\n", CyclePath); 978 } 979 exit(1); 980 } 981 if (SigInfoFlag) { 982 if (DedupTotalRecords) { 983 humanize_unsigned(buf1, sizeof(buf1), 984 DedupDataReads, 985 "B", 1024); 986 humanize_unsigned(buf2, sizeof(buf2), 987 dedup_successes_bytes, 988 "B", 1024); 989 fprintf(stderr, "%s count %7jd/%jd " 990 "(%02d.%02d%%) " 991 "ioread %s newddup %s\n", 992 id, 993 (intmax_t)DedupCurrentRecords, 994 (intmax_t)DedupTotalRecords, 995 (int)(DedupCurrentRecords * 100 / 996 DedupTotalRecords), 997 (int)(DedupCurrentRecords * 10000 / 998 DedupTotalRecords % 100), 999 buf1, buf2); 1000 } else { 1001 fprintf(stderr, "%s count %-7jd\n", 1002 id, 1003 (intmax_t)DedupCurrentRecords); 1004 } 1005 SigInfoFlag = 0; 1006 } 1007 } while (mirror.count != 0); 1008 1009 signal(SIGINFO, SIG_IGN); 1010 signal(SIGALRM, SIG_IGN); 1011 1012 free(buf); 1013 } 1014 1015 static 1016 void 1017 dump_simulated_dedup(void) 1018 { 1019 struct sim_dedup_entry *sim_de; 1020 1021 printf("=== Dumping simulated dedup entries:\n"); 1022 RB_FOREACH(sim_de, sim_dedup_entry_rb_tree, &sim_dedup_tree) { 1023 printf("\tcrc=%08x cnt=%ju size=%ju\n", 1024 sim_de->crc, 1025 (uintmax_t)sim_de->ref_blks, 1026 (uintmax_t)sim_de->ref_size); 1027 } 1028 printf("end of dump ===\n"); 1029 } 1030 1031 static 1032 void 1033 dump_real_dedup(void) 1034 { 1035 struct dedup_entry *de; 1036 struct sha_dedup_entry *sha_de; 1037 int i; 1038 1039 printf("=== Dumping dedup entries:\n"); 1040 RB_FOREACH(de, dedup_entry_rb_tree, &dedup_tree) { 1041 if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) { 1042 printf("\tcrc=%08x fictitious\n", de->leaf.data_crc); 1043 1044 RB_FOREACH(sha_de, sha_dedup_entry_rb_tree, &de->u.fict_root) { 1045 printf("\t\tcrc=%08x cnt=%ju size=%ju\n\t" 1046 "\t\tsha=", 1047 sha_de->leaf.data_crc, 1048 (uintmax_t)sha_de->ref_blks, 1049 (uintmax_t)sha_de->ref_size); 1050 for (i = 0; i < SHA256_DIGEST_LENGTH; ++i) 1051 printf("%02x", sha_de->sha_hash[i]); 1052 printf("\n"); 1053 } 1054 } else { 1055 printf("\tcrc=%08x cnt=%ju size=%ju\n", 1056 de->leaf.data_crc, 1057 (uintmax_t)de->u.de.ref_blks, 1058 (uintmax_t)de->u.de.ref_size); 1059 } 1060 } 1061 printf("end of dump ===\n"); 1062 } 1063 1064 static 1065 void 1066 dedup_usage(int code) 1067 { 1068 fprintf(stderr, 1069 "hammer dedup-simulate <filesystem>\n" 1070 "hammer dedup <filesystem>\n" 1071 ); 1072 exit(code); 1073 } 1074