1 /* 2 * Copyright (c) 2010 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Ilya Dryomov <idryomov@gmail.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 35 #include "hammer.h" 36 #include <libutil.h> 37 #include <crypto/sha2/sha2.h> 38 39 #define DEDUP_BUF (64 * 1024) 40 41 /* Sorted list of block CRCs - light version for dedup-simulate */ 42 struct sim_dedup_entry_rb_tree; 43 RB_HEAD(sim_dedup_entry_rb_tree, sim_dedup_entry) sim_dedup_tree = 44 RB_INITIALIZER(&sim_dedup_tree); 45 RB_PROTOTYPE2(sim_dedup_entry_rb_tree, sim_dedup_entry, rb_entry, 46 rb_sim_dedup_entry_compare, hammer_crc_t); 47 48 struct sim_dedup_entry { 49 hammer_crc_t crc; 50 u_int64_t ref_blks; /* number of blocks referenced */ 51 u_int64_t ref_size; /* size of data referenced */ 52 RB_ENTRY(sim_dedup_entry) rb_entry; 53 }; 54 55 /* Sorted list of HAMMER B-Tree keys */ 56 struct dedup_entry_rb_tree; 57 struct sha_dedup_entry_rb_tree; 58 59 RB_HEAD(dedup_entry_rb_tree, dedup_entry) dedup_tree = 60 RB_INITIALIZER(&dedup_tree); 61 RB_PROTOTYPE2(dedup_entry_rb_tree, dedup_entry, rb_entry, 62 rb_dedup_entry_compare, hammer_crc_t); 63 64 RB_PROTOTYPE(sha_dedup_entry_rb_tree, sha_dedup_entry, fict_entry, 65 rb_sha_dedup_entry_compare); 66 67 struct dedup_entry { 68 struct hammer_btree_leaf_elm leaf; 69 union { 70 struct { 71 u_int64_t ref_blks; 72 u_int64_t ref_size; 73 } de; 74 RB_HEAD(sha_dedup_entry_rb_tree, sha_dedup_entry) fict_root; 75 } u; 76 u_int8_t flags; 77 RB_ENTRY(dedup_entry) rb_entry; 78 }; 79 80 #define HAMMER_DEDUP_ENTRY_FICTITIOUS 0x0001 81 82 struct sha_dedup_entry { 83 struct hammer_btree_leaf_elm leaf; 84 u_int64_t ref_blks; 85 u_int64_t ref_size; 86 u_int8_t sha_hash[SHA256_DIGEST_LENGTH]; 87 RB_ENTRY(sha_dedup_entry) fict_entry; 88 }; 89 90 /* 91 * Pass2 list - contains entries that were not dedup'ed because ioctl failed 92 */ 93 STAILQ_HEAD(, pass2_dedup_entry) pass2_dedup_queue = 94 STAILQ_HEAD_INITIALIZER(pass2_dedup_queue); 95 96 struct pass2_dedup_entry { 97 struct hammer_btree_leaf_elm leaf; 98 STAILQ_ENTRY(pass2_dedup_entry) sq_entry; 99 }; 100 101 #define DEDUP_PASS2 0x0001 /* process_btree_elm() mode */ 102 103 static int SigInfoFlag; 104 static int64_t DedupDataReads; 105 static int64_t DedupCurrentRecords; 106 static int64_t DedupTotalRecords; 107 static u_int32_t DedupCrcStart; 108 static u_int32_t DedupCrcEnd; 109 static u_int64_t MemoryUse; 110 111 /* PFS global ids - we deal with just one PFS at a run */ 112 int glob_fd; 113 struct hammer_ioc_pseudofs_rw glob_pfs; 114 115 /* 116 * Global accounting variables 117 * 118 * Last three don't have to be 64-bit, just to be safe.. 119 */ 120 u_int64_t dedup_alloc_size = 0; 121 u_int64_t dedup_ref_size = 0; 122 u_int64_t dedup_skipped_size = 0; 123 u_int64_t dedup_crc_failures = 0; 124 u_int64_t dedup_sha_failures = 0; 125 u_int64_t dedup_underflows = 0; 126 u_int64_t dedup_successes_count = 0; 127 u_int64_t dedup_successes_bytes = 0; 128 129 static int rb_sim_dedup_entry_compare(struct sim_dedup_entry *sim_de1, 130 struct sim_dedup_entry *sim_de2); 131 static int rb_dedup_entry_compare(struct dedup_entry *de1, 132 struct dedup_entry *de2); 133 static int rb_sha_dedup_entry_compare(struct sha_dedup_entry *sha_de1, 134 struct sha_dedup_entry *sha_de2); 135 typedef int (*scan_pfs_cb_t)(hammer_btree_leaf_elm_t scan_leaf, int flags); 136 static void scan_pfs(char *filesystem, scan_pfs_cb_t func, const char *id); 137 static int collect_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags); 138 static int count_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags); 139 static int process_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags); 140 static int upgrade_chksum(hammer_btree_leaf_elm_t leaf, u_int8_t *sha_hash); 141 static void dump_simulated_dedup(void); 142 static void dump_real_dedup(void); 143 static void dedup_usage(int code); 144 145 RB_GENERATE2(sim_dedup_entry_rb_tree, sim_dedup_entry, rb_entry, 146 rb_sim_dedup_entry_compare, hammer_crc_t, crc); 147 RB_GENERATE2(dedup_entry_rb_tree, dedup_entry, rb_entry, 148 rb_dedup_entry_compare, hammer_crc_t, leaf.data_crc); 149 RB_GENERATE(sha_dedup_entry_rb_tree, sha_dedup_entry, fict_entry, 150 rb_sha_dedup_entry_compare); 151 152 static int 153 rb_sim_dedup_entry_compare(struct sim_dedup_entry *sim_de1, 154 struct sim_dedup_entry *sim_de2) 155 { 156 if (sim_de1->crc < sim_de2->crc) 157 return (-1); 158 if (sim_de1->crc > sim_de2->crc) 159 return (1); 160 161 return (0); 162 } 163 164 static int 165 rb_dedup_entry_compare(struct dedup_entry *de1, struct dedup_entry *de2) 166 { 167 if (de1->leaf.data_crc < de2->leaf.data_crc) 168 return (-1); 169 if (de1->leaf.data_crc > de2->leaf.data_crc) 170 return (1); 171 172 return (0); 173 } 174 175 static int 176 rb_sha_dedup_entry_compare(struct sha_dedup_entry *sha_de1, 177 struct sha_dedup_entry *sha_de2) 178 { 179 unsigned long *h1 = (unsigned long *)&sha_de1->sha_hash; 180 unsigned long *h2 = (unsigned long *)&sha_de2->sha_hash; 181 int i; 182 183 for (i = 0; i < SHA256_DIGEST_LENGTH / (int)sizeof(unsigned long); ++i) { 184 if (h1[i] < h2[i]) 185 return (-1); 186 if (h1[i] > h2[i]) 187 return (1); 188 } 189 190 return (0); 191 } 192 193 /* 194 * dedup-simulate <filesystem> 195 */ 196 void 197 hammer_cmd_dedup_simulate(char **av, int ac) 198 { 199 struct sim_dedup_entry *sim_de; 200 201 if (ac != 1) 202 dedup_usage(1); 203 204 glob_fd = getpfs(&glob_pfs, av[0]); 205 206 /* 207 * Collection passes (memory limited) 208 */ 209 printf("Dedup-simulate running\n"); 210 do { 211 DedupCrcStart = DedupCrcEnd; 212 DedupCrcEnd = 0; 213 MemoryUse = 0; 214 215 if (VerboseOpt) { 216 printf("b-tree pass crc-range %08x-max\n", 217 DedupCrcStart); 218 fflush(stdout); 219 } 220 scan_pfs(av[0], collect_btree_elm, "simu-pass"); 221 222 if (VerboseOpt >= 2) 223 dump_simulated_dedup(); 224 225 /* 226 * Calculate simulated dedup ratio and get rid of the tree 227 */ 228 while ((sim_de = RB_ROOT(&sim_dedup_tree)) != NULL) { 229 assert(sim_de->ref_blks != 0); 230 dedup_ref_size += sim_de->ref_size; 231 dedup_alloc_size += sim_de->ref_size / sim_de->ref_blks; 232 233 RB_REMOVE(sim_dedup_entry_rb_tree, &sim_dedup_tree, sim_de); 234 free(sim_de); 235 } 236 if (DedupCrcEnd && VerboseOpt == 0) 237 printf("."); 238 } while (DedupCrcEnd); 239 240 printf("Dedup-simulate %s succeeded\n", av[0]); 241 relpfs(glob_fd, &glob_pfs); 242 243 printf("Simulated dedup ratio = %.2f\n", 244 (dedup_alloc_size != 0) ? 245 (double)dedup_ref_size / dedup_alloc_size : 0); 246 } 247 248 /* 249 * dedup <filesystem> 250 */ 251 void 252 hammer_cmd_dedup(char **av, int ac) 253 { 254 struct dedup_entry *de; 255 struct sha_dedup_entry *sha_de; 256 struct pass2_dedup_entry *pass2_de; 257 char buf[8]; 258 259 if (TimeoutOpt > 0) 260 alarm(TimeoutOpt); 261 262 if (ac != 1) 263 dedup_usage(1); 264 265 STAILQ_INIT(&pass2_dedup_queue); 266 267 glob_fd = getpfs(&glob_pfs, av[0]); 268 269 /* 270 * Pre-pass to cache the btree 271 */ 272 scan_pfs(av[0], count_btree_elm, "pre-pass "); 273 DedupTotalRecords = DedupCurrentRecords; 274 275 /* 276 * Collection passes (memory limited) 277 */ 278 printf("Dedup running\n"); 279 do { 280 DedupCrcStart = DedupCrcEnd; 281 DedupCrcEnd = 0; 282 MemoryUse = 0; 283 284 if (VerboseOpt) { 285 printf("b-tree pass crc-range %08x-max\n", 286 DedupCrcStart); 287 fflush(stdout); 288 } 289 scan_pfs(av[0], process_btree_elm, "main-pass"); 290 291 while ((pass2_de = STAILQ_FIRST(&pass2_dedup_queue)) != NULL) { 292 if (process_btree_elm(&pass2_de->leaf, DEDUP_PASS2)) 293 dedup_skipped_size -= pass2_de->leaf.data_len; 294 295 STAILQ_REMOVE_HEAD(&pass2_dedup_queue, sq_entry); 296 free(pass2_de); 297 } 298 assert(STAILQ_EMPTY(&pass2_dedup_queue)); 299 300 if (VerboseOpt >= 2) 301 dump_real_dedup(); 302 303 /* 304 * Calculate dedup ratio and get rid of the trees 305 */ 306 while ((de = RB_ROOT(&dedup_tree)) != NULL) { 307 if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) { 308 while ((sha_de = RB_ROOT(&de->u.fict_root)) != NULL) { 309 assert(sha_de->ref_blks != 0); 310 dedup_ref_size += sha_de->ref_size; 311 dedup_alloc_size += sha_de->ref_size / sha_de->ref_blks; 312 313 RB_REMOVE(sha_dedup_entry_rb_tree, 314 &de->u.fict_root, sha_de); 315 free(sha_de); 316 } 317 assert(RB_EMPTY(&de->u.fict_root)); 318 } else { 319 assert(de->u.de.ref_blks != 0); 320 dedup_ref_size += de->u.de.ref_size; 321 dedup_alloc_size += de->u.de.ref_size / de->u.de.ref_blks; 322 } 323 324 RB_REMOVE(dedup_entry_rb_tree, &dedup_tree, de); 325 free(de); 326 } 327 assert(RB_EMPTY(&dedup_tree)); 328 if (DedupCrcEnd && VerboseOpt == 0) 329 printf("."); 330 } while (DedupCrcEnd); 331 332 printf("Dedup %s succeeded\n", av[0]); 333 relpfs(glob_fd, &glob_pfs); 334 335 humanize_unsigned(buf, sizeof(buf), dedup_ref_size, "B", 1024); 336 printf("Dedup ratio = %.2f\n" 337 " %8s referenced\n", 338 ((dedup_alloc_size != 0) ? 339 (double)dedup_ref_size / dedup_alloc_size : 0), 340 buf 341 ); 342 humanize_unsigned(buf, sizeof(buf), dedup_alloc_size, "B", 1024); 343 printf(" %8s allocated\n", buf); 344 humanize_unsigned(buf, sizeof(buf), dedup_skipped_size, "B", 1024); 345 printf(" %8s skipped\n", buf); 346 printf(" %8jd CRC collisions\n" 347 " %8jd SHA collisions\n" 348 " %8jd bigblock underflows\n" 349 " %8jd new dedup records\n" 350 " %8jd new dedup bytes\n", 351 (intmax_t)dedup_crc_failures, 352 (intmax_t)dedup_sha_failures, 353 (intmax_t)dedup_underflows, 354 (intmax_t)dedup_successes_count, 355 (intmax_t)dedup_successes_bytes 356 ); 357 } 358 359 static int 360 count_btree_elm(hammer_btree_leaf_elm_t scan_leaf __unused, int flags __unused) 361 { 362 return(1); 363 } 364 365 static int 366 collect_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags __unused) 367 { 368 struct sim_dedup_entry *sim_de; 369 370 /* 371 * If we are using too much memory we have to clean some out, which 372 * will cause the run to use multiple passes. Be careful of integer 373 * overflows! 374 */ 375 if (MemoryUse > MemoryLimit) { 376 DedupCrcEnd = DedupCrcStart + 377 (u_int32_t)(DedupCrcEnd - DedupCrcStart - 1) / 2; 378 if (VerboseOpt) { 379 printf("memory limit crc-range %08x-%08x\n", 380 DedupCrcStart, DedupCrcEnd); 381 fflush(stdout); 382 } 383 for (;;) { 384 sim_de = RB_MAX(sim_dedup_entry_rb_tree, 385 &sim_dedup_tree); 386 if (sim_de == NULL || sim_de->crc < DedupCrcEnd) 387 break; 388 RB_REMOVE(sim_dedup_entry_rb_tree, 389 &sim_dedup_tree, sim_de); 390 MemoryUse -= sizeof(*sim_de); 391 free(sim_de); 392 } 393 } 394 395 /* 396 * Collect statistics based on the CRC only, do not try to read 397 * any data blocks or run SHA hashes. 398 */ 399 sim_de = RB_LOOKUP(sim_dedup_entry_rb_tree, &sim_dedup_tree, 400 scan_leaf->data_crc); 401 402 if (sim_de == NULL) { 403 sim_de = calloc(sizeof(*sim_de), 1); 404 sim_de->crc = scan_leaf->data_crc; 405 RB_INSERT(sim_dedup_entry_rb_tree, &sim_dedup_tree, sim_de); 406 MemoryUse += sizeof(*sim_de); 407 } 408 409 sim_de->ref_blks += 1; 410 sim_de->ref_size += scan_leaf->data_len; 411 return (1); 412 } 413 414 static __inline int 415 validate_dedup_pair(hammer_btree_leaf_elm_t p, hammer_btree_leaf_elm_t q) 416 { 417 if ((p->data_offset & HAMMER_OFF_ZONE_MASK) != 418 (q->data_offset & HAMMER_OFF_ZONE_MASK)) { 419 return (1); 420 } 421 if (p->data_len != q->data_len) { 422 return (1); 423 } 424 425 return (0); 426 } 427 428 #define DEDUP_TECH_FAILURE 1 429 #define DEDUP_CMP_FAILURE 2 430 #define DEDUP_INVALID_ZONE 3 431 #define DEDUP_UNDERFLOW 4 432 #define DEDUP_VERS_FAILURE 5 433 434 static __inline int 435 deduplicate(hammer_btree_leaf_elm_t p, hammer_btree_leaf_elm_t q) 436 { 437 struct hammer_ioc_dedup dedup; 438 439 bzero(&dedup, sizeof(dedup)); 440 441 /* 442 * If data_offset fields are the same there is no need to run ioctl, 443 * candidate is already dedup'ed. 444 */ 445 if (p->data_offset == q->data_offset) { 446 return (0); 447 } 448 449 dedup.elm1 = p->base; 450 dedup.elm2 = q->base; 451 RunningIoctl = 1; 452 if (ioctl(glob_fd, HAMMERIOC_DEDUP, &dedup) < 0) { 453 if (errno == EOPNOTSUPP) { 454 /* must be at least version 5 */ 455 return (DEDUP_VERS_FAILURE); 456 } 457 /* Technical failure - locking or w/e */ 458 return (DEDUP_TECH_FAILURE); 459 } 460 if (dedup.head.flags & HAMMER_IOC_DEDUP_CMP_FAILURE) { 461 return (DEDUP_CMP_FAILURE); 462 } 463 if (dedup.head.flags & HAMMER_IOC_DEDUP_INVALID_ZONE) { 464 return (DEDUP_INVALID_ZONE); 465 } 466 if (dedup.head.flags & HAMMER_IOC_DEDUP_UNDERFLOW) { 467 return (DEDUP_UNDERFLOW); 468 } 469 RunningIoctl = 0; 470 ++dedup_successes_count; 471 dedup_successes_bytes += p->data_len; 472 return (0); 473 } 474 475 static int 476 process_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags) 477 { 478 struct dedup_entry *de; 479 struct sha_dedup_entry *sha_de, temp; 480 struct pass2_dedup_entry *pass2_de; 481 int error; 482 483 /* 484 * If we are using too much memory we have to clean some out, which 485 * will cause the run to use multiple passes. Be careful of integer 486 * overflows! 487 */ 488 while (MemoryUse > MemoryLimit) { 489 DedupCrcEnd = DedupCrcStart + 490 (u_int32_t)(DedupCrcEnd - DedupCrcStart - 1) / 2; 491 if (VerboseOpt) { 492 printf("memory limit crc-range %08x-%08x\n", 493 DedupCrcStart, DedupCrcEnd); 494 fflush(stdout); 495 } 496 497 for (;;) { 498 de = RB_MAX(dedup_entry_rb_tree, &dedup_tree); 499 if (de == NULL || de->leaf.data_crc < DedupCrcEnd) 500 break; 501 if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) { 502 while ((sha_de = RB_ROOT(&de->u.fict_root)) != 503 NULL) { 504 RB_REMOVE(sha_dedup_entry_rb_tree, 505 &de->u.fict_root, sha_de); 506 MemoryUse -= sizeof(*sha_de); 507 free(sha_de); 508 } 509 } 510 RB_REMOVE(dedup_entry_rb_tree, &dedup_tree, de); 511 MemoryUse -= sizeof(*de); 512 free(de); 513 } 514 } 515 516 /* 517 * Collect statistics based on the CRC. Colliding CRCs usually 518 * cause a SHA sub-tree to be created under the de. 519 * 520 * Trivial case if de not found. 521 */ 522 de = RB_LOOKUP(dedup_entry_rb_tree, &dedup_tree, scan_leaf->data_crc); 523 if (de == NULL) { 524 de = calloc(sizeof(*de), 1); 525 de->leaf = *scan_leaf; 526 RB_INSERT(dedup_entry_rb_tree, &dedup_tree, de); 527 MemoryUse += sizeof(*de); 528 goto upgrade_stats; 529 } 530 531 /* 532 * Found entry in CRC tree 533 */ 534 if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) { 535 /* 536 * Optimize the case where a CRC failure results in multiple 537 * SHA entries. If we unconditionally issue a data-read a 538 * degenerate situation where a colliding CRC's second SHA 539 * entry contains the lion's share of the deduplication 540 * candidates will result in excessive data block reads. 541 * 542 * Deal with the degenerate case by looking for a matching 543 * data_offset/data_len in the SHA elements we already have 544 * before reading the data block and generating a new SHA. 545 */ 546 RB_FOREACH(sha_de, sha_dedup_entry_rb_tree, &de->u.fict_root) { 547 if (sha_de->leaf.data_offset == 548 scan_leaf->data_offset && 549 sha_de->leaf.data_len == scan_leaf->data_len) { 550 memcpy(temp.sha_hash, sha_de->sha_hash, 551 SHA256_DIGEST_LENGTH); 552 break; 553 } 554 } 555 556 /* 557 * Entry in CRC tree is fictitious, so we already had problems 558 * with this CRC. Upgrade (compute SHA) the candidate and 559 * dive into SHA subtree. If upgrade fails insert the candidate 560 * into Pass2 list (it will be processed later). 561 */ 562 if (sha_de == NULL) { 563 if (upgrade_chksum(scan_leaf, temp.sha_hash)) 564 goto pass2_insert; 565 566 sha_de = RB_FIND(sha_dedup_entry_rb_tree, 567 &de->u.fict_root, &temp); 568 } 569 570 /* 571 * Nothing in SHA subtree so far, so this is a new 572 * 'dataset'. Insert new entry into SHA subtree. 573 */ 574 if (sha_de == NULL) { 575 sha_de = calloc(sizeof(*sha_de), 1); 576 sha_de->leaf = *scan_leaf; 577 memcpy(sha_de->sha_hash, temp.sha_hash, 578 SHA256_DIGEST_LENGTH); 579 RB_INSERT(sha_dedup_entry_rb_tree, &de->u.fict_root, 580 sha_de); 581 MemoryUse += sizeof(*sha_de); 582 goto upgrade_stats_sha; 583 } 584 585 /* 586 * Found entry in SHA subtree, it means we have a potential 587 * dedup pair. Validate it (zones have to match and data_len 588 * field have to be the same too. If validation fails, treat 589 * it as a SHA collision (jump to sha256_failure). 590 */ 591 if (validate_dedup_pair(&sha_de->leaf, scan_leaf)) 592 goto sha256_failure; 593 594 /* 595 * We have a valid dedup pair (SHA match, validated). 596 * 597 * In case of technical failure (dedup pair was good, but 598 * ioctl failed anyways) insert the candidate into Pass2 list 599 * (we will try to dedup it after we are done with the rest of 600 * the tree). 601 * 602 * If ioctl fails because either of blocks is in the non-dedup 603 * zone (we can dedup only in LARGE_DATA and SMALL_DATA) don't 604 * bother with the candidate and terminate early. 605 * 606 * If ioctl fails because of bigblock underflow replace the 607 * leaf node that found dedup entry represents with scan_leaf. 608 */ 609 error = deduplicate(&sha_de->leaf, scan_leaf); 610 switch(error) { 611 case 0: 612 goto upgrade_stats_sha; 613 case DEDUP_TECH_FAILURE: 614 goto pass2_insert; 615 case DEDUP_CMP_FAILURE: 616 goto sha256_failure; 617 case DEDUP_INVALID_ZONE: 618 goto terminate_early; 619 case DEDUP_UNDERFLOW: 620 ++dedup_underflows; 621 sha_de->leaf = *scan_leaf; 622 memcpy(sha_de->sha_hash, temp.sha_hash, 623 SHA256_DIGEST_LENGTH); 624 goto upgrade_stats_sha; 625 case DEDUP_VERS_FAILURE: 626 fprintf(stderr, 627 "HAMMER filesystem must be at least " 628 "version 5 to dedup\n"); 629 exit (1); 630 default: 631 fprintf(stderr, "Unknown error\n"); 632 goto terminate_early; 633 } 634 635 /* 636 * Ooh la la.. SHA-256 collision. Terminate early, there's 637 * nothing we can do here. 638 */ 639 sha256_failure: 640 ++dedup_sha_failures; 641 goto terminate_early; 642 } else { 643 /* 644 * Candidate CRC is good for now (we found an entry in CRC 645 * tree and it's not fictitious). This means we have a 646 * potential dedup pair. 647 */ 648 if (validate_dedup_pair(&de->leaf, scan_leaf)) 649 goto crc_failure; 650 651 /* 652 * We have a valid dedup pair (CRC match, validated) 653 */ 654 error = deduplicate(&de->leaf, scan_leaf); 655 switch(error) { 656 case 0: 657 goto upgrade_stats; 658 case DEDUP_TECH_FAILURE: 659 goto pass2_insert; 660 case DEDUP_CMP_FAILURE: 661 goto crc_failure; 662 case DEDUP_INVALID_ZONE: 663 goto terminate_early; 664 case DEDUP_UNDERFLOW: 665 ++dedup_underflows; 666 de->leaf = *scan_leaf; 667 goto upgrade_stats; 668 case DEDUP_VERS_FAILURE: 669 fprintf(stderr, 670 "HAMMER filesystem must be at least " 671 "version 5 to dedup\n"); 672 exit (1); 673 default: 674 fprintf(stderr, "Unknown error\n"); 675 goto terminate_early; 676 } 677 678 crc_failure: 679 /* 680 * We got a CRC collision - either ioctl failed because of 681 * the comparison failure or validation of the potential 682 * dedup pair went bad. In all cases insert both blocks 683 * into SHA subtree (this requires checksum upgrade) and mark 684 * entry that corresponds to this CRC in the CRC tree 685 * fictitious, so that all futher operations with this CRC go 686 * through SHA subtree. 687 */ 688 ++dedup_crc_failures; 689 690 /* 691 * Insert block that was represented by now fictitious dedup 692 * entry (create a new SHA entry and preserve stats of the 693 * old CRC one). If checksum upgrade fails insert the 694 * candidate into Pass2 list and return - keep both trees 695 * unmodified. 696 */ 697 sha_de = calloc(sizeof(*sha_de), 1); 698 sha_de->leaf = de->leaf; 699 sha_de->ref_blks = de->u.de.ref_blks; 700 sha_de->ref_size = de->u.de.ref_size; 701 if (upgrade_chksum(&sha_de->leaf, sha_de->sha_hash)) { 702 free(sha_de); 703 goto pass2_insert; 704 } 705 MemoryUse += sizeof(*sha_de); 706 707 RB_INIT(&de->u.fict_root); 708 /* 709 * Here we can insert without prior checking because the tree 710 * is empty at this point 711 */ 712 RB_INSERT(sha_dedup_entry_rb_tree, &de->u.fict_root, sha_de); 713 714 /* 715 * Mark entry in CRC tree fictitious 716 */ 717 de->flags |= HAMMER_DEDUP_ENTRY_FICTITIOUS; 718 719 /* 720 * Upgrade checksum of the candidate and insert it into 721 * SHA subtree. If upgrade fails insert the candidate into 722 * Pass2 list. 723 */ 724 if (upgrade_chksum(scan_leaf, temp.sha_hash)) { 725 goto pass2_insert; 726 } 727 sha_de = RB_FIND(sha_dedup_entry_rb_tree, &de->u.fict_root, 728 &temp); 729 if (sha_de != NULL) 730 /* There is an entry with this SHA already, but the only 731 * RB-tree element at this point is that entry we just 732 * added. We know for sure these blocks are different 733 * (this is crc_failure branch) so treat it as SHA 734 * collision. 735 */ 736 goto sha256_failure; 737 738 sha_de = calloc(sizeof(*sha_de), 1); 739 sha_de->leaf = *scan_leaf; 740 memcpy(sha_de->sha_hash, temp.sha_hash, SHA256_DIGEST_LENGTH); 741 RB_INSERT(sha_dedup_entry_rb_tree, &de->u.fict_root, sha_de); 742 MemoryUse += sizeof(*sha_de); 743 goto upgrade_stats_sha; 744 } 745 746 upgrade_stats: 747 de->u.de.ref_blks += 1; 748 de->u.de.ref_size += scan_leaf->data_len; 749 return (1); 750 751 upgrade_stats_sha: 752 sha_de->ref_blks += 1; 753 sha_de->ref_size += scan_leaf->data_len; 754 return (1); 755 756 pass2_insert: 757 /* 758 * If in pass2 mode don't insert anything, fall through to 759 * terminate_early 760 */ 761 if ((flags & DEDUP_PASS2) == 0) { 762 pass2_de = calloc(sizeof(*pass2_de), 1); 763 pass2_de->leaf = *scan_leaf; 764 STAILQ_INSERT_TAIL(&pass2_dedup_queue, pass2_de, sq_entry); 765 dedup_skipped_size += scan_leaf->data_len; 766 return (1); 767 } 768 769 terminate_early: 770 /* 771 * Early termination path. Fixup stats. 772 */ 773 dedup_alloc_size += scan_leaf->data_len; 774 dedup_ref_size += scan_leaf->data_len; 775 return (0); 776 } 777 778 static int 779 upgrade_chksum(hammer_btree_leaf_elm_t leaf, u_int8_t *sha_hash) 780 { 781 struct hammer_ioc_data data; 782 char *buf = malloc(DEDUP_BUF); 783 SHA256_CTX ctx; 784 int error; 785 786 bzero(&data, sizeof(data)); 787 data.elm = leaf->base; 788 data.ubuf = buf; 789 data.size = DEDUP_BUF; 790 791 error = 0; 792 if (ioctl(glob_fd, HAMMERIOC_GET_DATA, &data) < 0) { 793 fprintf(stderr, "Get-data failed: %s\n", strerror(errno)); 794 error = 1; 795 goto done; 796 } 797 DedupDataReads += leaf->data_len; 798 799 if (data.leaf.data_len != leaf->data_len) { 800 error = 1; 801 goto done; 802 } 803 804 if (data.leaf.base.btype == HAMMER_BTREE_TYPE_RECORD && 805 data.leaf.base.rec_type == HAMMER_RECTYPE_DATA) { 806 SHA256_Init(&ctx); 807 SHA256_Update(&ctx, (void *)buf, data.leaf.data_len); 808 SHA256_Final(sha_hash, &ctx); 809 } 810 811 done: 812 free(buf); 813 return (error); 814 } 815 816 static void 817 sigInfo(int signo __unused) 818 { 819 SigInfoFlag = 1; 820 } 821 822 static void 823 scan_pfs(char *filesystem, scan_pfs_cb_t func, const char *id) 824 { 825 struct hammer_ioc_mirror_rw mirror; 826 hammer_ioc_mrecord_any_t mrec; 827 struct hammer_btree_leaf_elm elm; 828 char *buf = malloc(DEDUP_BUF); 829 char buf1[8]; 830 char buf2[8]; 831 int offset, bytes; 832 833 SigInfoFlag = 0; 834 DedupDataReads = 0; 835 DedupCurrentRecords = 0; 836 signal(SIGINFO, sigInfo); 837 838 bzero(&mirror, sizeof(mirror)); 839 hammer_key_beg_init(&mirror.key_beg); 840 hammer_key_end_init(&mirror.key_end); 841 842 mirror.tid_beg = glob_pfs.ondisk->sync_beg_tid; 843 mirror.tid_end = glob_pfs.ondisk->sync_end_tid; 844 mirror.head.flags |= HAMMER_IOC_MIRROR_NODATA; /* we want only keys */ 845 mirror.ubuf = buf; 846 mirror.size = DEDUP_BUF; 847 mirror.pfs_id = glob_pfs.pfs_id; 848 mirror.shared_uuid = glob_pfs.ondisk->shared_uuid; 849 850 if (VerboseOpt && DedupCrcStart == 0) { 851 printf("%s %s: objspace %016jx:%04x %016jx:%04x\n", 852 id, filesystem, 853 (uintmax_t)mirror.key_beg.obj_id, 854 mirror.key_beg.localization, 855 (uintmax_t)mirror.key_end.obj_id, 856 mirror.key_end.localization); 857 printf("%s %s: pfs_id %d\n", 858 id, filesystem, glob_pfs.pfs_id); 859 } 860 fflush(stdout); 861 fflush(stderr); 862 863 do { 864 mirror.count = 0; 865 mirror.pfs_id = glob_pfs.pfs_id; 866 mirror.shared_uuid = glob_pfs.ondisk->shared_uuid; 867 if (ioctl(glob_fd, HAMMERIOC_MIRROR_READ, &mirror) < 0) { 868 fprintf(stderr, "Mirror-read %s failed: %s\n", 869 filesystem, strerror(errno)); 870 exit(1); 871 } 872 if (mirror.head.flags & HAMMER_IOC_HEAD_ERROR) { 873 fprintf(stderr, "Mirror-read %s fatal error %d\n", 874 filesystem, mirror.head.error); 875 exit(1); 876 } 877 if (mirror.count) { 878 offset = 0; 879 while (offset < mirror.count) { 880 mrec = (void *)((char *)buf + offset); 881 bytes = HAMMER_HEAD_DOALIGN(mrec->head.rec_size); 882 if (offset + bytes > mirror.count) { 883 fprintf(stderr, "Misaligned record\n"); 884 exit(1); 885 } 886 assert((mrec->head.type & 887 HAMMER_MRECF_TYPE_MASK) == 888 HAMMER_MREC_TYPE_REC); 889 offset += bytes; 890 elm = mrec->rec.leaf; 891 if (elm.base.btype != HAMMER_BTREE_TYPE_RECORD) 892 continue; 893 if (elm.base.rec_type != HAMMER_RECTYPE_DATA) 894 continue; 895 ++DedupCurrentRecords; 896 if (DedupCrcStart != DedupCrcEnd) { 897 if (elm.data_crc < DedupCrcStart) 898 continue; 899 if (DedupCrcEnd && 900 elm.data_crc >= DedupCrcEnd) { 901 continue; 902 } 903 } 904 func(&elm, 0); 905 } 906 } 907 mirror.key_beg = mirror.key_cur; 908 if (DidInterrupt) { 909 fprintf(stderr, "Interrupted\n"); 910 exit(1); 911 } 912 if (SigInfoFlag) { 913 if (DedupTotalRecords) { 914 humanize_unsigned(buf1, sizeof(buf1), 915 DedupDataReads, 916 "B", 1024); 917 humanize_unsigned(buf2, sizeof(buf2), 918 dedup_successes_bytes, 919 "B", 1024); 920 fprintf(stderr, "%s count %7jd/%jd " 921 "(%02d.%02d%%) " 922 "ioread %s newddup %s\n", 923 id, 924 (intmax_t)DedupCurrentRecords, 925 (intmax_t)DedupTotalRecords, 926 (int)(DedupCurrentRecords * 100 / 927 DedupTotalRecords), 928 (int)(DedupCurrentRecords * 10000 / 929 DedupTotalRecords % 100), 930 buf1, buf2); 931 } else { 932 fprintf(stderr, "%s count %-7jd\n", 933 id, 934 (intmax_t)DedupCurrentRecords); 935 } 936 SigInfoFlag = 0; 937 } 938 } while (mirror.count != 0); 939 940 signal(SIGINFO, SIG_IGN); 941 942 free(buf); 943 } 944 945 static void 946 dump_simulated_dedup(void) 947 { 948 struct sim_dedup_entry *sim_de; 949 950 printf("=== Dumping simulated dedup entries:\n"); 951 RB_FOREACH(sim_de, sim_dedup_entry_rb_tree, &sim_dedup_tree) { 952 printf("\tcrc=%08x cnt=%ju size=%ju\n", 953 sim_de->crc, 954 (intmax_t)sim_de->ref_blks, 955 (intmax_t)sim_de->ref_size); 956 } 957 printf("end of dump ===\n"); 958 } 959 960 static void 961 dump_real_dedup(void) 962 { 963 struct dedup_entry *de; 964 struct sha_dedup_entry *sha_de; 965 int i; 966 967 printf("=== Dumping dedup entries:\n"); 968 RB_FOREACH(de, dedup_entry_rb_tree, &dedup_tree) { 969 if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) { 970 printf("\tcrc=%08x fictitious\n", de->leaf.data_crc); 971 972 RB_FOREACH(sha_de, sha_dedup_entry_rb_tree, &de->u.fict_root) { 973 printf("\t\tcrc=%08x cnt=%ju size=%ju\n\t" 974 "\t\tsha=", 975 sha_de->leaf.data_crc, 976 (intmax_t)sha_de->ref_blks, 977 (intmax_t)sha_de->ref_size); 978 for (i = 0; i < SHA256_DIGEST_LENGTH; ++i) 979 printf("%02x", sha_de->sha_hash[i]); 980 printf("\n"); 981 } 982 } else { 983 printf("\tcrc=%08x cnt=%ju size=%ju\n", 984 de->leaf.data_crc, 985 (intmax_t)de->u.de.ref_blks, 986 (intmax_t)de->u.de.ref_size); 987 } 988 } 989 printf("end of dump ===\n"); 990 } 991 992 static void 993 dedup_usage(int code) 994 { 995 fprintf(stderr, 996 "hammer dedup-simulate <filesystem>\n" 997 "hammer dedup <filesystem>\n" 998 ); 999 exit(code); 1000 } 1001