1 /* 2 * Copyright (c) 2010 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Ilya Dryomov <idryomov@gmail.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 35 #include <libutil.h> 36 #include <crypto/sha2/sha2.h> 37 38 #include "hammer.h" 39 40 #define DEDUP_BUF (64 * 1024) 41 42 /* Sorted list of block CRCs - light version for dedup-simulate */ 43 struct sim_dedup_entry_rb_tree; 44 RB_HEAD(sim_dedup_entry_rb_tree, sim_dedup_entry) sim_dedup_tree = 45 RB_INITIALIZER(&sim_dedup_tree); 46 RB_PROTOTYPE2(sim_dedup_entry_rb_tree, sim_dedup_entry, rb_entry, 47 rb_sim_dedup_entry_compare, hammer_crc_t); 48 49 struct sim_dedup_entry { 50 hammer_crc_t crc; 51 u_int64_t ref_blks; /* number of blocks referenced */ 52 u_int64_t ref_size; /* size of data referenced */ 53 RB_ENTRY(sim_dedup_entry) rb_entry; 54 }; 55 56 struct dedup_entry { 57 struct hammer_btree_leaf_elm leaf; 58 union { 59 struct { 60 u_int64_t ref_blks; 61 u_int64_t ref_size; 62 } de; 63 RB_HEAD(sha_dedup_entry_rb_tree, sha_dedup_entry) fict_root; 64 } u; 65 u_int8_t flags; 66 RB_ENTRY(dedup_entry) rb_entry; 67 }; 68 69 #define HAMMER_DEDUP_ENTRY_FICTITIOUS 0x0001 70 71 struct sha_dedup_entry { 72 struct hammer_btree_leaf_elm leaf; 73 u_int64_t ref_blks; 74 u_int64_t ref_size; 75 u_int8_t sha_hash[SHA256_DIGEST_LENGTH]; 76 RB_ENTRY(sha_dedup_entry) fict_entry; 77 }; 78 79 /* Sorted list of HAMMER B-Tree keys */ 80 struct dedup_entry_rb_tree; 81 struct sha_dedup_entry_rb_tree; 82 83 RB_HEAD(dedup_entry_rb_tree, dedup_entry) dedup_tree = 84 RB_INITIALIZER(&dedup_tree); 85 RB_PROTOTYPE2(dedup_entry_rb_tree, dedup_entry, rb_entry, 86 rb_dedup_entry_compare, hammer_crc_t); 87 88 RB_PROTOTYPE(sha_dedup_entry_rb_tree, sha_dedup_entry, fict_entry, 89 rb_sha_dedup_entry_compare); 90 91 /* 92 * Pass2 list - contains entries that were not dedup'ed because ioctl failed 93 */ 94 STAILQ_HEAD(, pass2_dedup_entry) pass2_dedup_queue = 95 STAILQ_HEAD_INITIALIZER(pass2_dedup_queue); 96 97 struct pass2_dedup_entry { 98 struct hammer_btree_leaf_elm leaf; 99 STAILQ_ENTRY(pass2_dedup_entry) sq_entry; 100 }; 101 102 #define DEDUP_PASS2 0x0001 /* process_btree_elm() mode */ 103 104 static int SigInfoFlag; 105 static int SigAlrmFlag; 106 static int64_t DedupDataReads; 107 static int64_t DedupCurrentRecords; 108 static int64_t DedupTotalRecords; 109 static u_int32_t DedupCrcStart; 110 static u_int32_t DedupCrcEnd; 111 static u_int64_t MemoryUse; 112 113 /* PFS global ids - we deal with just one PFS at a run */ 114 int glob_fd; 115 struct hammer_ioc_pseudofs_rw glob_pfs; 116 117 /* 118 * Global accounting variables 119 * 120 * Last three don't have to be 64-bit, just to be safe.. 121 */ 122 u_int64_t dedup_alloc_size = 0; 123 u_int64_t dedup_ref_size = 0; 124 u_int64_t dedup_skipped_size = 0; 125 u_int64_t dedup_crc_failures = 0; 126 u_int64_t dedup_sha_failures = 0; 127 u_int64_t dedup_underflows = 0; 128 u_int64_t dedup_successes_count = 0; 129 u_int64_t dedup_successes_bytes = 0; 130 131 static int rb_sim_dedup_entry_compare(struct sim_dedup_entry *sim_de1, 132 struct sim_dedup_entry *sim_de2); 133 static int rb_dedup_entry_compare(struct dedup_entry *de1, 134 struct dedup_entry *de2); 135 static int rb_sha_dedup_entry_compare(struct sha_dedup_entry *sha_de1, 136 struct sha_dedup_entry *sha_de2); 137 typedef int (*scan_pfs_cb_t)(hammer_btree_leaf_elm_t scan_leaf, int flags); 138 static void scan_pfs(char *filesystem, scan_pfs_cb_t func, const char *id); 139 static int collect_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags); 140 static int count_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags); 141 static int process_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags); 142 static int upgrade_chksum(hammer_btree_leaf_elm_t leaf, u_int8_t *sha_hash); 143 static void dump_simulated_dedup(void); 144 static void dump_real_dedup(void); 145 static void dedup_usage(int code); 146 147 RB_GENERATE2(sim_dedup_entry_rb_tree, sim_dedup_entry, rb_entry, 148 rb_sim_dedup_entry_compare, hammer_crc_t, crc); 149 RB_GENERATE2(dedup_entry_rb_tree, dedup_entry, rb_entry, 150 rb_dedup_entry_compare, hammer_crc_t, leaf.data_crc); 151 RB_GENERATE(sha_dedup_entry_rb_tree, sha_dedup_entry, fict_entry, 152 rb_sha_dedup_entry_compare); 153 154 static int 155 rb_sim_dedup_entry_compare(struct sim_dedup_entry *sim_de1, 156 struct sim_dedup_entry *sim_de2) 157 { 158 if (sim_de1->crc < sim_de2->crc) 159 return (-1); 160 if (sim_de1->crc > sim_de2->crc) 161 return (1); 162 163 return (0); 164 } 165 166 static int 167 rb_dedup_entry_compare(struct dedup_entry *de1, struct dedup_entry *de2) 168 { 169 if (de1->leaf.data_crc < de2->leaf.data_crc) 170 return (-1); 171 if (de1->leaf.data_crc > de2->leaf.data_crc) 172 return (1); 173 174 return (0); 175 } 176 177 static int 178 rb_sha_dedup_entry_compare(struct sha_dedup_entry *sha_de1, 179 struct sha_dedup_entry *sha_de2) 180 { 181 unsigned long *h1 = (unsigned long *)&sha_de1->sha_hash; 182 unsigned long *h2 = (unsigned long *)&sha_de2->sha_hash; 183 int i; 184 185 for (i = 0; i < SHA256_DIGEST_LENGTH / (int)sizeof(unsigned long); ++i) { 186 if (h1[i] < h2[i]) 187 return (-1); 188 if (h1[i] > h2[i]) 189 return (1); 190 } 191 192 return (0); 193 } 194 195 /* 196 * dedup-simulate <filesystem> 197 */ 198 void 199 hammer_cmd_dedup_simulate(char **av, int ac) 200 { 201 struct sim_dedup_entry *sim_de; 202 203 if (ac != 1) 204 dedup_usage(1); 205 206 glob_fd = getpfs(&glob_pfs, av[0]); 207 208 /* 209 * Collection passes (memory limited) 210 */ 211 printf("Dedup-simulate running\n"); 212 do { 213 DedupCrcStart = DedupCrcEnd; 214 DedupCrcEnd = 0; 215 MemoryUse = 0; 216 217 if (VerboseOpt) { 218 printf("B-Tree pass crc-range %08x-max\n", 219 DedupCrcStart); 220 fflush(stdout); 221 } 222 scan_pfs(av[0], collect_btree_elm, "simu-pass"); 223 224 if (VerboseOpt >= 2) 225 dump_simulated_dedup(); 226 227 /* 228 * Calculate simulated dedup ratio and get rid of the tree 229 */ 230 while ((sim_de = RB_ROOT(&sim_dedup_tree)) != NULL) { 231 assert(sim_de->ref_blks != 0); 232 dedup_ref_size += sim_de->ref_size; 233 dedup_alloc_size += sim_de->ref_size / sim_de->ref_blks; 234 235 RB_REMOVE(sim_dedup_entry_rb_tree, &sim_dedup_tree, sim_de); 236 free(sim_de); 237 } 238 if (DedupCrcEnd && VerboseOpt == 0) 239 printf("."); 240 } while (DedupCrcEnd); 241 242 printf("Dedup-simulate %s succeeded\n", av[0]); 243 relpfs(glob_fd, &glob_pfs); 244 245 printf("Simulated dedup ratio = %.2f\n", 246 (dedup_alloc_size != 0) ? 247 (double)dedup_ref_size / dedup_alloc_size : 0); 248 } 249 250 /* 251 * dedup <filesystem> 252 */ 253 void 254 hammer_cmd_dedup(char **av, int ac) 255 { 256 struct dedup_entry *de; 257 struct sha_dedup_entry *sha_de; 258 struct pass2_dedup_entry *pass2_de; 259 char *tmp; 260 char buf[8]; 261 int needfree = 0; 262 263 if (TimeoutOpt > 0) 264 alarm(TimeoutOpt); 265 266 if (ac != 1) 267 dedup_usage(1); 268 269 STAILQ_INIT(&pass2_dedup_queue); 270 271 glob_fd = getpfs(&glob_pfs, av[0]); 272 273 /* 274 * A cycle file is _required_ for resuming dedup after the timeout 275 * specified with -t has expired. If no -c option, then place a 276 * .dedup.cycle file either in the PFS snapshots directory or in 277 * the default snapshots directory. 278 */ 279 if (!CyclePath) { 280 if (glob_pfs.ondisk->snapshots[0] != '/') 281 asprintf(&tmp, "%s/%s/.dedup.cycle", 282 SNAPSHOTS_BASE, av[0]); 283 else 284 asprintf(&tmp, "%s/.dedup.cycle", 285 glob_pfs.ondisk->snapshots); 286 CyclePath = tmp; 287 needfree = 1; 288 } 289 290 /* 291 * Pre-pass to cache the btree 292 */ 293 scan_pfs(av[0], count_btree_elm, "pre-pass "); 294 DedupTotalRecords = DedupCurrentRecords; 295 296 /* 297 * Collection passes (memory limited) 298 */ 299 printf("Dedup running\n"); 300 do { 301 DedupCrcStart = DedupCrcEnd; 302 DedupCrcEnd = 0; 303 MemoryUse = 0; 304 305 if (VerboseOpt) { 306 printf("B-Tree pass crc-range %08x-max\n", 307 DedupCrcStart); 308 fflush(stdout); 309 } 310 scan_pfs(av[0], process_btree_elm, "main-pass"); 311 312 while ((pass2_de = STAILQ_FIRST(&pass2_dedup_queue)) != NULL) { 313 if (process_btree_elm(&pass2_de->leaf, DEDUP_PASS2)) 314 dedup_skipped_size -= pass2_de->leaf.data_len; 315 316 STAILQ_REMOVE_HEAD(&pass2_dedup_queue, sq_entry); 317 free(pass2_de); 318 } 319 assert(STAILQ_EMPTY(&pass2_dedup_queue)); 320 321 if (VerboseOpt >= 2) 322 dump_real_dedup(); 323 324 /* 325 * Calculate dedup ratio and get rid of the trees 326 */ 327 while ((de = RB_ROOT(&dedup_tree)) != NULL) { 328 if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) { 329 while ((sha_de = RB_ROOT(&de->u.fict_root)) != NULL) { 330 assert(sha_de->ref_blks != 0); 331 dedup_ref_size += sha_de->ref_size; 332 dedup_alloc_size += sha_de->ref_size / sha_de->ref_blks; 333 334 RB_REMOVE(sha_dedup_entry_rb_tree, 335 &de->u.fict_root, sha_de); 336 free(sha_de); 337 } 338 assert(RB_EMPTY(&de->u.fict_root)); 339 } else { 340 assert(de->u.de.ref_blks != 0); 341 dedup_ref_size += de->u.de.ref_size; 342 dedup_alloc_size += de->u.de.ref_size / de->u.de.ref_blks; 343 } 344 345 RB_REMOVE(dedup_entry_rb_tree, &dedup_tree, de); 346 free(de); 347 } 348 assert(RB_EMPTY(&dedup_tree)); 349 if (DedupCrcEnd && VerboseOpt == 0) 350 printf("."); 351 } while (DedupCrcEnd); 352 353 printf("Dedup %s succeeded\n", av[0]); 354 relpfs(glob_fd, &glob_pfs); 355 356 humanize_unsigned(buf, sizeof(buf), dedup_ref_size, "B", 1024); 357 printf("Dedup ratio = %.2f (in this run)\n" 358 " %8s referenced\n", 359 ((dedup_alloc_size != 0) ? 360 (double)dedup_ref_size / dedup_alloc_size : 0), 361 buf 362 ); 363 humanize_unsigned(buf, sizeof(buf), dedup_alloc_size, "B", 1024); 364 printf(" %8s allocated\n", buf); 365 humanize_unsigned(buf, sizeof(buf), dedup_skipped_size, "B", 1024); 366 printf(" %8s skipped\n", buf); 367 printf(" %8jd CRC collisions\n" 368 " %8jd SHA collisions\n" 369 " %8jd big-block underflows\n" 370 " %8jd new dedup records\n" 371 " %8jd new dedup bytes\n", 372 (intmax_t)dedup_crc_failures, 373 (intmax_t)dedup_sha_failures, 374 (intmax_t)dedup_underflows, 375 (intmax_t)dedup_successes_count, 376 (intmax_t)dedup_successes_bytes 377 ); 378 379 /* Once completed remove cycle file */ 380 hammer_reset_cycle(); 381 382 /* We don't want to mess up with other directives */ 383 if (needfree) { 384 free(tmp); 385 CyclePath = NULL; 386 } 387 } 388 389 static int 390 count_btree_elm(hammer_btree_leaf_elm_t scan_leaf __unused, int flags __unused) 391 { 392 return(1); 393 } 394 395 static int 396 collect_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags __unused) 397 { 398 struct sim_dedup_entry *sim_de; 399 400 /* 401 * If we are using too much memory we have to clean some out, which 402 * will cause the run to use multiple passes. Be careful of integer 403 * overflows! 404 */ 405 if (MemoryUse > MemoryLimit) { 406 DedupCrcEnd = DedupCrcStart + 407 (u_int32_t)(DedupCrcEnd - DedupCrcStart - 1) / 2; 408 if (VerboseOpt) { 409 printf("memory limit crc-range %08x-%08x\n", 410 DedupCrcStart, DedupCrcEnd); 411 fflush(stdout); 412 } 413 for (;;) { 414 sim_de = RB_MAX(sim_dedup_entry_rb_tree, 415 &sim_dedup_tree); 416 if (sim_de == NULL || sim_de->crc < DedupCrcEnd) 417 break; 418 RB_REMOVE(sim_dedup_entry_rb_tree, 419 &sim_dedup_tree, sim_de); 420 MemoryUse -= sizeof(*sim_de); 421 free(sim_de); 422 } 423 } 424 425 /* 426 * Collect statistics based on the CRC only, do not try to read 427 * any data blocks or run SHA hashes. 428 */ 429 sim_de = RB_LOOKUP(sim_dedup_entry_rb_tree, &sim_dedup_tree, 430 scan_leaf->data_crc); 431 432 if (sim_de == NULL) { 433 sim_de = calloc(sizeof(*sim_de), 1); 434 sim_de->crc = scan_leaf->data_crc; 435 RB_INSERT(sim_dedup_entry_rb_tree, &sim_dedup_tree, sim_de); 436 MemoryUse += sizeof(*sim_de); 437 } 438 439 sim_de->ref_blks += 1; 440 sim_de->ref_size += scan_leaf->data_len; 441 return (1); 442 } 443 444 static __inline int 445 validate_dedup_pair(hammer_btree_leaf_elm_t p, hammer_btree_leaf_elm_t q) 446 { 447 if ((p->data_offset & HAMMER_OFF_ZONE_MASK) != 448 (q->data_offset & HAMMER_OFF_ZONE_MASK)) { 449 return (1); 450 } 451 if (p->data_len != q->data_len) { 452 return (1); 453 } 454 455 return (0); 456 } 457 458 #define DEDUP_TECH_FAILURE 1 459 #define DEDUP_CMP_FAILURE 2 460 #define DEDUP_INVALID_ZONE 3 461 #define DEDUP_UNDERFLOW 4 462 #define DEDUP_VERS_FAILURE 5 463 464 static __inline int 465 deduplicate(hammer_btree_leaf_elm_t p, hammer_btree_leaf_elm_t q) 466 { 467 struct hammer_ioc_dedup dedup; 468 469 bzero(&dedup, sizeof(dedup)); 470 471 /* 472 * If data_offset fields are the same there is no need to run ioctl, 473 * candidate is already dedup'ed. 474 */ 475 if (p->data_offset == q->data_offset) { 476 return (0); 477 } 478 479 dedup.elm1 = p->base; 480 dedup.elm2 = q->base; 481 RunningIoctl = 1; 482 if (ioctl(glob_fd, HAMMERIOC_DEDUP, &dedup) < 0) { 483 if (errno == EOPNOTSUPP) { 484 /* must be at least version 5 */ 485 return (DEDUP_VERS_FAILURE); 486 } 487 /* Technical failure - locking or w/e */ 488 return (DEDUP_TECH_FAILURE); 489 } 490 if (dedup.head.flags & HAMMER_IOC_DEDUP_CMP_FAILURE) { 491 return (DEDUP_CMP_FAILURE); 492 } 493 if (dedup.head.flags & HAMMER_IOC_DEDUP_INVALID_ZONE) { 494 return (DEDUP_INVALID_ZONE); 495 } 496 if (dedup.head.flags & HAMMER_IOC_DEDUP_UNDERFLOW) { 497 return (DEDUP_UNDERFLOW); 498 } 499 RunningIoctl = 0; 500 ++dedup_successes_count; 501 dedup_successes_bytes += p->data_len; 502 return (0); 503 } 504 505 static int 506 process_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags) 507 { 508 struct dedup_entry *de; 509 struct sha_dedup_entry *sha_de, temp; 510 struct pass2_dedup_entry *pass2_de; 511 int error; 512 513 /* 514 * If we are using too much memory we have to clean some out, which 515 * will cause the run to use multiple passes. Be careful of integer 516 * overflows! 517 */ 518 while (MemoryUse > MemoryLimit) { 519 DedupCrcEnd = DedupCrcStart + 520 (u_int32_t)(DedupCrcEnd - DedupCrcStart - 1) / 2; 521 if (VerboseOpt) { 522 printf("memory limit crc-range %08x-%08x\n", 523 DedupCrcStart, DedupCrcEnd); 524 fflush(stdout); 525 } 526 527 for (;;) { 528 de = RB_MAX(dedup_entry_rb_tree, &dedup_tree); 529 if (de == NULL || de->leaf.data_crc < DedupCrcEnd) 530 break; 531 if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) { 532 while ((sha_de = RB_ROOT(&de->u.fict_root)) != 533 NULL) { 534 RB_REMOVE(sha_dedup_entry_rb_tree, 535 &de->u.fict_root, sha_de); 536 MemoryUse -= sizeof(*sha_de); 537 free(sha_de); 538 } 539 } 540 RB_REMOVE(dedup_entry_rb_tree, &dedup_tree, de); 541 MemoryUse -= sizeof(*de); 542 free(de); 543 } 544 } 545 546 /* 547 * Collect statistics based on the CRC. Colliding CRCs usually 548 * cause a SHA sub-tree to be created under the de. 549 * 550 * Trivial case if de not found. 551 */ 552 de = RB_LOOKUP(dedup_entry_rb_tree, &dedup_tree, scan_leaf->data_crc); 553 if (de == NULL) { 554 de = calloc(sizeof(*de), 1); 555 de->leaf = *scan_leaf; 556 RB_INSERT(dedup_entry_rb_tree, &dedup_tree, de); 557 MemoryUse += sizeof(*de); 558 goto upgrade_stats; 559 } 560 561 /* 562 * Found entry in CRC tree 563 */ 564 if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) { 565 /* 566 * Optimize the case where a CRC failure results in multiple 567 * SHA entries. If we unconditionally issue a data-read a 568 * degenerate situation where a colliding CRC's second SHA 569 * entry contains the lion's share of the deduplication 570 * candidates will result in excessive data block reads. 571 * 572 * Deal with the degenerate case by looking for a matching 573 * data_offset/data_len in the SHA elements we already have 574 * before reading the data block and generating a new SHA. 575 */ 576 RB_FOREACH(sha_de, sha_dedup_entry_rb_tree, &de->u.fict_root) { 577 if (sha_de->leaf.data_offset == 578 scan_leaf->data_offset && 579 sha_de->leaf.data_len == scan_leaf->data_len) { 580 memcpy(temp.sha_hash, sha_de->sha_hash, 581 SHA256_DIGEST_LENGTH); 582 break; 583 } 584 } 585 586 /* 587 * Entry in CRC tree is fictitious, so we already had problems 588 * with this CRC. Upgrade (compute SHA) the candidate and 589 * dive into SHA subtree. If upgrade fails insert the candidate 590 * into Pass2 list (it will be processed later). 591 */ 592 if (sha_de == NULL) { 593 if (upgrade_chksum(scan_leaf, temp.sha_hash)) 594 goto pass2_insert; 595 596 sha_de = RB_FIND(sha_dedup_entry_rb_tree, 597 &de->u.fict_root, &temp); 598 } 599 600 /* 601 * Nothing in SHA subtree so far, so this is a new 602 * 'dataset'. Insert new entry into SHA subtree. 603 */ 604 if (sha_de == NULL) { 605 sha_de = calloc(sizeof(*sha_de), 1); 606 sha_de->leaf = *scan_leaf; 607 memcpy(sha_de->sha_hash, temp.sha_hash, 608 SHA256_DIGEST_LENGTH); 609 RB_INSERT(sha_dedup_entry_rb_tree, &de->u.fict_root, 610 sha_de); 611 MemoryUse += sizeof(*sha_de); 612 goto upgrade_stats_sha; 613 } 614 615 /* 616 * Found entry in SHA subtree, it means we have a potential 617 * dedup pair. Validate it (zones have to match and data_len 618 * field have to be the same too. If validation fails, treat 619 * it as a SHA collision (jump to sha256_failure). 620 */ 621 if (validate_dedup_pair(&sha_de->leaf, scan_leaf)) 622 goto sha256_failure; 623 624 /* 625 * We have a valid dedup pair (SHA match, validated). 626 * 627 * In case of technical failure (dedup pair was good, but 628 * ioctl failed anyways) insert the candidate into Pass2 list 629 * (we will try to dedup it after we are done with the rest of 630 * the tree). 631 * 632 * If ioctl fails because either of blocks is in the non-dedup 633 * zone (we can dedup only in LARGE_DATA and SMALL_DATA) don't 634 * bother with the candidate and terminate early. 635 * 636 * If ioctl fails because of big-block underflow replace the 637 * leaf node that found dedup entry represents with scan_leaf. 638 */ 639 error = deduplicate(&sha_de->leaf, scan_leaf); 640 switch(error) { 641 case 0: 642 goto upgrade_stats_sha; 643 case DEDUP_TECH_FAILURE: 644 goto pass2_insert; 645 case DEDUP_CMP_FAILURE: 646 goto sha256_failure; 647 case DEDUP_INVALID_ZONE: 648 goto terminate_early; 649 case DEDUP_UNDERFLOW: 650 ++dedup_underflows; 651 sha_de->leaf = *scan_leaf; 652 memcpy(sha_de->sha_hash, temp.sha_hash, 653 SHA256_DIGEST_LENGTH); 654 goto upgrade_stats_sha; 655 case DEDUP_VERS_FAILURE: 656 fprintf(stderr, 657 "HAMMER filesystem must be at least " 658 "version 5 to dedup\n"); 659 exit (1); 660 default: 661 fprintf(stderr, "Unknown error\n"); 662 goto terminate_early; 663 } 664 665 /* 666 * Ooh la la.. SHA-256 collision. Terminate early, there's 667 * nothing we can do here. 668 */ 669 sha256_failure: 670 ++dedup_sha_failures; 671 goto terminate_early; 672 } else { 673 /* 674 * Candidate CRC is good for now (we found an entry in CRC 675 * tree and it's not fictitious). This means we have a 676 * potential dedup pair. 677 */ 678 if (validate_dedup_pair(&de->leaf, scan_leaf)) 679 goto crc_failure; 680 681 /* 682 * We have a valid dedup pair (CRC match, validated) 683 */ 684 error = deduplicate(&de->leaf, scan_leaf); 685 switch(error) { 686 case 0: 687 goto upgrade_stats; 688 case DEDUP_TECH_FAILURE: 689 goto pass2_insert; 690 case DEDUP_CMP_FAILURE: 691 goto crc_failure; 692 case DEDUP_INVALID_ZONE: 693 goto terminate_early; 694 case DEDUP_UNDERFLOW: 695 ++dedup_underflows; 696 de->leaf = *scan_leaf; 697 goto upgrade_stats; 698 case DEDUP_VERS_FAILURE: 699 fprintf(stderr, 700 "HAMMER filesystem must be at least " 701 "version 5 to dedup\n"); 702 exit (1); 703 default: 704 fprintf(stderr, "Unknown error\n"); 705 goto terminate_early; 706 } 707 708 crc_failure: 709 /* 710 * We got a CRC collision - either ioctl failed because of 711 * the comparison failure or validation of the potential 712 * dedup pair went bad. In all cases insert both blocks 713 * into SHA subtree (this requires checksum upgrade) and mark 714 * entry that corresponds to this CRC in the CRC tree 715 * fictitious, so that all futher operations with this CRC go 716 * through SHA subtree. 717 */ 718 ++dedup_crc_failures; 719 720 /* 721 * Insert block that was represented by now fictitious dedup 722 * entry (create a new SHA entry and preserve stats of the 723 * old CRC one). If checksum upgrade fails insert the 724 * candidate into Pass2 list and return - keep both trees 725 * unmodified. 726 */ 727 sha_de = calloc(sizeof(*sha_de), 1); 728 sha_de->leaf = de->leaf; 729 sha_de->ref_blks = de->u.de.ref_blks; 730 sha_de->ref_size = de->u.de.ref_size; 731 if (upgrade_chksum(&sha_de->leaf, sha_de->sha_hash)) { 732 free(sha_de); 733 goto pass2_insert; 734 } 735 MemoryUse += sizeof(*sha_de); 736 737 RB_INIT(&de->u.fict_root); 738 /* 739 * Here we can insert without prior checking because the tree 740 * is empty at this point 741 */ 742 RB_INSERT(sha_dedup_entry_rb_tree, &de->u.fict_root, sha_de); 743 744 /* 745 * Mark entry in CRC tree fictitious 746 */ 747 de->flags |= HAMMER_DEDUP_ENTRY_FICTITIOUS; 748 749 /* 750 * Upgrade checksum of the candidate and insert it into 751 * SHA subtree. If upgrade fails insert the candidate into 752 * Pass2 list. 753 */ 754 if (upgrade_chksum(scan_leaf, temp.sha_hash)) { 755 goto pass2_insert; 756 } 757 sha_de = RB_FIND(sha_dedup_entry_rb_tree, &de->u.fict_root, 758 &temp); 759 if (sha_de != NULL) 760 /* There is an entry with this SHA already, but the only 761 * RB-tree element at this point is that entry we just 762 * added. We know for sure these blocks are different 763 * (this is crc_failure branch) so treat it as SHA 764 * collision. 765 */ 766 goto sha256_failure; 767 768 sha_de = calloc(sizeof(*sha_de), 1); 769 sha_de->leaf = *scan_leaf; 770 memcpy(sha_de->sha_hash, temp.sha_hash, SHA256_DIGEST_LENGTH); 771 RB_INSERT(sha_dedup_entry_rb_tree, &de->u.fict_root, sha_de); 772 MemoryUse += sizeof(*sha_de); 773 goto upgrade_stats_sha; 774 } 775 776 upgrade_stats: 777 de->u.de.ref_blks += 1; 778 de->u.de.ref_size += scan_leaf->data_len; 779 return (1); 780 781 upgrade_stats_sha: 782 sha_de->ref_blks += 1; 783 sha_de->ref_size += scan_leaf->data_len; 784 return (1); 785 786 pass2_insert: 787 /* 788 * If in pass2 mode don't insert anything, fall through to 789 * terminate_early 790 */ 791 if ((flags & DEDUP_PASS2) == 0) { 792 pass2_de = calloc(sizeof(*pass2_de), 1); 793 pass2_de->leaf = *scan_leaf; 794 STAILQ_INSERT_TAIL(&pass2_dedup_queue, pass2_de, sq_entry); 795 dedup_skipped_size += scan_leaf->data_len; 796 return (1); 797 } 798 799 terminate_early: 800 /* 801 * Early termination path. Fixup stats. 802 */ 803 dedup_alloc_size += scan_leaf->data_len; 804 dedup_ref_size += scan_leaf->data_len; 805 return (0); 806 } 807 808 static int 809 upgrade_chksum(hammer_btree_leaf_elm_t leaf, u_int8_t *sha_hash) 810 { 811 struct hammer_ioc_data data; 812 char *buf = malloc(DEDUP_BUF); 813 SHA256_CTX ctx; 814 int error; 815 816 bzero(&data, sizeof(data)); 817 data.elm = leaf->base; 818 data.ubuf = buf; 819 data.size = DEDUP_BUF; 820 821 error = 0; 822 if (ioctl(glob_fd, HAMMERIOC_GET_DATA, &data) < 0) { 823 fprintf(stderr, "Get-data failed: %s\n", strerror(errno)); 824 error = 1; 825 goto done; 826 } 827 DedupDataReads += leaf->data_len; 828 829 if (data.leaf.data_len != leaf->data_len) { 830 error = 1; 831 goto done; 832 } 833 834 if (data.leaf.base.btype == HAMMER_BTREE_TYPE_RECORD && 835 data.leaf.base.rec_type == HAMMER_RECTYPE_DATA) { 836 SHA256_Init(&ctx); 837 SHA256_Update(&ctx, (void *)buf, data.leaf.data_len); 838 SHA256_Final(sha_hash, &ctx); 839 } 840 841 done: 842 free(buf); 843 return (error); 844 } 845 846 static void 847 sigAlrm(int signo __unused) 848 { 849 SigAlrmFlag = 1; 850 } 851 852 static void 853 sigInfo(int signo __unused) 854 { 855 SigInfoFlag = 1; 856 } 857 858 static void 859 scan_pfs(char *filesystem, scan_pfs_cb_t func, const char *id) 860 { 861 struct hammer_ioc_mirror_rw mirror; 862 hammer_ioc_mrecord_any_t mrec; 863 struct hammer_btree_leaf_elm elm; 864 char *buf = malloc(DEDUP_BUF); 865 char buf1[8]; 866 char buf2[8]; 867 int offset, bytes; 868 869 SigInfoFlag = 0; 870 DedupDataReads = 0; 871 DedupCurrentRecords = 0; 872 signal(SIGINFO, sigInfo); 873 signal(SIGALRM, sigAlrm); 874 875 /* 876 * Deduplication happens per element so hammer(8) is in full 877 * control of the ioctl()s to actually perform it. SIGALRM 878 * needs to be handled within hammer(8) but a checkpoint 879 * is needed for resuming. Use cycle file for that. 880 * 881 * Try to obtain the previous obj_id from the cycle file and 882 * if not available just start from the beginning. 883 */ 884 bzero(&mirror, sizeof(mirror)); 885 hammer_key_beg_init(&mirror.key_beg); 886 hammer_get_cycle(&mirror.key_beg, &mirror.tid_beg); 887 888 if (mirror.key_beg.obj_id != (int64_t)HAMMER_MIN_OBJID) { 889 if (VerboseOpt) 890 fprintf(stderr, "%s: mirror-read: Resuming at object %016jx\n", 891 id, (uintmax_t)mirror.key_beg.obj_id); 892 } 893 894 hammer_key_end_init(&mirror.key_end); 895 896 mirror.tid_beg = glob_pfs.ondisk->sync_beg_tid; 897 mirror.tid_end = glob_pfs.ondisk->sync_end_tid; 898 mirror.head.flags |= HAMMER_IOC_MIRROR_NODATA; /* we want only keys */ 899 mirror.ubuf = buf; 900 mirror.size = DEDUP_BUF; 901 mirror.pfs_id = glob_pfs.pfs_id; 902 mirror.shared_uuid = glob_pfs.ondisk->shared_uuid; 903 904 if (VerboseOpt && DedupCrcStart == 0) { 905 printf("%s %s: objspace %016jx:%04x %016jx:%04x\n", 906 id, filesystem, 907 (uintmax_t)mirror.key_beg.obj_id, 908 mirror.key_beg.localization, 909 (uintmax_t)mirror.key_end.obj_id, 910 mirror.key_end.localization); 911 printf("%s %s: pfs_id %d\n", 912 id, filesystem, glob_pfs.pfs_id); 913 } 914 fflush(stdout); 915 fflush(stderr); 916 917 do { 918 mirror.count = 0; 919 mirror.pfs_id = glob_pfs.pfs_id; 920 mirror.shared_uuid = glob_pfs.ondisk->shared_uuid; 921 if (ioctl(glob_fd, HAMMERIOC_MIRROR_READ, &mirror) < 0) { 922 fprintf(stderr, "Mirror-read %s failed: %s\n", 923 filesystem, strerror(errno)); 924 exit(1); 925 } 926 if (mirror.head.flags & HAMMER_IOC_HEAD_ERROR) { 927 fprintf(stderr, "Mirror-read %s fatal error %d\n", 928 filesystem, mirror.head.error); 929 exit(1); 930 } 931 if (mirror.count) { 932 offset = 0; 933 while (offset < mirror.count) { 934 mrec = (void *)((char *)buf + offset); 935 bytes = HAMMER_HEAD_DOALIGN(mrec->head.rec_size); 936 if (offset + bytes > mirror.count) { 937 fprintf(stderr, "Misaligned record\n"); 938 exit(1); 939 } 940 assert((mrec->head.type & 941 HAMMER_MRECF_TYPE_MASK) == 942 HAMMER_MREC_TYPE_REC); 943 offset += bytes; 944 elm = mrec->rec.leaf; 945 if (elm.base.btype != HAMMER_BTREE_TYPE_RECORD) 946 continue; 947 if (elm.base.rec_type != HAMMER_RECTYPE_DATA) 948 continue; 949 ++DedupCurrentRecords; 950 if (DedupCrcStart != DedupCrcEnd) { 951 if (elm.data_crc < DedupCrcStart) 952 continue; 953 if (DedupCrcEnd && 954 elm.data_crc >= DedupCrcEnd) { 955 continue; 956 } 957 } 958 func(&elm, 0); 959 } 960 } 961 mirror.key_beg = mirror.key_cur; 962 if (DidInterrupt || SigAlrmFlag) { 963 if (VerboseOpt) 964 fprintf(stderr, "%s\n", 965 (DidInterrupt ? "Interrupted" : "Timeout")); 966 hammer_set_cycle(&mirror.key_cur, mirror.tid_beg); 967 if (VerboseOpt) 968 fprintf(stderr, "Cyclefile %s updated for " 969 "continuation\n", CyclePath); 970 exit(1); 971 } 972 if (SigInfoFlag) { 973 if (DedupTotalRecords) { 974 humanize_unsigned(buf1, sizeof(buf1), 975 DedupDataReads, 976 "B", 1024); 977 humanize_unsigned(buf2, sizeof(buf2), 978 dedup_successes_bytes, 979 "B", 1024); 980 fprintf(stderr, "%s count %7jd/%jd " 981 "(%02d.%02d%%) " 982 "ioread %s newddup %s\n", 983 id, 984 (intmax_t)DedupCurrentRecords, 985 (intmax_t)DedupTotalRecords, 986 (int)(DedupCurrentRecords * 100 / 987 DedupTotalRecords), 988 (int)(DedupCurrentRecords * 10000 / 989 DedupTotalRecords % 100), 990 buf1, buf2); 991 } else { 992 fprintf(stderr, "%s count %-7jd\n", 993 id, 994 (intmax_t)DedupCurrentRecords); 995 } 996 SigInfoFlag = 0; 997 } 998 } while (mirror.count != 0); 999 1000 signal(SIGINFO, SIG_IGN); 1001 signal(SIGALRM, SIG_IGN); 1002 1003 free(buf); 1004 } 1005 1006 static void 1007 dump_simulated_dedup(void) 1008 { 1009 struct sim_dedup_entry *sim_de; 1010 1011 printf("=== Dumping simulated dedup entries:\n"); 1012 RB_FOREACH(sim_de, sim_dedup_entry_rb_tree, &sim_dedup_tree) { 1013 printf("\tcrc=%08x cnt=%ju size=%ju\n", 1014 sim_de->crc, 1015 (intmax_t)sim_de->ref_blks, 1016 (intmax_t)sim_de->ref_size); 1017 } 1018 printf("end of dump ===\n"); 1019 } 1020 1021 static void 1022 dump_real_dedup(void) 1023 { 1024 struct dedup_entry *de; 1025 struct sha_dedup_entry *sha_de; 1026 int i; 1027 1028 printf("=== Dumping dedup entries:\n"); 1029 RB_FOREACH(de, dedup_entry_rb_tree, &dedup_tree) { 1030 if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) { 1031 printf("\tcrc=%08x fictitious\n", de->leaf.data_crc); 1032 1033 RB_FOREACH(sha_de, sha_dedup_entry_rb_tree, &de->u.fict_root) { 1034 printf("\t\tcrc=%08x cnt=%ju size=%ju\n\t" 1035 "\t\tsha=", 1036 sha_de->leaf.data_crc, 1037 (intmax_t)sha_de->ref_blks, 1038 (intmax_t)sha_de->ref_size); 1039 for (i = 0; i < SHA256_DIGEST_LENGTH; ++i) 1040 printf("%02x", sha_de->sha_hash[i]); 1041 printf("\n"); 1042 } 1043 } else { 1044 printf("\tcrc=%08x cnt=%ju size=%ju\n", 1045 de->leaf.data_crc, 1046 (intmax_t)de->u.de.ref_blks, 1047 (intmax_t)de->u.de.ref_size); 1048 } 1049 } 1050 printf("end of dump ===\n"); 1051 } 1052 1053 static void 1054 dedup_usage(int code) 1055 { 1056 fprintf(stderr, 1057 "hammer dedup-simulate <filesystem>\n" 1058 "hammer dedup <filesystem>\n" 1059 ); 1060 exit(code); 1061 } 1062