/*
 * Copyright (c) 2010 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Ilya Dryomov <idryomov@gmail.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "hammer.h"
#include <libutil.h>
#include <crypto/sha2/sha2.h>

#define DEDUP_BUF (64 * 1024)

/* Sorted list of block CRCs - light version for dedup-simulate */
struct sim_dedup_entry_rb_tree;
RB_HEAD(sim_dedup_entry_rb_tree, sim_dedup_entry) sim_dedup_tree =
					RB_INITIALIZER(&sim_dedup_tree);
RB_PROTOTYPE2(sim_dedup_entry_rb_tree, sim_dedup_entry, rb_entry,
		rb_sim_dedup_entry_compare, hammer_crc_t);

struct sim_dedup_entry {
	hammer_crc_t	crc;
	u_int64_t	ref_blks; /* number of blocks referenced */
	u_int64_t	ref_size; /* size of data referenced */
	RB_ENTRY(sim_dedup_entry) rb_entry;
};

/* Sorted list of HAMMER B-Tree keys */
struct dedup_entry_rb_tree;
struct sha_dedup_entry_rb_tree;

RB_HEAD(dedup_entry_rb_tree, dedup_entry) dedup_tree =
					RB_INITIALIZER(&dedup_tree);
RB_PROTOTYPE2(dedup_entry_rb_tree, dedup_entry, rb_entry,
		rb_dedup_entry_compare, hammer_crc_t);

RB_PROTOTYPE(sha_dedup_entry_rb_tree, sha_dedup_entry, fict_entry,
		rb_sha_dedup_entry_compare);

struct dedup_entry {
	struct hammer_btree_leaf_elm leaf;
	union {
		struct {
			u_int64_t ref_blks;
			u_int64_t ref_size;
		} de;
		RB_HEAD(sha_dedup_entry_rb_tree, sha_dedup_entry) fict_root;
	} u;
	u_int8_t flags;
	RB_ENTRY(dedup_entry) rb_entry;
};

#define HAMMER_DEDUP_ENTRY_FICTITIOUS	0x0001

struct sha_dedup_entry {
	struct hammer_btree_leaf_elm	leaf;
	u_int64_t			ref_blks;
	u_int64_t			ref_size;
	u_int8_t			sha_hash[SHA256_DIGEST_LENGTH];
	RB_ENTRY(sha_dedup_entry)	fict_entry;
};

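/*
 * The structures above form a two-level lookup: the outer RB tree
 * (dedup_tree) is keyed on the 32-bit data CRC.  Once a CRC collision is
 * detected the corresponding dedup_entry is marked FICTITIOUS and its
 * union switches from the per-CRC counters to fict_root, an inner RB tree
 * of sha_dedup_entry nodes keyed on the SHA-256 of the block data.
 */
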
/*
 * Pass2 list - contains entries that were not dedup'ed because the ioctl
 * failed
 */
STAILQ_HEAD(, pass2_dedup_entry) pass2_dedup_queue =
				STAILQ_HEAD_INITIALIZER(pass2_dedup_queue);

struct pass2_dedup_entry {
	struct hammer_btree_leaf_elm	leaf;
	STAILQ_ENTRY(pass2_dedup_entry)	sq_entry;
};

#define DEDUP_PASS2	0x0001 /* process_btree_elm() mode */

/* PFS globals - we deal with just one PFS per run */
int glob_fd;
struct hammer_ioc_pseudofs_rw glob_pfs;

/*
 * Global accounting variables
 *
 * The last three don't strictly need to be 64-bit; they are just to be safe.
 */
u_int64_t dedup_alloc_size = 0;
u_int64_t dedup_ref_size = 0;
u_int64_t dedup_skipped_size = 0;
u_int64_t dedup_crc_failures = 0;
u_int64_t dedup_sha_failures = 0;
u_int64_t dedup_underflows = 0;

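/*
 * Accounting sketch (illustrative numbers, not from a real run): if a
 * dedup-simulate scan sees three 64KB data records that all carry the same
 * CRC, ref_size accumulates 3 * 64KB while alloc_size accumulates only
 * 64KB (ref_size / ref_blks), so the reported dedup ratio is 3.00.
 * Records that could not be processed are tracked in dedup_skipped_size
 * instead.
 */
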
static int rb_sim_dedup_entry_compare(struct sim_dedup_entry *sim_de1,
				struct sim_dedup_entry *sim_de2);
static int rb_dedup_entry_compare(struct dedup_entry *de1,
				struct dedup_entry *de2);
static int rb_sha_dedup_entry_compare(struct sha_dedup_entry *sha_de1,
				struct sha_dedup_entry *sha_de2);
typedef int (*scan_pfs_cb_t)(hammer_btree_leaf_elm_t scan_leaf, int flags);
static void scan_pfs(char *filesystem, scan_pfs_cb_t func);
static int collect_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags);
static int process_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags);
static int upgrade_chksum(hammer_btree_leaf_elm_t leaf, u_int8_t *sha_hash);
static void dump_simulated_dedup(void);
static void dump_real_dedup(void);
static void dedup_usage(int code);

RB_GENERATE2(sim_dedup_entry_rb_tree, sim_dedup_entry, rb_entry,
		rb_sim_dedup_entry_compare, hammer_crc_t, crc);
RB_GENERATE2(dedup_entry_rb_tree, dedup_entry, rb_entry,
		rb_dedup_entry_compare, hammer_crc_t, leaf.data_crc);
RB_GENERATE(sha_dedup_entry_rb_tree, sha_dedup_entry, fict_entry,
		rb_sha_dedup_entry_compare);

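/*
 * Note: the RB_PROTOTYPE2()/RB_GENERATE2() variants also emit an
 * RB_LOOKUP() helper keyed on the named field (crc / leaf.data_crc),
 * which is what collect_btree_elm() and process_btree_elm() use for
 * direct CRC lookups below.
 */
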
static int
rb_sim_dedup_entry_compare(struct sim_dedup_entry *sim_de1,
			struct sim_dedup_entry *sim_de2)
{
	if (sim_de1->crc < sim_de2->crc)
		return (-1);
	if (sim_de1->crc > sim_de2->crc)
		return (1);

	return (0);
}

static int
rb_dedup_entry_compare(struct dedup_entry *de1, struct dedup_entry *de2)
{
	if (de1->leaf.data_crc < de2->leaf.data_crc)
		return (-1);
	if (de1->leaf.data_crc > de2->leaf.data_crc)
		return (1);

	return (0);
}

static int
rb_sha_dedup_entry_compare(struct sha_dedup_entry *sha_de1,
			struct sha_dedup_entry *sha_de2)
{
	unsigned long *h1 = (unsigned long *)&sha_de1->sha_hash;
	unsigned long *h2 = (unsigned long *)&sha_de2->sha_hash;
	int i;

	for (i = 0; i < SHA256_DIGEST_LENGTH / (int)sizeof(unsigned long); ++i) {
		if (h1[i] < h2[i])
			return (-1);
		if (h1[i] > h2[i])
			return (1);
	}

	return (0);
}

/*
 * dedup-simulate <filesystem>
 */
void
hammer_cmd_dedup_simulate(char **av, int ac)
{
	struct sim_dedup_entry *sim_de;

	if (ac != 1)
		dedup_usage(1);

	glob_fd = getpfs(&glob_pfs, av[0]);

	printf("Dedup-simulate ");
	scan_pfs(av[0], &collect_btree_elm);
	printf("Dedup-simulate %s succeeded\n", av[0]);

	relpfs(glob_fd, &glob_pfs);

	if (VerboseOpt >= 2)
		dump_simulated_dedup();

	/*
	 * Calculate simulated dedup ratio and get rid of the tree
	 */
	while ((sim_de = RB_ROOT(&sim_dedup_tree)) != NULL) {
		assert(sim_de->ref_blks != 0);
		dedup_ref_size += sim_de->ref_size;
		dedup_alloc_size += sim_de->ref_size / sim_de->ref_blks;

		RB_REMOVE(sim_dedup_entry_rb_tree, &sim_dedup_tree, sim_de);
		free(sim_de);
	}

	printf("Simulated dedup ratio = %.2f\n",
	    (double)dedup_ref_size / dedup_alloc_size);
}

/*
 * dedup <filesystem>
 */
void
hammer_cmd_dedup(char **av, int ac)
{
	struct dedup_entry *de;
	struct sha_dedup_entry *sha_de;
	struct pass2_dedup_entry *pass2_de;
	char buf[8];

	if (TimeoutOpt > 0)
		alarm(TimeoutOpt);

	if (ac != 1)
		dedup_usage(1);

	STAILQ_INIT(&pass2_dedup_queue);

	glob_fd = getpfs(&glob_pfs, av[0]);

	printf("Dedup ");
	scan_pfs(av[0], &process_btree_elm);

	while ((pass2_de = STAILQ_FIRST(&pass2_dedup_queue)) != NULL) {
		if (process_btree_elm(&pass2_de->leaf, DEDUP_PASS2))
			dedup_skipped_size -= pass2_de->leaf.data_len;

		STAILQ_REMOVE_HEAD(&pass2_dedup_queue, sq_entry);
		free(pass2_de);
	}
	assert(STAILQ_EMPTY(&pass2_dedup_queue));

	printf("Dedup %s succeeded\n", av[0]);

	relpfs(glob_fd, &glob_pfs);

	if (VerboseOpt >= 2)
		dump_real_dedup();

	/*
	 * Calculate dedup ratio and get rid of the trees
	 */
	while ((de = RB_ROOT(&dedup_tree)) != NULL) {
		if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
			while ((sha_de = RB_ROOT(&de->u.fict_root)) != NULL) {
				assert(sha_de->ref_blks != 0);
				dedup_ref_size += sha_de->ref_size;
				dedup_alloc_size += sha_de->ref_size / sha_de->ref_blks;

				RB_REMOVE(sha_dedup_entry_rb_tree,
						&de->u.fict_root, sha_de);
				free(sha_de);
			}
			assert(RB_EMPTY(&de->u.fict_root));
		} else {
			assert(de->u.de.ref_blks != 0);
			dedup_ref_size += de->u.de.ref_size;
			dedup_alloc_size += de->u.de.ref_size / de->u.de.ref_blks;
		}

		RB_REMOVE(dedup_entry_rb_tree, &dedup_tree, de);
		free(de);
	}
	assert(RB_EMPTY(&dedup_tree));

	assert(dedup_alloc_size != 0);
	humanize_unsigned(buf, sizeof(buf), dedup_ref_size, "B", 1024);
	printf("Dedup ratio = %.2f\n"
	       "    %8s referenced\n",
	       (double)dedup_ref_size / dedup_alloc_size,
	       buf
	);
	humanize_unsigned(buf, sizeof(buf), dedup_alloc_size, "B", 1024);
	printf("    %8s allocated\n", buf);
	humanize_unsigned(buf, sizeof(buf), dedup_skipped_size, "B", 1024);
	printf("    %8s skipped\n", buf);
	printf("    %8jd CRC collisions\n"
	       "    %8jd SHA collisions\n"
	       "    %8jd bigblock underflows\n",
	       (intmax_t)dedup_crc_failures,
	       (intmax_t)dedup_sha_failures,
	       (intmax_t)dedup_underflows
	);
}

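/*
 * Simulation pass callback: bump the per-CRC reference counters in the
 * lightweight sim_dedup_tree for every DATA record handed in by scan_pfs().
 */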
static int
collect_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags __unused)
{
	struct sim_dedup_entry *sim_de;

	sim_de = RB_LOOKUP(sim_dedup_entry_rb_tree, &sim_dedup_tree, scan_leaf->data_crc);

	if (sim_de == NULL) {
		sim_de = calloc(sizeof(*sim_de), 1);
		sim_de->crc = scan_leaf->data_crc;
		RB_INSERT(sim_dedup_entry_rb_tree, &sim_dedup_tree, sim_de);
	}

	sim_de->ref_blks += 1;
	sim_de->ref_size += scan_leaf->data_len;
	return (1);
}

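/*
 * Two records may only be dedup'ed against each other if they live in the
 * same blockmap zone and have the same data length.  Returns 0 if the pair
 * is acceptable, non-zero otherwise.
 */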
static __inline int
validate_dedup_pair(hammer_btree_leaf_elm_t p, hammer_btree_leaf_elm_t q)
{
	if ((p->data_offset & HAMMER_OFF_ZONE_MASK) !=
	    (q->data_offset & HAMMER_OFF_ZONE_MASK)) {
		return (1);
	}
	if (p->data_len != q->data_len) {
		return (1);
	}

	return (0);
}

#define DEDUP_TECH_FAILURE	1
#define DEDUP_CMP_FAILURE	2
#define DEDUP_INVALID_ZONE	3
#define DEDUP_UNDERFLOW		4
#define DEDUP_VERS_FAILURE	5

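/*
 * Ask the kernel to dedup the two records (HAMMERIOC_DEDUP with elm1 = p,
 * elm2 = q).  Returns 0 on success or one of the DEDUP_* codes above,
 * translated from errno and the ioctl result flags.
 */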
static __inline int
deduplicate(hammer_btree_leaf_elm_t p, hammer_btree_leaf_elm_t q)
{
	struct hammer_ioc_dedup dedup;

	bzero(&dedup, sizeof(dedup));

	/*
	 * If the data_offset fields are the same there is no need to run the
	 * ioctl, the candidate is already dedup'ed.
	 */
	if (p->data_offset == q->data_offset) {
		return (0);
	}

	dedup.elm1 = p->base;
	dedup.elm2 = q->base;
	RunningIoctl = 1;
	if (ioctl(glob_fd, HAMMERIOC_DEDUP, &dedup) < 0) {
		/* Don't leave RunningIoctl set on the error returns */
		RunningIoctl = 0;
		if (errno == EOPNOTSUPP) {
			/* must be at least version 5 */
			return (DEDUP_VERS_FAILURE);
		}
		/* Technical failure - locking or whatever */
		return (DEDUP_TECH_FAILURE);
	}
	RunningIoctl = 0;
	if (dedup.head.flags & HAMMER_IOC_DEDUP_CMP_FAILURE) {
		return (DEDUP_CMP_FAILURE);
	}
	if (dedup.head.flags & HAMMER_IOC_DEDUP_INVALID_ZONE) {
		return (DEDUP_INVALID_ZONE);
	}
	if (dedup.head.flags & HAMMER_IOC_DEDUP_UNDERFLOW) {
		return (DEDUP_UNDERFLOW);
	}

	return (0);
}

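/*
 * Dedup pass callback.  Each incoming DATA record is looked up by CRC in
 * dedup_tree (and, for fictitious entries, by SHA-256 in the per-CRC
 * subtree).  A match is validated and handed to deduplicate(); failures
 * either degrade the CRC entry to a SHA subtree, queue the record for a
 * second pass, or fall through to the early-termination accounting.
 * Returns non-zero if the record was accounted (stats upgraded or queued
 * for pass2), 0 on early termination.
 */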
static int
process_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags)
{
	struct dedup_entry *de;
	struct sha_dedup_entry *sha_de, temp;
	struct pass2_dedup_entry *pass2_de;
	int error;

	de = RB_LOOKUP(dedup_entry_rb_tree, &dedup_tree, scan_leaf->data_crc);
	if (de == NULL) {
		/*
		 * Insert new entry into CRC tree
		 */
		de = calloc(sizeof(*de), 1);
		de->leaf = *scan_leaf;
		RB_INSERT(dedup_entry_rb_tree, &dedup_tree, de);
		goto upgrade_stats;
	}

	/*
	 * Found entry in CRC tree
	 */
	if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
		/*
		 * Entry in CRC tree is fictitious, so we already had problems
		 * with this CRC. Upgrade (compute SHA) the candidate and
		 * dive into the SHA subtree. If the upgrade fails insert the
		 * candidate into the Pass2 list (it will be processed later).
		 */
		if (upgrade_chksum(scan_leaf, temp.sha_hash))
			goto pass2_insert;

		sha_de = RB_FIND(sha_dedup_entry_rb_tree, &de->u.fict_root, &temp);
		if (sha_de == NULL) {
			/*
			 * Nothing in the SHA subtree so far, so this is a new
			 * 'dataset'. Insert new entry into the SHA subtree.
			 */
			sha_de = calloc(sizeof(*sha_de), 1);
			sha_de->leaf = *scan_leaf;
			memcpy(sha_de->sha_hash, temp.sha_hash, SHA256_DIGEST_LENGTH);
			RB_INSERT(sha_dedup_entry_rb_tree, &de->u.fict_root, sha_de);
			goto upgrade_stats_sha;
		}

		/*
		 * Found entry in the SHA subtree, which means we have a
		 * potential dedup pair. Validate it (zones have to match and
		 * the data_len fields have to be the same too). If validation
		 * fails, treat it as a SHA collision (jump to sha256_failure).
		 */
		if (validate_dedup_pair(&sha_de->leaf, scan_leaf))
			goto sha256_failure;

		/*
		 * We have a valid dedup pair (SHA match, validated).
		 *
		 * In case of technical failure (dedup pair was good, but
		 * the ioctl failed anyway) insert the candidate into the
		 * Pass2 list (we will try to dedup it after we are done with
		 * the rest of the tree).
		 *
		 * If the ioctl fails because either of the blocks is in a
		 * non-dedup zone (we can dedup only in LARGE_DATA and
		 * SMALL_DATA) don't bother with the candidate and terminate
		 * early.
		 *
		 * If the ioctl fails because of a big-block underflow,
		 * replace the leaf node that the found dedup entry
		 * represents with scan_leaf.
		 */
		error = deduplicate(&sha_de->leaf, scan_leaf);
		switch(error) {
		case 0:
			goto upgrade_stats_sha;
		case DEDUP_TECH_FAILURE:
			goto pass2_insert;
		case DEDUP_CMP_FAILURE:
			goto sha256_failure;
		case DEDUP_INVALID_ZONE:
			goto terminate_early;
		case DEDUP_UNDERFLOW:
			++dedup_underflows;
			sha_de->leaf = *scan_leaf;
			memcpy(sha_de->sha_hash, temp.sha_hash, SHA256_DIGEST_LENGTH);
			goto upgrade_stats_sha;
		case DEDUP_VERS_FAILURE:
			fprintf(stderr,
				"HAMMER filesystem must be at least "
				"version 5 to dedup\n");
			exit (1);
		default:
			fprintf(stderr, "Unknown error\n");
			goto terminate_early;
		}

		/*
		 * Ooh la la.. SHA-256 collision. Terminate early, there's
		 * nothing we can do here.
		 */
sha256_failure:
		++dedup_sha_failures;
		goto terminate_early;
	} else {
		/*
		 * Candidate CRC is good for now (we found an entry in the CRC
		 * tree and it's not fictitious). This means we have a
		 * potential dedup pair.
		 */
		if (validate_dedup_pair(&de->leaf, scan_leaf))
			goto crc_failure;

		/*
		 * We have a valid dedup pair (CRC match, validated)
		 */
		error = deduplicate(&de->leaf, scan_leaf);
		switch(error) {
		case 0:
			goto upgrade_stats;
		case DEDUP_TECH_FAILURE:
			goto pass2_insert;
		case DEDUP_CMP_FAILURE:
			goto crc_failure;
		case DEDUP_INVALID_ZONE:
			goto terminate_early;
		case DEDUP_UNDERFLOW:
			++dedup_underflows;
			de->leaf = *scan_leaf;
			goto upgrade_stats;
		case DEDUP_VERS_FAILURE:
			fprintf(stderr,
				"HAMMER filesystem must be at least "
				"version 5 to dedup\n");
			exit (1);
		default:
			fprintf(stderr, "Unknown error\n");
			goto terminate_early;
		}

crc_failure:
		/*
		 * We got a CRC collision - either the ioctl failed because of
		 * the comparison failure or validation of the potential
		 * dedup pair went bad. In all cases insert both blocks
		 * into the SHA subtree (this requires a checksum upgrade) and
		 * mark the entry that corresponds to this CRC in the CRC tree
		 * fictitious, so that all further operations with this CRC go
		 * through the SHA subtree.
		 */
		++dedup_crc_failures;
		/*
		 * Insert the block that was represented by the now-fictitious
		 * dedup entry (create a new SHA entry and preserve the stats
		 * of the old CRC one). If the checksum upgrade fails insert
		 * the candidate into the Pass2 list and return - keep both
		 * trees unmodified.
		 */
		sha_de = calloc(sizeof(*sha_de), 1);
		sha_de->leaf = de->leaf;
		sha_de->ref_blks = de->u.de.ref_blks;
		sha_de->ref_size = de->u.de.ref_size;
		if (upgrade_chksum(&sha_de->leaf, sha_de->sha_hash)) {
			free(sha_de);
			goto pass2_insert;
		}

		RB_INIT(&de->u.fict_root);
		/*
		 * Here we can insert without prior checking because the tree
		 * is empty at this point
		 */
		RB_INSERT(sha_dedup_entry_rb_tree, &de->u.fict_root, sha_de);

		/*
		 * Mark the entry in the CRC tree fictitious
		 */
		de->flags |= HAMMER_DEDUP_ENTRY_FICTITIOUS;

		/*
		 * Upgrade the checksum of the candidate and insert it into
		 * the SHA subtree. If the upgrade fails insert the candidate
		 * into the Pass2 list.
		 */
		if (upgrade_chksum(scan_leaf, temp.sha_hash)) {
			goto pass2_insert;
		}
		sha_de = RB_FIND(sha_dedup_entry_rb_tree, &de->u.fict_root, &temp);
		if (sha_de != NULL)
			/* There is an entry with this SHA already, but the
			 * only RB-tree element at this point is the entry we
			 * just added. We know for sure these blocks are
			 * different (this is the crc_failure branch) so treat
			 * it as a SHA collision.
			 */
			goto sha256_failure;

		sha_de = calloc(sizeof(*sha_de), 1);
		sha_de->leaf = *scan_leaf;
		memcpy(sha_de->sha_hash, temp.sha_hash, SHA256_DIGEST_LENGTH);
		RB_INSERT(sha_dedup_entry_rb_tree, &de->u.fict_root, sha_de);
		goto upgrade_stats_sha;
	}

upgrade_stats:
	de->u.de.ref_blks += 1;
	de->u.de.ref_size += scan_leaf->data_len;
	return (1);

upgrade_stats_sha:
	sha_de->ref_blks += 1;
	sha_de->ref_size += scan_leaf->data_len;
	return (1);

pass2_insert:
	/*
	 * If in pass2 mode don't insert anything, fall through to
	 * terminate_early
	 */
	if ((flags & DEDUP_PASS2) == 0) {
		pass2_de = calloc(sizeof(*pass2_de), 1);
		pass2_de->leaf = *scan_leaf;
		STAILQ_INSERT_TAIL(&pass2_dedup_queue, pass2_de, sq_entry);
		dedup_skipped_size += scan_leaf->data_len;
		return (1);
	}

terminate_early:
	/*
	 * Early termination path. Fixup stats.
	 */
	dedup_alloc_size += scan_leaf->data_len;
	dedup_ref_size += scan_leaf->data_len;
	return (0);
}

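/*
 * "Upgrade" a record from CRC-level to SHA-level identification: fetch the
 * record's data with HAMMERIOC_GET_DATA and compute its SHA-256 into
 * sha_hash.  Returns 0 on success, non-zero if the data could not be
 * retrieved (callers then fall back to the Pass2 list).
 */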
static int
upgrade_chksum(hammer_btree_leaf_elm_t leaf, u_int8_t *sha_hash)
{
	struct hammer_ioc_data data;
	char *buf = malloc(DEDUP_BUF);
	SHA256_CTX ctx;
	int error;

	bzero(&data, sizeof(data));
	data.elm = leaf->base;
	data.ubuf = buf;
	data.size = DEDUP_BUF;

	error = 0;
	if (ioctl(glob_fd, HAMMERIOC_GET_DATA, &data) < 0) {
		fprintf(stderr, "Get-data failed: %s\n", strerror(errno));
		error = 1;
		goto done;
	}

	if (data.leaf.data_len != leaf->data_len) {
		error = 1;
		goto done;
	}

	if (data.leaf.base.btype == HAMMER_BTREE_TYPE_RECORD &&
	    data.leaf.base.rec_type == HAMMER_RECTYPE_DATA) {
		SHA256_Init(&ctx);
		SHA256_Update(&ctx, (void *)buf, data.leaf.data_len);
		SHA256_Final(sha_hash, &ctx);
	} else {
		/* Not a DATA record - don't hand back an uninitialized hash */
		error = 1;
	}

done:
	free(buf);
	return (error);
}

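/*
 * Walk all B-Tree records of the PFS via HAMMERIOC_MIRROR_READ (keys only,
 * HAMMER_IOC_MIRROR_NODATA) and invoke the callback for every DATA record
 * found.  Used by both the dedup and dedup-simulate passes.
 */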
static void
scan_pfs(char *filesystem, scan_pfs_cb_t func)
{
	struct hammer_ioc_mirror_rw mirror;
	hammer_ioc_mrecord_any_t mrec;
	struct hammer_btree_leaf_elm elm;
	char *buf = malloc(DEDUP_BUF);
	int offset, bytes;

	bzero(&mirror, sizeof(mirror));
	hammer_key_beg_init(&mirror.key_beg);
	hammer_key_end_init(&mirror.key_end);

	mirror.tid_beg = glob_pfs.ondisk->sync_beg_tid;
	mirror.tid_end = glob_pfs.ondisk->sync_end_tid;
	mirror.head.flags |= HAMMER_IOC_MIRROR_NODATA; /* we want only keys */
	mirror.ubuf = buf;
	mirror.size = DEDUP_BUF;
	mirror.pfs_id = glob_pfs.pfs_id;
	mirror.shared_uuid = glob_pfs.ondisk->shared_uuid;

	printf("%s: objspace %016jx:%04x %016jx:%04x pfs_id %d\n",
		filesystem,
		(uintmax_t)mirror.key_beg.obj_id,
		mirror.key_beg.localization,
		(uintmax_t)mirror.key_end.obj_id,
		mirror.key_end.localization,
		glob_pfs.pfs_id);
	fflush(stdout);

	do {
		mirror.count = 0;
		mirror.pfs_id = glob_pfs.pfs_id;
		mirror.shared_uuid = glob_pfs.ondisk->shared_uuid;
		if (ioctl(glob_fd, HAMMERIOC_MIRROR_READ, &mirror) < 0) {
			fprintf(stderr, "Mirror-read %s failed: %s\n",
				filesystem, strerror(errno));
			exit(1);
		}
		if (mirror.head.flags & HAMMER_IOC_HEAD_ERROR) {
			fprintf(stderr, "Mirror-read %s fatal error %d\n",
				filesystem, mirror.head.error);
			exit(1);
		}
		if (mirror.count) {
			offset = 0;
			while (offset < mirror.count) {
				mrec = (void *)((char *)buf + offset);
				bytes = HAMMER_HEAD_DOALIGN(mrec->head.rec_size);
				if (offset + bytes > mirror.count) {
					fprintf(stderr, "Misaligned record\n");
					exit(1);
				}
				assert((mrec->head.type &
				    HAMMER_MRECF_TYPE_MASK) == HAMMER_MREC_TYPE_REC);

				elm = mrec->rec.leaf;
				if (elm.base.btype == HAMMER_BTREE_TYPE_RECORD &&
				    elm.base.rec_type == HAMMER_RECTYPE_DATA) {
					func(&elm, 0);
				}
				offset += bytes;
			}
		}
		mirror.key_beg = mirror.key_cur;
	} while (mirror.count != 0);

	free(buf);
}

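/*
 * Verbose (-vv) debug dumps of the in-memory trees, printed before the
 * trees are torn down for the final ratio calculation.
 */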
static void
dump_simulated_dedup(void)
{
	struct sim_dedup_entry *sim_de;

	printf("=== Dumping simulated dedup entries:\n");
	RB_FOREACH(sim_de, sim_dedup_entry_rb_tree, &sim_dedup_tree) {
		printf("\tcrc=%08x cnt=%ju size=%ju\n",
			sim_de->crc,
			(uintmax_t)sim_de->ref_blks,
			(uintmax_t)sim_de->ref_size);
	}
	printf("end of dump ===\n");
}

static void
dump_real_dedup(void)
{
	struct dedup_entry *de;
	struct sha_dedup_entry *sha_de;
	int i;

	printf("=== Dumping dedup entries:\n");
	RB_FOREACH(de, dedup_entry_rb_tree, &dedup_tree) {
		if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
			printf("\tcrc=%08x fictitious\n", de->leaf.data_crc);

			RB_FOREACH(sha_de, sha_dedup_entry_rb_tree, &de->u.fict_root) {
				printf("\t\tcrc=%08x cnt=%ju size=%ju\n\t"
				       "\t\tsha=",
				       sha_de->leaf.data_crc,
				       (uintmax_t)sha_de->ref_blks,
				       (uintmax_t)sha_de->ref_size);
				for (i = 0; i < SHA256_DIGEST_LENGTH; ++i)
					printf("%02x", sha_de->sha_hash[i]);
				printf("\n");
			}
		} else {
			printf("\tcrc=%08x cnt=%ju size=%ju\n",
			       de->leaf.data_crc,
			       (uintmax_t)de->u.de.ref_blks,
			       (uintmax_t)de->u.de.ref_size);
		}
	}
	printf("end of dump ===\n");
}

static void
dedup_usage(int code)
{
	fprintf(stderr,
		"hammer dedup-simulate <filesystem>\n"
		"hammer dedup <filesystem>\n"
	);
	exit(code);
}