xref: /dragonfly/sbin/hammer/cmd_dedup.c (revision 81c11cd3)
1 /*
2  * Copyright (c) 2010 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Ilya Dryomov <idryomov@gmail.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 
35 #include "hammer.h"
36 #include <libutil.h>
37 #include <crypto/sha2/sha2.h>
38 
39 #define DEDUP_BUF (64 * 1024)
40 
41 /* Sorted list of block CRCs - light version for dedup-simulate */
42 struct sim_dedup_entry_rb_tree;
43 RB_HEAD(sim_dedup_entry_rb_tree, sim_dedup_entry) sim_dedup_tree =
44 					RB_INITIALIZER(&sim_dedup_tree);
45 RB_PROTOTYPE2(sim_dedup_entry_rb_tree, sim_dedup_entry, rb_entry,
46 		rb_sim_dedup_entry_compare, hammer_crc_t);
47 
48 struct sim_dedup_entry {
49 	hammer_crc_t	crc;
50 	u_int64_t	ref_blks; /* number of blocks referenced */
51 	u_int64_t	ref_size; /* size of data referenced */
52 	RB_ENTRY(sim_dedup_entry) rb_entry;
53 };
54 
55 /* Sorted list of HAMMER B-Tree keys */
56 struct dedup_entry_rb_tree;
57 struct sha_dedup_entry_rb_tree;
58 
59 RB_HEAD(dedup_entry_rb_tree, dedup_entry) dedup_tree =
60 					RB_INITIALIZER(&dedup_tree);
61 RB_PROTOTYPE2(dedup_entry_rb_tree, dedup_entry, rb_entry,
62 		rb_dedup_entry_compare, hammer_crc_t);
63 
64 RB_PROTOTYPE(sha_dedup_entry_rb_tree, sha_dedup_entry, fict_entry,
65 		rb_sha_dedup_entry_compare);
66 
67 struct dedup_entry {
68 	struct hammer_btree_leaf_elm leaf;
69 	union {
70 		struct {
71 			u_int64_t ref_blks;
72 			u_int64_t ref_size;
73 		} de;
74 		RB_HEAD(sha_dedup_entry_rb_tree, sha_dedup_entry) fict_root;
75 	} u;
76 	u_int8_t flags;
77 	RB_ENTRY(dedup_entry) rb_entry;
78 };
79 
80 #define HAMMER_DEDUP_ENTRY_FICTITIOUS	0x0001
81 
82 struct sha_dedup_entry {
83 	struct hammer_btree_leaf_elm	leaf;
84 	u_int64_t			ref_blks;
85 	u_int64_t			ref_size;
86 	u_int8_t			sha_hash[SHA256_DIGEST_LENGTH];
87 	RB_ENTRY(sha_dedup_entry)	fict_entry;
88 };
89 
90 /*
91  * Pass2 list - contains entries that were not dedup'ed because ioctl failed
92  */
93 STAILQ_HEAD(, pass2_dedup_entry) pass2_dedup_queue =
94 				STAILQ_HEAD_INITIALIZER(pass2_dedup_queue);
95 
96 struct pass2_dedup_entry {
97 	struct hammer_btree_leaf_elm	leaf;
98 	STAILQ_ENTRY(pass2_dedup_entry)	sq_entry;
99 };
100 
101 #define DEDUP_PASS2	0x0001 /* process_btree_elm() mode */
102 
103 /* PFS global ids - we deal with just one PFS at a run */
104 int glob_fd;
105 struct hammer_ioc_pseudofs_rw glob_pfs;
106 
107 /*
108  * Global accounting variables
109  *
110  * Last three don't have to be 64-bit, just to be safe..
111  */
112 u_int64_t dedup_alloc_size = 0;
113 u_int64_t dedup_ref_size = 0;
114 u_int64_t dedup_skipped_size = 0;
115 u_int64_t dedup_crc_failures = 0;
116 u_int64_t dedup_sha_failures = 0;
117 u_int64_t dedup_underflows = 0;
118 
119 static int rb_sim_dedup_entry_compare(struct sim_dedup_entry *sim_de1,
120 				struct sim_dedup_entry *sim_de2);
121 static int rb_dedup_entry_compare(struct dedup_entry *de1,
122 				struct dedup_entry *de2);
123 static int rb_sha_dedup_entry_compare(struct sha_dedup_entry *sha_de1,
124 				struct sha_dedup_entry *sha_de2);
125 typedef int (*scan_pfs_cb_t)(hammer_btree_leaf_elm_t scan_leaf, int flags);
126 static void scan_pfs(char *filesystem, scan_pfs_cb_t func);
127 static int collect_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags);
128 static int process_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags);
129 static int upgrade_chksum(hammer_btree_leaf_elm_t leaf, u_int8_t *sha_hash);
130 static void dump_simulated_dedup(void);
131 static void dump_real_dedup(void);
132 static void dedup_usage(int code);
133 
134 RB_GENERATE2(sim_dedup_entry_rb_tree, sim_dedup_entry, rb_entry,
135 		rb_sim_dedup_entry_compare, hammer_crc_t, crc);
136 RB_GENERATE2(dedup_entry_rb_tree, dedup_entry, rb_entry,
137 		rb_dedup_entry_compare, hammer_crc_t, leaf.data_crc);
138 RB_GENERATE(sha_dedup_entry_rb_tree, sha_dedup_entry, fict_entry,
139 		rb_sha_dedup_entry_compare);
140 
141 static int
142 rb_sim_dedup_entry_compare(struct sim_dedup_entry *sim_de1,
143 			struct sim_dedup_entry *sim_de2)
144 {
145 	if (sim_de1->crc < sim_de2->crc)
146 		return (-1);
147 	if (sim_de1->crc > sim_de2->crc)
148 		return (1);
149 
150 	return (0);
151 }
152 
153 static int
154 rb_dedup_entry_compare(struct dedup_entry *de1, struct dedup_entry *de2)
155 {
156 	if (de1->leaf.data_crc < de2->leaf.data_crc)
157 		return (-1);
158 	if (de1->leaf.data_crc > de2->leaf.data_crc)
159 		return (1);
160 
161 	return (0);
162 }
163 
164 static int
165 rb_sha_dedup_entry_compare(struct sha_dedup_entry *sha_de1,
166 			struct sha_dedup_entry *sha_de2)
167 {
168 	unsigned long *h1 = (unsigned long *)&sha_de1->sha_hash;
169 	unsigned long *h2 = (unsigned long *)&sha_de2->sha_hash;
170 	int i;
171 
172 	for (i = 0; i < SHA256_DIGEST_LENGTH / (int)sizeof(unsigned long); ++i) {
173 		if (h1[i] < h2[i])
174 			return (-1);
175 		if (h1[i] > h2[i])
176 			return (1);
177 	}
178 
179 	return (0);
180 }
181 
182 /*
183  * dedup-simulate <filesystem>
184  */
185 void
186 hammer_cmd_dedup_simulate(char **av, int ac)
187 {
188 	struct sim_dedup_entry *sim_de;
189 
190 	if (ac != 1)
191 		dedup_usage(1);
192 
193 	glob_fd = getpfs(&glob_pfs, av[0]);
194 
195 	printf("Dedup-simulate ");
196 	scan_pfs(av[0], &collect_btree_elm);
197 	printf("Dedup-simulate %s succeeded\n", av[0]);
198 
199 	relpfs(glob_fd, &glob_pfs);
200 
201 	if (VerboseOpt >= 2)
202 		dump_simulated_dedup();
203 
204 	/*
205 	 * Calculate simulated dedup ratio and get rid of the tree
206 	 */
207 	while ((sim_de = RB_ROOT(&sim_dedup_tree)) != NULL) {
208 		assert(sim_de->ref_blks != 0);
209 		dedup_ref_size += sim_de->ref_size;
210 		dedup_alloc_size += sim_de->ref_size / sim_de->ref_blks;
211 
212 		RB_REMOVE(sim_dedup_entry_rb_tree, &sim_dedup_tree, sim_de);
213 		free(sim_de);
214 	}
215 
216 	printf("Simulated dedup ratio = %.2f\n",
217 	    (dedup_alloc_size != 0) ?
218 		(double)dedup_ref_size / dedup_alloc_size : 0);
219 }
220 
221 /*
222  * dedup <filesystem>
223  */
224 void
225 hammer_cmd_dedup(char **av, int ac)
226 {
227 	struct dedup_entry *de;
228 	struct sha_dedup_entry *sha_de;
229 	struct pass2_dedup_entry *pass2_de;
230 	char buf[8];
231 
232 	if (TimeoutOpt > 0)
233 		alarm(TimeoutOpt);
234 
235 	if (ac != 1)
236 		dedup_usage(1);
237 
238 	STAILQ_INIT(&pass2_dedup_queue);
239 
240 	glob_fd = getpfs(&glob_pfs, av[0]);
241 
242 	printf("Dedup ");
243 	scan_pfs(av[0], &process_btree_elm);
244 
245 	while ((pass2_de = STAILQ_FIRST(&pass2_dedup_queue)) != NULL) {
246 		if (process_btree_elm(&pass2_de->leaf, DEDUP_PASS2))
247 			dedup_skipped_size -= pass2_de->leaf.data_len;
248 
249 		STAILQ_REMOVE_HEAD(&pass2_dedup_queue, sq_entry);
250 		free(pass2_de);
251 	}
252 	assert(STAILQ_EMPTY(&pass2_dedup_queue));
253 
254 	printf("Dedup %s succeeded\n", av[0]);
255 
256 	relpfs(glob_fd, &glob_pfs);
257 
258 	if (VerboseOpt >= 2)
259 		dump_real_dedup();
260 
261 	/*
262 	 * Calculate dedup ratio and get rid of the trees
263 	 */
264 	while ((de = RB_ROOT(&dedup_tree)) != NULL) {
265 		if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
266 			while ((sha_de = RB_ROOT(&de->u.fict_root)) != NULL) {
267 				assert(sha_de->ref_blks != 0);
268 				dedup_ref_size += sha_de->ref_size;
269 				dedup_alloc_size += sha_de->ref_size / sha_de->ref_blks;
270 
271 				RB_REMOVE(sha_dedup_entry_rb_tree,
272 						&de->u.fict_root, sha_de);
273 				free(sha_de);
274 			}
275 			assert(RB_EMPTY(&de->u.fict_root));
276 		} else {
277 			assert(de->u.de.ref_blks != 0);
278 			dedup_ref_size += de->u.de.ref_size;
279 			dedup_alloc_size += de->u.de.ref_size / de->u.de.ref_blks;
280 		}
281 
282 		RB_REMOVE(dedup_entry_rb_tree, &dedup_tree, de);
283 		free(de);
284 	}
285 	assert(RB_EMPTY(&dedup_tree));
286 
287 	humanize_unsigned(buf, sizeof(buf), dedup_ref_size, "B", 1024);
288 	printf("Dedup ratio = %.2f\n"
289 	       "    %8s referenced\n",
290 	       ((dedup_alloc_size != 0) ?
291 			(double)dedup_ref_size / dedup_alloc_size : 0),
292 	       buf
293 	);
294 	humanize_unsigned(buf, sizeof(buf), dedup_alloc_size, "B", 1024);
295 	printf("    %8s allocated\n", buf);
296 	humanize_unsigned(buf, sizeof(buf), dedup_skipped_size, "B", 1024);
297 	printf("    %8s skipped\n", buf);
298 	printf("    %8jd CRC collisions\n"
299 	       "    %8jd SHA collisions\n"
300 	       "    %8jd bigblock underflows\n",
301 	       (intmax_t)dedup_crc_failures,
302 	       (intmax_t)dedup_sha_failures,
303                (intmax_t)dedup_underflows
304 	);
305 }
306 
307 static int
308 collect_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags __unused)
309 {
310 	struct sim_dedup_entry *sim_de;
311 
312 	sim_de = RB_LOOKUP(sim_dedup_entry_rb_tree, &sim_dedup_tree, scan_leaf->data_crc);
313 
314 	if (sim_de == NULL) {
315 		sim_de = calloc(sizeof(*sim_de), 1);
316 		sim_de->crc = scan_leaf->data_crc;
317 		RB_INSERT(sim_dedup_entry_rb_tree, &sim_dedup_tree, sim_de);
318 	}
319 
320 	sim_de->ref_blks += 1;
321 	sim_de->ref_size += scan_leaf->data_len;
322 	return (1);
323 }
324 
325 static __inline int
326 validate_dedup_pair(hammer_btree_leaf_elm_t p, hammer_btree_leaf_elm_t q)
327 {
328 	if ((p->data_offset & HAMMER_OFF_ZONE_MASK) !=
329 	    (q->data_offset & HAMMER_OFF_ZONE_MASK)) {
330 		return (1);
331 	}
332 	if (p->data_len != q->data_len) {
333 		return (1);
334 	}
335 
336 	return (0);
337 }
338 
339 #define DEDUP_TECH_FAILURE	1
340 #define DEDUP_CMP_FAILURE	2
341 #define DEDUP_INVALID_ZONE	3
342 #define DEDUP_UNDERFLOW		4
343 #define DEDUP_VERS_FAILURE	5
344 
345 static __inline int
346 deduplicate(hammer_btree_leaf_elm_t p, hammer_btree_leaf_elm_t q)
347 {
348 	struct hammer_ioc_dedup dedup;
349 
350 	bzero(&dedup, sizeof(dedup));
351 
352 	/*
353 	 * If data_offset fields are the same there is no need to run ioctl,
354 	 * candidate is already dedup'ed.
355 	 */
356 	if (p->data_offset == q->data_offset) {
357 		return (0);
358 	}
359 
360 	dedup.elm1 = p->base;
361 	dedup.elm2 = q->base;
362 	RunningIoctl = 1;
363 	if (ioctl(glob_fd, HAMMERIOC_DEDUP, &dedup) < 0) {
364 		if (errno == EOPNOTSUPP) {
365 			/* must be at least version 5 */
366 			return (DEDUP_VERS_FAILURE);
367 		}
368 		/* Technical failure - locking or w/e */
369 		return (DEDUP_TECH_FAILURE);
370 	}
371 	if (dedup.head.flags & HAMMER_IOC_DEDUP_CMP_FAILURE) {
372 		return (DEDUP_CMP_FAILURE);
373 	}
374 	if (dedup.head.flags & HAMMER_IOC_DEDUP_INVALID_ZONE) {
375 		return (DEDUP_INVALID_ZONE);
376 	}
377 	if (dedup.head.flags & HAMMER_IOC_DEDUP_UNDERFLOW) {
378 		return (DEDUP_UNDERFLOW);
379 	}
380 	RunningIoctl = 0;
381 
382 	return (0);
383 }
384 
385 static int
386 process_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags)
387 {
388 	struct dedup_entry *de;
389 	struct sha_dedup_entry *sha_de, temp;
390 	struct pass2_dedup_entry *pass2_de;
391 	int error;
392 
393 	de = RB_LOOKUP(dedup_entry_rb_tree, &dedup_tree, scan_leaf->data_crc);
394 	if (de == NULL) {
395 		/*
396 		 * Insert new entry into CRC tree
397 		 */
398 		de = calloc(sizeof(*de), 1);
399 		de->leaf = *scan_leaf;
400 		RB_INSERT(dedup_entry_rb_tree, &dedup_tree, de);
401 		goto upgrade_stats;
402 	}
403 
404 	/*
405 	 * Found entry in CRC tree
406 	 */
407 	if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
408 		/*
409 		 * Entry in CRC tree is fictitious, so we already had problems
410 		 * with this CRC. Upgrade (compute SHA) the candidate and
411 		 * dive into SHA subtree. If upgrade fails insert the candidate
412 		 * into Pass2 list (it will be processed later).
413 		 */
414 		if (upgrade_chksum(scan_leaf, temp.sha_hash))
415 			goto pass2_insert;
416 
417 		sha_de = RB_FIND(sha_dedup_entry_rb_tree, &de->u.fict_root, &temp);
418 		if (sha_de == NULL) {
419 			/*
420 			 * Nothing in SHA subtree so far, so this is a new
421 			 * 'dataset'. Insert new entry into SHA subtree.
422 			 */
423 			sha_de = calloc(sizeof(*sha_de), 1);
424 			sha_de->leaf = *scan_leaf;
425 			memcpy(sha_de->sha_hash, temp.sha_hash, SHA256_DIGEST_LENGTH);
426 			RB_INSERT(sha_dedup_entry_rb_tree, &de->u.fict_root, sha_de);
427 			goto upgrade_stats_sha;
428 		}
429 
430 		/*
431 		 * Found entry in SHA subtree, it means we have a potential
432 		 * dedup pair. Validate it (zones have to match and data_len
433 		 * field have to be the same too. If validation fails, treat
434 		 * it as a SHA collision (jump to sha256_failure).
435 		 */
436 		if (validate_dedup_pair(&sha_de->leaf, scan_leaf))
437 			goto sha256_failure;
438 
439 		/*
440 		 * We have a valid dedup pair (SHA match, validated).
441 		 *
442 		 * In case of technical failure (dedup pair was good, but
443 		 * ioctl failed anyways) insert the candidate into Pass2 list
444 		 * (we will try to dedup it after we are done with the rest of
445 		 * the tree).
446 		 *
447 		 * If ioctl fails because either of blocks is in the non-dedup
448 		 * zone (we can dedup only in LARGE_DATA and SMALL_DATA) don't
449 		 * bother with the candidate and terminate early.
450 		 *
451 		 * If ioctl fails because of bigblock underflow replace the
452 		 * leaf node that found dedup entry represents with scan_leaf.
453 		 */
454 		error = deduplicate(&sha_de->leaf, scan_leaf);
455 		switch(error) {
456 		case 0:
457 			goto upgrade_stats_sha;
458 		case DEDUP_TECH_FAILURE:
459 			goto pass2_insert;
460 		case DEDUP_CMP_FAILURE:
461 			goto sha256_failure;
462 		case DEDUP_INVALID_ZONE:
463 			goto terminate_early;
464 		case DEDUP_UNDERFLOW:
465 			++dedup_underflows;
466 			sha_de->leaf = *scan_leaf;
467 			memcpy(sha_de->sha_hash, temp.sha_hash, SHA256_DIGEST_LENGTH);
468 			goto upgrade_stats_sha;
469 		case DEDUP_VERS_FAILURE:
470 			fprintf(stderr,
471 				"HAMMER filesystem must be at least "
472 				"version 5 to dedup\n");
473 			exit (1);
474 		default:
475 			fprintf(stderr, "Unknown error\n");
476 			goto terminate_early;
477 		}
478 
479 		/*
480 		 * Ooh la la.. SHA-256 collision. Terminate early, there's
481 		 * nothing we can do here.
482 		 */
483 sha256_failure:
484 		++dedup_sha_failures;
485 		goto terminate_early;
486 	} else {
487 		/*
488 		 * Candidate CRC is good for now (we found an entry in CRC
489 		 * tree and it's not fictitious). This means we have a
490 		 * potential dedup pair.
491 		 */
492 		if (validate_dedup_pair(&de->leaf, scan_leaf))
493 			goto crc_failure;
494 
495 		/*
496 		 * We have a valid dedup pair (CRC match, validated)
497 		 */
498 		error = deduplicate(&de->leaf, scan_leaf);
499 		switch(error) {
500 		case 0:
501 			goto upgrade_stats;
502 		case DEDUP_TECH_FAILURE:
503 			goto pass2_insert;
504 		case DEDUP_CMP_FAILURE:
505 			goto crc_failure;
506 		case DEDUP_INVALID_ZONE:
507 			goto terminate_early;
508 		case DEDUP_UNDERFLOW:
509 			++dedup_underflows;
510 			de->leaf = *scan_leaf;
511 			goto upgrade_stats;
512 		case DEDUP_VERS_FAILURE:
513 			fprintf(stderr,
514 				"HAMMER filesystem must be at least "
515 				"version 5 to dedup\n");
516 			exit (1);
517 		default:
518 			fprintf(stderr, "Unknown error\n");
519 			goto terminate_early;
520 		}
521 
522 crc_failure:
523 		/*
524 		 * We got a CRC collision - either ioctl failed because of
525 		 * the comparison failure or validation of the potential
526 		 * dedup pair went bad. In all cases insert both blocks
527 		 * into SHA subtree (this requires checksum upgrade) and mark
528 		 * entry that corresponds to this CRC in the CRC tree
529 		 * fictitious, so that all futher operations with this CRC go
530 		 * through SHA subtree.
531 		 */
532 		++dedup_crc_failures;
533 		/*
534 		 * Insert block that was represented by now fictitious dedup
535 		 * entry (create a new SHA entry and preserve stats of the
536 		 * old CRC one). If checksum upgrade fails insert the
537 		 * candidate into Pass2 list and return - keep both trees
538 		 * unmodified.
539 		 */
540 		sha_de = calloc(sizeof(*sha_de), 1);
541 		sha_de->leaf = de->leaf;
542 		sha_de->ref_blks = de->u.de.ref_blks;
543 		sha_de->ref_size = de->u.de.ref_size;
544 		if (upgrade_chksum(&sha_de->leaf, sha_de->sha_hash)) {
545 			free(sha_de);
546 			goto pass2_insert;
547 		}
548 
549 		RB_INIT(&de->u.fict_root);
550 		/*
551 		 * Here we can insert without prior checking because the tree
552 		 * is empty at this point
553 		 */
554 		RB_INSERT(sha_dedup_entry_rb_tree, &de->u.fict_root, sha_de);
555 
556 		/*
557 		 * Mark entry in CRC tree fictitious
558 		 */
559 		de->flags |= HAMMER_DEDUP_ENTRY_FICTITIOUS;
560 
561 		/*
562 		 * Upgrade checksum of the candidate and insert it into
563 		 * SHA subtree. If upgrade fails insert the candidate into
564 		 * Pass2 list.
565 		 */
566 		if (upgrade_chksum(scan_leaf, temp.sha_hash)) {
567 			goto pass2_insert;
568 		}
569 		sha_de = RB_FIND(sha_dedup_entry_rb_tree, &de->u.fict_root, &temp);
570 		if (sha_de != NULL)
571 			/* There is an entry with this SHA already, but the only
572 			 * RB-tree element at this point is that entry we just
573 			 * added. We know for sure these blocks are different
574 			 * (this is crc_failure branch) so treat it as SHA
575 			 * collision.
576 			 */
577 			goto sha256_failure;
578 
579 		sha_de = calloc(sizeof(*sha_de), 1);
580 		sha_de->leaf = *scan_leaf;
581 		memcpy(sha_de->sha_hash, temp.sha_hash, SHA256_DIGEST_LENGTH);
582 		RB_INSERT(sha_dedup_entry_rb_tree, &de->u.fict_root, sha_de);
583 		goto upgrade_stats_sha;
584 	}
585 
586 upgrade_stats:
587 	de->u.de.ref_blks += 1;
588 	de->u.de.ref_size += scan_leaf->data_len;
589 	return (1);
590 
591 upgrade_stats_sha:
592 	sha_de->ref_blks += 1;
593 	sha_de->ref_size += scan_leaf->data_len;
594 	return (1);
595 
596 pass2_insert:
597 	/*
598 	 * If in pass2 mode don't insert anything, fall through to
599 	 * terminate_early
600 	 */
601 	if ((flags & DEDUP_PASS2) == 0) {
602 		pass2_de = calloc(sizeof(*pass2_de), 1);
603 		pass2_de->leaf = *scan_leaf;
604 		STAILQ_INSERT_TAIL(&pass2_dedup_queue, pass2_de, sq_entry);
605 		dedup_skipped_size += scan_leaf->data_len;
606 		return (1);
607 	}
608 
609 terminate_early:
610 	/*
611 	 * Early termination path. Fixup stats.
612 	 */
613 	dedup_alloc_size += scan_leaf->data_len;
614 	dedup_ref_size += scan_leaf->data_len;
615 	return (0);
616 }
617 
618 static int
619 upgrade_chksum(hammer_btree_leaf_elm_t leaf, u_int8_t *sha_hash)
620 {
621 	struct hammer_ioc_data data;
622 	char *buf = malloc(DEDUP_BUF);
623 	SHA256_CTX ctx;
624 	int error;
625 
626 	bzero(&data, sizeof(data));
627 	data.elm = leaf->base;
628 	data.ubuf = buf;
629 	data.size = DEDUP_BUF;
630 
631 	error = 0;
632 	if (ioctl(glob_fd, HAMMERIOC_GET_DATA, &data) < 0) {
633 		fprintf(stderr, "Get-data failed: %s\n", strerror(errno));
634 		error = 1;
635 		goto done;
636 	}
637 
638 	if (data.leaf.data_len != leaf->data_len) {
639 		error = 1;
640 		goto done;
641 	}
642 
643 	if (data.leaf.base.btype == HAMMER_BTREE_TYPE_RECORD &&
644 	    data.leaf.base.rec_type == HAMMER_RECTYPE_DATA) {
645 		SHA256_Init(&ctx);
646 		SHA256_Update(&ctx, (void *)buf, data.leaf.data_len);
647 		SHA256_Final(sha_hash, &ctx);
648 	}
649 
650 done:
651 	free(buf);
652 	return (error);
653 }
654 
655 static void
656 scan_pfs(char *filesystem, scan_pfs_cb_t func)
657 {
658 	struct hammer_ioc_mirror_rw mirror;
659 	hammer_ioc_mrecord_any_t mrec;
660 	struct hammer_btree_leaf_elm elm;
661 	char *buf = malloc(DEDUP_BUF);
662 	int offset, bytes;
663 
664 	bzero(&mirror, sizeof(mirror));
665 	hammer_key_beg_init(&mirror.key_beg);
666 	hammer_key_end_init(&mirror.key_end);
667 
668 	mirror.tid_beg = glob_pfs.ondisk->sync_beg_tid;
669 	mirror.tid_end = glob_pfs.ondisk->sync_end_tid;
670 	mirror.head.flags |= HAMMER_IOC_MIRROR_NODATA; /* we want only keys */
671 	mirror.ubuf = buf;
672 	mirror.size = DEDUP_BUF;
673 	mirror.pfs_id = glob_pfs.pfs_id;
674 	mirror.shared_uuid = glob_pfs.ondisk->shared_uuid;
675 
676 	printf("%s: objspace %016jx:%04x %016jx:%04x pfs_id %d\n",
677 		filesystem,
678 		(uintmax_t)mirror.key_beg.obj_id,
679 		mirror.key_beg.localization,
680 		(uintmax_t)mirror.key_end.obj_id,
681 		mirror.key_end.localization,
682 		glob_pfs.pfs_id);
683 	fflush(stdout);
684 
685 	do {
686 		mirror.count = 0;
687 		mirror.pfs_id = glob_pfs.pfs_id;
688 		mirror.shared_uuid = glob_pfs.ondisk->shared_uuid;
689 		if (ioctl(glob_fd, HAMMERIOC_MIRROR_READ, &mirror) < 0) {
690 			fprintf(stderr, "Mirror-read %s failed: %s\n",
691 				filesystem, strerror(errno));
692 			exit(1);
693 		}
694 		if (mirror.head.flags & HAMMER_IOC_HEAD_ERROR) {
695 			fprintf(stderr, "Mirror-read %s fatal error %d\n",
696 				filesystem, mirror.head.error);
697 			exit(1);
698 		}
699 		if (mirror.count) {
700 			offset = 0;
701 			while (offset < mirror.count) {
702 				mrec = (void *)((char *)buf + offset);
703 				bytes = HAMMER_HEAD_DOALIGN(mrec->head.rec_size);
704 				if (offset + bytes > mirror.count) {
705 					fprintf(stderr, "Misaligned record\n");
706 					exit(1);
707 				}
708 				assert((mrec->head.type &
709 				    HAMMER_MRECF_TYPE_MASK) == HAMMER_MREC_TYPE_REC);
710 
711 				elm = mrec->rec.leaf;
712 				if (elm.base.btype == HAMMER_BTREE_TYPE_RECORD &&
713 				    elm.base.rec_type == HAMMER_RECTYPE_DATA) {
714 					func(&elm, 0);
715 				}
716 				offset += bytes;
717 			}
718 		}
719 		mirror.key_beg = mirror.key_cur;
720 	} while (mirror.count != 0);
721 
722 	free(buf);
723 }
724 
725 static void
726 dump_simulated_dedup(void)
727 {
728 	struct sim_dedup_entry *sim_de;
729 
730 	printf("=== Dumping simulated dedup entries:\n");
731 	RB_FOREACH(sim_de, sim_dedup_entry_rb_tree, &sim_dedup_tree) {
732 		printf("\tcrc=%08x cnt=%ju size=%ju\n",
733 			sim_de->crc,
734 			(intmax_t)sim_de->ref_blks,
735 			(intmax_t)sim_de->ref_size);
736 	}
737 	printf("end of dump ===\n");
738 }
739 
740 static void
741 dump_real_dedup(void)
742 {
743 	struct dedup_entry *de;
744 	struct sha_dedup_entry *sha_de;
745 	int i;
746 
747 	printf("=== Dumping dedup entries:\n");
748 	RB_FOREACH(de, dedup_entry_rb_tree, &dedup_tree) {
749 		if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
750 			printf("\tcrc=%08x fictitious\n", de->leaf.data_crc);
751 
752 			RB_FOREACH(sha_de, sha_dedup_entry_rb_tree, &de->u.fict_root) {
753 				printf("\t\tcrc=%08x cnt=%ju size=%ju\n\t"
754 				       "\t\tsha=",
755 				       sha_de->leaf.data_crc,
756 				       (intmax_t)sha_de->ref_blks,
757 				       (intmax_t)sha_de->ref_size);
758 				for (i = 0; i < SHA256_DIGEST_LENGTH; ++i)
759 					printf("%02x", sha_de->sha_hash[i]);
760 				printf("\n");
761 			}
762 		} else {
763 			printf("\tcrc=%08x cnt=%ju size=%ju\n",
764 			       de->leaf.data_crc,
765 			       (intmax_t)de->u.de.ref_blks,
766 			       (intmax_t)de->u.de.ref_size);
767 		}
768 	}
769 	printf("end of dump ===\n");
770 }
771 
772 static void
773 dedup_usage(int code)
774 {
775 	fprintf(stderr,
776 		"hammer dedup-simulate <filesystem>\n"
777 		"hammer dedup <filesystem>\n"
778 	);
779 	exit(code);
780 }
781