xref: /dragonfly/sbin/hammer/cmd_dedup.c (revision 896f2e3a)
1 /*
2  * Copyright (c) 2010 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Ilya Dryomov <idryomov@gmail.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 
35 #include "hammer.h"
36 #include <libutil.h>
37 #include <crypto/sha2/sha2.h>
38 
39 #define DEDUP_BUF (64 * 1024)
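
/*
 * Overview of the flow implemented below:
 *
 *  - scan_pfs() walks the PFS B-Tree with HAMMERIOC_MIRROR_READ in keys-only
 *    mode and invokes a per-leaf callback for every DATA record.
 *  - "dedup-simulate" only groups leaves by data_crc (collect_btree_elm) and
 *    reports the dedup ratio a real run could achieve.
 *  - "dedup" groups leaves by data_crc, disambiguates CRC collisions with
 *    SHA-256 sub-trees and issues HAMMERIOC_DEDUP for matching pairs
 *    (process_btree_elm / deduplicate).
 *  - Passes are memory limited: when MemoryLimit is exceeded the CRC range
 *    is narrowed and the remaining range is handled by later passes.
 */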
40 
41 /* Sorted list of block CRCs - light version for dedup-simulate */
42 struct sim_dedup_entry_rb_tree;
43 RB_HEAD(sim_dedup_entry_rb_tree, sim_dedup_entry) sim_dedup_tree =
44 					RB_INITIALIZER(&sim_dedup_tree);
45 RB_PROTOTYPE2(sim_dedup_entry_rb_tree, sim_dedup_entry, rb_entry,
46 		rb_sim_dedup_entry_compare, hammer_crc_t);
47 
48 struct sim_dedup_entry {
49 	hammer_crc_t	crc;
50 	u_int64_t	ref_blks; /* number of blocks referenced */
51 	u_int64_t	ref_size; /* size of data referenced */
52 	RB_ENTRY(sim_dedup_entry) rb_entry;
53 };
54 
55 struct dedup_entry {
56 	struct hammer_btree_leaf_elm leaf;
57 	union {
58 		struct {
59 			u_int64_t ref_blks;
60 			u_int64_t ref_size;
61 		} de;
62 		RB_HEAD(sha_dedup_entry_rb_tree, sha_dedup_entry) fict_root;
63 	} u;
64 	u_int8_t flags;
65 	RB_ENTRY(dedup_entry) rb_entry;
66 };
67 
68 #define HAMMER_DEDUP_ENTRY_FICTITIOUS	0x0001
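
/*
 * A dedup_entry with HAMMER_DEDUP_ENTRY_FICTITIOUS set no longer carries its
 * own ref_blks/ref_size counters; u.fict_root then holds a SHA-256 keyed
 * sub-tree of sha_dedup_entry's used to disambiguate leaves whose CRCs
 * collide.
 */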
69 
70 struct sha_dedup_entry {
71 	struct hammer_btree_leaf_elm	leaf;
72 	u_int64_t			ref_blks;
73 	u_int64_t			ref_size;
74 	u_int8_t			sha_hash[SHA256_DIGEST_LENGTH];
75 	RB_ENTRY(sha_dedup_entry)	fict_entry;
76 };
77 
78 /* Sorted list of HAMMER B-Tree keys */
79 struct dedup_entry_rb_tree;
80 struct sha_dedup_entry_rb_tree;
81 
82 RB_HEAD(dedup_entry_rb_tree, dedup_entry) dedup_tree =
83 					RB_INITIALIZER(&dedup_tree);
84 RB_PROTOTYPE2(dedup_entry_rb_tree, dedup_entry, rb_entry,
85 		rb_dedup_entry_compare, hammer_crc_t);
86 
87 RB_PROTOTYPE(sha_dedup_entry_rb_tree, sha_dedup_entry, fict_entry,
88 		rb_sha_dedup_entry_compare);
89 
90 /*
91  * Pass2 list - contains entries that were not dedup'ed because ioctl failed
92  */
93 STAILQ_HEAD(, pass2_dedup_entry) pass2_dedup_queue =
94 				STAILQ_HEAD_INITIALIZER(pass2_dedup_queue);
95 
96 struct pass2_dedup_entry {
97 	struct hammer_btree_leaf_elm	leaf;
98 	STAILQ_ENTRY(pass2_dedup_entry)	sq_entry;
99 };
100 
101 #define DEDUP_PASS2	0x0001 /* process_btree_elm() mode */
102 
103 static int SigInfoFlag;
104 static int SigAlrmFlag;
105 static int64_t DedupDataReads;
106 static int64_t DedupCurrentRecords;
107 static int64_t DedupTotalRecords;
108 static u_int32_t DedupCrcStart;
109 static u_int32_t DedupCrcEnd;
110 static u_int64_t MemoryUse;
111 
112 /* PFS globals - we deal with just one PFS per run */
113 int glob_fd;
114 struct hammer_ioc_pseudofs_rw glob_pfs;
115 
116 /*
117  * Global accounting variables
118  *
119  * Not all of these strictly need to be 64-bit, but use 64-bit just to be safe.
120  */
121 u_int64_t dedup_alloc_size = 0;
122 u_int64_t dedup_ref_size = 0;
123 u_int64_t dedup_skipped_size = 0;
124 u_int64_t dedup_crc_failures = 0;
125 u_int64_t dedup_sha_failures = 0;
126 u_int64_t dedup_underflows = 0;
127 u_int64_t dedup_successes_count = 0;
128 u_int64_t dedup_successes_bytes = 0;
129 
130 static int rb_sim_dedup_entry_compare(struct sim_dedup_entry *sim_de1,
131 				struct sim_dedup_entry *sim_de2);
132 static int rb_dedup_entry_compare(struct dedup_entry *de1,
133 				struct dedup_entry *de2);
134 static int rb_sha_dedup_entry_compare(struct sha_dedup_entry *sha_de1,
135 				struct sha_dedup_entry *sha_de2);
136 typedef int (*scan_pfs_cb_t)(hammer_btree_leaf_elm_t scan_leaf, int flags);
137 static void scan_pfs(char *filesystem, scan_pfs_cb_t func, const char *id);
138 static int collect_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags);
139 static int count_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags);
140 static int process_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags);
141 static int upgrade_chksum(hammer_btree_leaf_elm_t leaf, u_int8_t *sha_hash);
142 static void dump_simulated_dedup(void);
143 static void dump_real_dedup(void);
144 static void dedup_usage(int code);
145 
146 RB_GENERATE2(sim_dedup_entry_rb_tree, sim_dedup_entry, rb_entry,
147 		rb_sim_dedup_entry_compare, hammer_crc_t, crc);
148 RB_GENERATE2(dedup_entry_rb_tree, dedup_entry, rb_entry,
149 		rb_dedup_entry_compare, hammer_crc_t, leaf.data_crc);
150 RB_GENERATE(sha_dedup_entry_rb_tree, sha_dedup_entry, fict_entry,
151 		rb_sha_dedup_entry_compare);
152 
153 static int
154 rb_sim_dedup_entry_compare(struct sim_dedup_entry *sim_de1,
155 			struct sim_dedup_entry *sim_de2)
156 {
157 	if (sim_de1->crc < sim_de2->crc)
158 		return (-1);
159 	if (sim_de1->crc > sim_de2->crc)
160 		return (1);
161 
162 	return (0);
163 }
164 
165 static int
166 rb_dedup_entry_compare(struct dedup_entry *de1, struct dedup_entry *de2)
167 {
168 	if (de1->leaf.data_crc < de2->leaf.data_crc)
169 		return (-1);
170 	if (de1->leaf.data_crc > de2->leaf.data_crc)
171 		return (1);
172 
173 	return (0);
174 }
175 
176 static int
177 rb_sha_dedup_entry_compare(struct sha_dedup_entry *sha_de1,
178 			struct sha_dedup_entry *sha_de2)
179 {
180 	unsigned long *h1 = (unsigned long *)&sha_de1->sha_hash;
181 	unsigned long *h2 = (unsigned long *)&sha_de2->sha_hash;
182 	int i;
183 
184 	for (i = 0; i < SHA256_DIGEST_LENGTH / (int)sizeof(unsigned long); ++i) {
185 		if (h1[i] < h2[i])
186 			return (-1);
187 		if (h1[i] > h2[i])
188 			return (1);
189 	}
190 
191 	return (0);
192 }
193 
194 /*
195  * dedup-simulate <filesystem>
196  */
197 void
198 hammer_cmd_dedup_simulate(char **av, int ac)
199 {
200 	struct sim_dedup_entry *sim_de;
201 
202 	if (ac != 1)
203 		dedup_usage(1);
204 
205 	glob_fd = getpfs(&glob_pfs, av[0]);
206 
207 	/*
208 	 * Collection passes (memory limited)
209 	 */
210 	printf("Dedup-simulate running\n");
211 	do {
212 		DedupCrcStart = DedupCrcEnd;
213 		DedupCrcEnd = 0;
214 		MemoryUse = 0;
215 
216 		if (VerboseOpt) {
217 			printf("b-tree pass  crc-range %08x-max\n",
218 				DedupCrcStart);
219 			fflush(stdout);
220 		}
221 		scan_pfs(av[0], collect_btree_elm, "simu-pass");
222 
223 		if (VerboseOpt >= 2)
224 			dump_simulated_dedup();
225 
226 		/*
227 		 * Calculate simulated dedup ratio and get rid of the tree
228 		 */
229 		while ((sim_de = RB_ROOT(&sim_dedup_tree)) != NULL) {
230 			assert(sim_de->ref_blks != 0);
231 			dedup_ref_size += sim_de->ref_size;
232 			dedup_alloc_size += sim_de->ref_size / sim_de->ref_blks;
233 
234 			RB_REMOVE(sim_dedup_entry_rb_tree, &sim_dedup_tree, sim_de);
235 			free(sim_de);
236 		}
237 		if (DedupCrcEnd && VerboseOpt == 0)
238 			printf(".");
239 	} while (DedupCrcEnd);
240 
241 	printf("Dedup-simulate %s succeeded\n", av[0]);
242 	relpfs(glob_fd, &glob_pfs);
243 
244 	printf("Simulated dedup ratio = %.2f\n",
245 	    (dedup_alloc_size != 0) ?
246 		(double)dedup_ref_size / dedup_alloc_size : 0);
247 }
248 
249 /*
250  * dedup <filesystem>
251  */
252 void
253 hammer_cmd_dedup(char **av, int ac)
254 {
255 	struct dedup_entry *de;
256 	struct sha_dedup_entry *sha_de;
257 	struct pass2_dedup_entry *pass2_de;
258 	char *tmp;
259 	char buf[8];
260 	int needfree = 0;
261 
262 	if (TimeoutOpt > 0)
263 		alarm(TimeoutOpt);
264 
265 	if (ac != 1)
266 		dedup_usage(1);
267 
268 	STAILQ_INIT(&pass2_dedup_queue);
269 
270 	glob_fd = getpfs(&glob_pfs, av[0]);
271 
272 	/*
273 	 * A cycle file is _required_ for resuming dedup after the timeout
274 	 * specified with -t has expired. If no -c option, then place a
275 	 * .dedup.cycle file either in the PFS snapshots directory or in
276 	 * the default snapshots directory.
277 	 */
278 	if (!CyclePath) {
279 		if (glob_pfs.ondisk->snapshots[0] != '/')
280 			asprintf(&tmp, "%s/%s/.dedup.cycle",
281 			    SNAPSHOTS_BASE, av[0]);
282 		else
283 			asprintf(&tmp, "%s/.dedup.cycle",
284 			    glob_pfs.ondisk->snapshots);
285 		CyclePath = tmp;
286 		needfree = 1;
287 	}
288 
289 	/*
290 	 * Pre-pass to cache the btree
291 	 */
292 	scan_pfs(av[0], count_btree_elm, "pre-pass ");
293 	DedupTotalRecords = DedupCurrentRecords;
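
	/*
	 * The pre-pass only counts records (count_btree_elm) so that the
	 * SIGINFO handler in scan_pfs() can report percentage progress
	 * during the main passes.
	 */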
294 
295 	/*
296 	 * Collection passes (memory limited)
297 	 */
298 	printf("Dedup running\n");
299 	do {
300 		DedupCrcStart = DedupCrcEnd;
301 		DedupCrcEnd = 0;
302 		MemoryUse = 0;
303 
304 		if (VerboseOpt) {
305 			printf("b-tree pass  crc-range %08x-max\n",
306 				DedupCrcStart);
307 			fflush(stdout);
308 		}
309 		scan_pfs(av[0], process_btree_elm, "main-pass");
310 
311 		while ((pass2_de = STAILQ_FIRST(&pass2_dedup_queue)) != NULL) {
312 			if (process_btree_elm(&pass2_de->leaf, DEDUP_PASS2))
313 				dedup_skipped_size -= pass2_de->leaf.data_len;
314 
315 			STAILQ_REMOVE_HEAD(&pass2_dedup_queue, sq_entry);
316 			free(pass2_de);
317 		}
318 		assert(STAILQ_EMPTY(&pass2_dedup_queue));
319 
320 		if (VerboseOpt >= 2)
321 			dump_real_dedup();
322 
323 		/*
324 		 * Calculate dedup ratio and get rid of the trees
325 		 */
326 		while ((de = RB_ROOT(&dedup_tree)) != NULL) {
327 			if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
328 				while ((sha_de = RB_ROOT(&de->u.fict_root)) != NULL) {
329 					assert(sha_de->ref_blks != 0);
330 					dedup_ref_size += sha_de->ref_size;
331 					dedup_alloc_size += sha_de->ref_size / sha_de->ref_blks;
332 
333 					RB_REMOVE(sha_dedup_entry_rb_tree,
334 							&de->u.fict_root, sha_de);
335 					free(sha_de);
336 				}
337 				assert(RB_EMPTY(&de->u.fict_root));
338 			} else {
339 				assert(de->u.de.ref_blks != 0);
340 				dedup_ref_size += de->u.de.ref_size;
341 				dedup_alloc_size += de->u.de.ref_size / de->u.de.ref_blks;
342 			}
343 
344 			RB_REMOVE(dedup_entry_rb_tree, &dedup_tree, de);
345 			free(de);
346 		}
347 		assert(RB_EMPTY(&dedup_tree));
348 		if (DedupCrcEnd && VerboseOpt == 0)
349 			printf(".");
350 	} while (DedupCrcEnd);
351 
352 	printf("Dedup %s succeeded\n", av[0]);
353 	relpfs(glob_fd, &glob_pfs);
354 
355 	humanize_unsigned(buf, sizeof(buf), dedup_ref_size, "B", 1024);
356 	printf("Dedup ratio = %.2f (in this run)\n"
357 	       "    %8s referenced\n",
358 	       ((dedup_alloc_size != 0) ?
359 			(double)dedup_ref_size / dedup_alloc_size : 0),
360 	       buf
361 	);
362 	humanize_unsigned(buf, sizeof(buf), dedup_alloc_size, "B", 1024);
363 	printf("    %8s allocated\n", buf);
364 	humanize_unsigned(buf, sizeof(buf), dedup_skipped_size, "B", 1024);
365 	printf("    %8s skipped\n", buf);
366 	printf("    %8jd CRC collisions\n"
367 	       "    %8jd SHA collisions\n"
368 	       "    %8jd bigblock underflows\n"
369 	       "    %8jd new dedup records\n"
370 	       "    %8jd new dedup bytes\n",
371 	       (intmax_t)dedup_crc_failures,
372 	       (intmax_t)dedup_sha_failures,
373 	       (intmax_t)dedup_underflows,
374 	       (intmax_t)dedup_successes_count,
375 	       (intmax_t)dedup_successes_bytes
376 	);
377 
378 	/* Once completed remove cycle file */
379 	hammer_reset_cycle();
380 
381 	/* We don't want to interfere with other directives */
382 	if (needfree) {
383 		free(tmp);
384 		CyclePath = NULL;
385 	}
386 }
387 
388 static int
389 count_btree_elm(hammer_btree_leaf_elm_t scan_leaf __unused, int flags __unused)
390 {
391 	return(1);
392 }
393 
394 static int
395 collect_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags __unused)
396 {
397 	struct sim_dedup_entry *sim_de;
398 
399 	/*
400 	 * If we are using too much memory we have to clean some out, which
401 	 * will cause the run to use multiple passes.  Be careful of integer
402 	 * overflows!
403 	 */
404 	if (MemoryUse > MemoryLimit) {
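		/*
		 * Shrink the CRC range covered by this pass.  The first time
		 * this triggers DedupCrcEnd is still 0 ("no upper bound") and
		 * the unsigned subtraction wraps, putting the new bound
		 * roughly midway between DedupCrcStart and 0xffffffff;
		 * later trims keep halving [DedupCrcStart, DedupCrcEnd).
		 * Entries at or above the new bound are dropped here and
		 * re-collected in a later pass.
		 */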
405 		DedupCrcEnd = DedupCrcStart +
406 			      (u_int32_t)(DedupCrcEnd - DedupCrcStart - 1) / 2;
407 		if (VerboseOpt) {
408 			printf("memory limit  crc-range %08x-%08x\n",
409 				DedupCrcStart, DedupCrcEnd);
410 			fflush(stdout);
411 		}
412 		for (;;) {
413 			sim_de = RB_MAX(sim_dedup_entry_rb_tree,
414 					&sim_dedup_tree);
415 			if (sim_de == NULL || sim_de->crc < DedupCrcEnd)
416 				break;
417 			RB_REMOVE(sim_dedup_entry_rb_tree,
418 				  &sim_dedup_tree, sim_de);
419 			MemoryUse -= sizeof(*sim_de);
420 			free(sim_de);
421 		}
422 	}
423 
424 	/*
425 	 * Collect statistics based on the CRC only, do not try to read
426 	 * any data blocks or run SHA hashes.
427 	 */
428 	sim_de = RB_LOOKUP(sim_dedup_entry_rb_tree, &sim_dedup_tree,
429 			   scan_leaf->data_crc);
430 
431 	if (sim_de == NULL) {
432 		sim_de = calloc(1, sizeof(*sim_de));
433 		sim_de->crc = scan_leaf->data_crc;
434 		RB_INSERT(sim_dedup_entry_rb_tree, &sim_dedup_tree, sim_de);
435 		MemoryUse += sizeof(*sim_de);
436 	}
437 
438 	sim_de->ref_blks += 1;
439 	sim_de->ref_size += scan_leaf->data_len;
440 	return (1);
441 }
442 
443 static __inline int
444 validate_dedup_pair(hammer_btree_leaf_elm_t p, hammer_btree_leaf_elm_t q)
445 {
446 	if ((p->data_offset & HAMMER_OFF_ZONE_MASK) !=
447 	    (q->data_offset & HAMMER_OFF_ZONE_MASK)) {
448 		return (1);
449 	}
450 	if (p->data_len != q->data_len) {
451 		return (1);
452 	}
453 
454 	return (0);
455 }
456 
457 #define DEDUP_TECH_FAILURE	1
458 #define DEDUP_CMP_FAILURE	2
459 #define DEDUP_INVALID_ZONE	3
460 #define DEDUP_UNDERFLOW		4
461 #define DEDUP_VERS_FAILURE	5
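
/*
 * How process_btree_elm() reacts to these deduplicate() results:
 *
 *	DEDUP_TECH_FAILURE  - retry later via the Pass2 queue
 *	DEDUP_CMP_FAILURE   - treated as a CRC (or SHA) collision
 *	DEDUP_INVALID_ZONE  - candidate dropped (terminate_early)
 *	DEDUP_UNDERFLOW     - stored leaf is replaced with scan_leaf
 *	DEDUP_VERS_FAILURE  - fatal, filesystem version is too old
 */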
462 
463 static __inline int
464 deduplicate(hammer_btree_leaf_elm_t p, hammer_btree_leaf_elm_t q)
465 {
466 	struct hammer_ioc_dedup dedup;
467 
468 	bzero(&dedup, sizeof(dedup));
469 
470 	/*
471 	 * If the data_offset fields are the same there is no need to run the
472 	 * ioctl; the candidate is already dedup'ed.
473 	 */
474 	if (p->data_offset == q->data_offset) {
475 		return (0);
476 	}
477 
478 	dedup.elm1 = p->base;
479 	dedup.elm2 = q->base;
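	/*
	 * Hand both elements to the kernel.  HAMMERIOC_DEDUP is expected to
	 * verify that the two data blocks are byte-identical and, if so,
	 * make them share a single on-media block; the CMP_FAILURE,
	 * INVALID_ZONE and UNDERFLOW flags below report why it could not.
	 */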
480 	RunningIoctl = 1;
481 	if (ioctl(glob_fd, HAMMERIOC_DEDUP, &dedup) < 0) {
482 		if (errno == EOPNOTSUPP) {
483 			/* must be at least version 5 */
484 			return (DEDUP_VERS_FAILURE);
485 		}
486 		/* Technical failure - locking or whatever */
487 		return (DEDUP_TECH_FAILURE);
488 	}
489 	if (dedup.head.flags & HAMMER_IOC_DEDUP_CMP_FAILURE) {
490 		return (DEDUP_CMP_FAILURE);
491 	}
492 	if (dedup.head.flags & HAMMER_IOC_DEDUP_INVALID_ZONE) {
493 		return (DEDUP_INVALID_ZONE);
494 	}
495 	if (dedup.head.flags & HAMMER_IOC_DEDUP_UNDERFLOW) {
496 		return (DEDUP_UNDERFLOW);
497 	}
498 	RunningIoctl = 0;
499 	++dedup_successes_count;
500 	dedup_successes_bytes += p->data_len;
501 	return (0);
502 }
503 
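/*
 * Per-leaf callback for 'hammer dedup': group DATA leaves by data_crc in
 * dedup_tree, fall back to a SHA-256 sub-tree when CRCs collide, and try to
 * deduplicate matching pairs via deduplicate().  Failed ioctl attempts are
 * queued on the Pass2 list; unusable candidates terminate early and only
 * adjust the accounting.
 */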
504 static int
505 process_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags)
506 {
507 	struct dedup_entry *de;
508 	struct sha_dedup_entry *sha_de, temp;
509 	struct pass2_dedup_entry *pass2_de;
510 	int error;
511 
512 	/*
513 	 * If we are using too much memory we have to clean some out, which
514 	 * will cause the run to use multiple passes.  Be careful of integer
515 	 * overflows!
516 	 */
517 	while (MemoryUse > MemoryLimit) {
518 		DedupCrcEnd = DedupCrcStart +
519 			      (u_int32_t)(DedupCrcEnd - DedupCrcStart - 1) / 2;
520 		if (VerboseOpt) {
521 			printf("memory limit  crc-range %08x-%08x\n",
522 				DedupCrcStart, DedupCrcEnd);
523 			fflush(stdout);
524 		}
525 
526 		for (;;) {
527 			de = RB_MAX(dedup_entry_rb_tree, &dedup_tree);
528 			if (de == NULL || de->leaf.data_crc < DedupCrcEnd)
529 				break;
530 			if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
531 				while ((sha_de = RB_ROOT(&de->u.fict_root)) !=
532 				       NULL) {
533 					RB_REMOVE(sha_dedup_entry_rb_tree,
534 						  &de->u.fict_root, sha_de);
535 					MemoryUse -= sizeof(*sha_de);
536 					free(sha_de);
537 				}
538 			}
539 			RB_REMOVE(dedup_entry_rb_tree, &dedup_tree, de);
540 			MemoryUse -= sizeof(*de);
541 			free(de);
542 		}
543 	}
544 
545 	/*
546 	 * Collect statistics based on the CRC.  Colliding CRCs usually
547 	 * cause a SHA sub-tree to be created under the de.
548 	 *
549 	 * Trivial case if de not found.
550 	 */
551 	de = RB_LOOKUP(dedup_entry_rb_tree, &dedup_tree, scan_leaf->data_crc);
552 	if (de == NULL) {
553 		de = calloc(1, sizeof(*de));
554 		de->leaf = *scan_leaf;
555 		RB_INSERT(dedup_entry_rb_tree, &dedup_tree, de);
556 		MemoryUse += sizeof(*de);
557 		goto upgrade_stats;
558 	}
559 
560 	/*
561 	 * Found entry in CRC tree
562 	 */
563 	if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
564 		/*
565 		 * Optimize the case where a CRC collision results in multiple
566 		 * SHA entries.  If we unconditionally issued a data-read, a
567 		 * degenerate situation in which a colliding CRC's second SHA
568 		 * entry contains the lion's share of the deduplication
569 		 * candidates would result in excessive data block reads.
570 		 *
571 		 * Deal with the degenerate case by looking for a matching
572 		 * data_offset/data_len in the SHA elements we already have
573 		 * before reading the data block and generating a new SHA.
574 		 */
575 		RB_FOREACH(sha_de, sha_dedup_entry_rb_tree, &de->u.fict_root) {
576 			if (sha_de->leaf.data_offset ==
577 						scan_leaf->data_offset &&
578 			    sha_de->leaf.data_len == scan_leaf->data_len) {
579 				memcpy(temp.sha_hash, sha_de->sha_hash,
580 					SHA256_DIGEST_LENGTH);
581 				break;
582 			}
583 		}
584 
585 		/*
586 		 * Entry in CRC tree is fictitious, so we already had problems
587 		 * with this CRC. Upgrade (compute SHA) the candidate and
588 		 * dive into SHA subtree. If upgrade fails insert the candidate
589 		 * into Pass2 list (it will be processed later).
590 		 */
591 		if (sha_de == NULL) {
592 			if (upgrade_chksum(scan_leaf, temp.sha_hash))
593 				goto pass2_insert;
594 
595 			sha_de = RB_FIND(sha_dedup_entry_rb_tree,
596 					 &de->u.fict_root, &temp);
597 		}
598 
599 		/*
600 		 * Nothing in SHA subtree so far, so this is a new
601 		 * 'dataset'. Insert new entry into SHA subtree.
602 		 */
603 		if (sha_de == NULL) {
604 			sha_de = calloc(1, sizeof(*sha_de));
605 			sha_de->leaf = *scan_leaf;
606 			memcpy(sha_de->sha_hash, temp.sha_hash,
607 			       SHA256_DIGEST_LENGTH);
608 			RB_INSERT(sha_dedup_entry_rb_tree, &de->u.fict_root,
609 				  sha_de);
610 			MemoryUse += sizeof(*sha_de);
611 			goto upgrade_stats_sha;
612 		}
613 
614 		/*
615 		 * Found an entry in the SHA subtree, which means we have a
616 		 * potential dedup pair. Validate it (the zones have to match
617 		 * and the data_len fields have to be the same). If validation
618 		 * fails, treat it as a SHA collision (jump to sha256_failure).
619 		 */
620 		if (validate_dedup_pair(&sha_de->leaf, scan_leaf))
621 			goto sha256_failure;
622 
623 		/*
624 		 * We have a valid dedup pair (SHA match, validated).
625 		 *
626 		 * ioctl failed anyway) insert the candidate into the Pass2 list
627 		 * ioctl failed anyways) insert the candidate into Pass2 list
628 		 * (we will try to dedup it after we are done with the rest of
629 		 * the tree).
630 		 *
631 		 * If the ioctl fails because either of the blocks is in a non-dedup
632 		 * zone (we can dedup only in LARGE_DATA and SMALL_DATA), don't
633 		 * bother with the candidate and terminate early.
634 		 *
635 		 * If the ioctl fails because of bigblock underflow, replace the
636 		 * leaf node that the found dedup entry represents with scan_leaf.
637 		 */
638 		error = deduplicate(&sha_de->leaf, scan_leaf);
639 		switch(error) {
640 		case 0:
641 			goto upgrade_stats_sha;
642 		case DEDUP_TECH_FAILURE:
643 			goto pass2_insert;
644 		case DEDUP_CMP_FAILURE:
645 			goto sha256_failure;
646 		case DEDUP_INVALID_ZONE:
647 			goto terminate_early;
648 		case DEDUP_UNDERFLOW:
649 			++dedup_underflows;
650 			sha_de->leaf = *scan_leaf;
651 			memcpy(sha_de->sha_hash, temp.sha_hash,
652 				SHA256_DIGEST_LENGTH);
653 			goto upgrade_stats_sha;
654 		case DEDUP_VERS_FAILURE:
655 			fprintf(stderr,
656 				"HAMMER filesystem must be at least "
657 				"version 5 to dedup\n");
658 			exit (1);
659 		default:
660 			fprintf(stderr, "Unknown error\n");
661 			goto terminate_early;
662 		}
663 
664 		/*
665 		 * Ooh la la.. SHA-256 collision. Terminate early, there's
666 		 * nothing we can do here.
667 		 */
668 sha256_failure:
669 		++dedup_sha_failures;
670 		goto terminate_early;
671 	} else {
672 		/*
673 		 * Candidate CRC is good for now (we found an entry in CRC
674 		 * tree and it's not fictitious). This means we have a
675 		 * potential dedup pair.
676 		 */
677 		if (validate_dedup_pair(&de->leaf, scan_leaf))
678 			goto crc_failure;
679 
680 		/*
681 		 * We have a valid dedup pair (CRC match, validated)
682 		 */
683 		error = deduplicate(&de->leaf, scan_leaf);
684 		switch(error) {
685 		case 0:
686 			goto upgrade_stats;
687 		case DEDUP_TECH_FAILURE:
688 			goto pass2_insert;
689 		case DEDUP_CMP_FAILURE:
690 			goto crc_failure;
691 		case DEDUP_INVALID_ZONE:
692 			goto terminate_early;
693 		case DEDUP_UNDERFLOW:
694 			++dedup_underflows;
695 			de->leaf = *scan_leaf;
696 			goto upgrade_stats;
697 		case DEDUP_VERS_FAILURE:
698 			fprintf(stderr,
699 				"HAMMER filesystem must be at least "
700 				"version 5 to dedup\n");
701 			exit (1);
702 		default:
703 			fprintf(stderr, "Unknown error\n");
704 			goto terminate_early;
705 		}
706 
707 crc_failure:
708 		/*
709 		 * We got a CRC collision - either the ioctl failed because of
710 		 * a comparison failure or validation of the potential
711 		 * dedup pair went bad. In either case insert both blocks
712 		 * into the SHA subtree (this requires a checksum upgrade) and
713 		 * mark the entry that corresponds to this CRC in the CRC tree
714 		 * fictitious, so that all further operations with this CRC go
715 		 * through the SHA subtree.
716 		 */
717 		++dedup_crc_failures;
718 
719 		/*
720 		 * Insert the block that was represented by the now-fictitious
721 		 * dedup entry (create a new SHA entry and preserve the stats
722 		 * of the old CRC one). If the checksum upgrade fails insert
723 		 * the candidate into the Pass2 list and return - keep both
724 		 * trees unmodified.
725 		 */
726 		sha_de = calloc(1, sizeof(*sha_de));
727 		sha_de->leaf = de->leaf;
728 		sha_de->ref_blks = de->u.de.ref_blks;
729 		sha_de->ref_size = de->u.de.ref_size;
730 		if (upgrade_chksum(&sha_de->leaf, sha_de->sha_hash)) {
731 			free(sha_de);
732 			goto pass2_insert;
733 		}
734 		MemoryUse += sizeof(*sha_de);
735 
736 		RB_INIT(&de->u.fict_root);
737 		/*
738 		 * Here we can insert without prior checking because the tree
739 		 * is empty at this point
740 		 */
741 		RB_INSERT(sha_dedup_entry_rb_tree, &de->u.fict_root, sha_de);
742 
743 		/*
744 		 * Mark entry in CRC tree fictitious
745 		 */
746 		de->flags |= HAMMER_DEDUP_ENTRY_FICTITIOUS;
747 
748 		/*
749 		 * Upgrade checksum of the candidate and insert it into
750 		 * SHA subtree. If upgrade fails insert the candidate into
751 		 * Pass2 list.
752 		 */
753 		if (upgrade_chksum(scan_leaf, temp.sha_hash)) {
754 			goto pass2_insert;
755 		}
756 		sha_de = RB_FIND(sha_dedup_entry_rb_tree, &de->u.fict_root,
757 				 &temp);
758 		if (sha_de != NULL)
759 			/* There is an entry with this SHA already, but the only
760 			 * RB-tree element at this point is that entry we just
761 			 * added. We know for sure these blocks are different
762 			 * (this is the crc_failure branch) so treat it as a SHA
763 			 * collision.
764 			 */
765 			goto sha256_failure;
766 
767 		sha_de = calloc(1, sizeof(*sha_de));
768 		sha_de->leaf = *scan_leaf;
769 		memcpy(sha_de->sha_hash, temp.sha_hash, SHA256_DIGEST_LENGTH);
770 		RB_INSERT(sha_dedup_entry_rb_tree, &de->u.fict_root, sha_de);
771 		MemoryUse += sizeof(*sha_de);
772 		goto upgrade_stats_sha;
773 	}
774 
775 upgrade_stats:
776 	de->u.de.ref_blks += 1;
777 	de->u.de.ref_size += scan_leaf->data_len;
778 	return (1);
779 
780 upgrade_stats_sha:
781 	sha_de->ref_blks += 1;
782 	sha_de->ref_size += scan_leaf->data_len;
783 	return (1);
784 
785 pass2_insert:
786 	/*
787 	 * If in pass2 mode don't insert anything, fall through to
788 	 * terminate_early
789 	 */
790 	if ((flags & DEDUP_PASS2) == 0) {
791 		pass2_de = calloc(1, sizeof(*pass2_de));
792 		pass2_de->leaf = *scan_leaf;
793 		STAILQ_INSERT_TAIL(&pass2_dedup_queue, pass2_de, sq_entry);
794 		dedup_skipped_size += scan_leaf->data_len;
795 		return (1);
796 	}
797 
798 terminate_early:
799 	/*
800 	 * Early termination path. Fixup stats.
801 	 */
802 	dedup_alloc_size += scan_leaf->data_len;
803 	dedup_ref_size += scan_leaf->data_len;
804 	return (0);
805 }
806 
807 static int
808 upgrade_chksum(hammer_btree_leaf_elm_t leaf, u_int8_t *sha_hash)
809 {
810 	struct hammer_ioc_data data;
811 	char *buf = malloc(DEDUP_BUF);
812 	SHA256_CTX ctx;
813 	int error;
814 
815 	bzero(&data, sizeof(data));
816 	data.elm = leaf->base;
817 	data.ubuf = buf;
818 	data.size = DEDUP_BUF;
819 
820 	error = 0;
821 	if (ioctl(glob_fd, HAMMERIOC_GET_DATA, &data) < 0) {
822 		fprintf(stderr, "Get-data failed: %s\n", strerror(errno));
823 		error = 1;
824 		goto done;
825 	}
826 	DedupDataReads += leaf->data_len;
827 
828 	if (data.leaf.data_len != leaf->data_len) {
829 		error = 1;
830 		goto done;
831 	}
832 
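	/*
	 * Only DATA records get a SHA-256.  The callers only pass DATA
	 * leaves (scan_pfs() filters on rec_type), so on a successful
	 * return sha_hash has been filled in.
	 */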
833 	if (data.leaf.base.btype == HAMMER_BTREE_TYPE_RECORD &&
834 	    data.leaf.base.rec_type == HAMMER_RECTYPE_DATA) {
835 		SHA256_Init(&ctx);
836 		SHA256_Update(&ctx, (void *)buf, data.leaf.data_len);
837 		SHA256_Final(sha_hash, &ctx);
838 	}
839 
840 done:
841 	free(buf);
842 	return (error);
843 }
844 
845 static void
846 sigAlrm(int signo __unused)
847 {
848 	SigAlrmFlag = 1;
849 }
850 
851 static void
852 sigInfo(int signo __unused)
853 {
854 	SigInfoFlag = 1;
855 }
856 
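/*
 * Walk all B-Tree leaves of the PFS using HAMMERIOC_MIRROR_READ in keys-only
 * (NODATA) mode and invoke 'func' for every DATA record, optionally
 * restricted to the CRC range of the current pass.
 */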
857 static void
858 scan_pfs(char *filesystem, scan_pfs_cb_t func, const char *id)
859 {
860 	struct hammer_ioc_mirror_rw mirror;
861 	hammer_ioc_mrecord_any_t mrec;
862 	struct hammer_btree_leaf_elm elm;
863 	char *buf = malloc(DEDUP_BUF);
864 	char buf1[8];
865 	char buf2[8];
866 	int offset, bytes;
867 
868 	SigInfoFlag = 0;
869 	DedupDataReads = 0;
870 	DedupCurrentRecords = 0;
871 	signal(SIGINFO, sigInfo);
872 	signal(SIGALRM, sigAlrm);
873 
874 	/*
875 	 * Deduplication happens per element so hammer(8) is in full
876 	 * control of the ioctl()s to actually perform it. SIGALRM
877 	 * needs to be handled within hammer(8) but a checkpoint
878 	 * is needed for resuming. Use the cycle file for that.
879 	 *
880 	 * Try to obtain the previous obj_id from the cycle file and
881 	 * if not available just start from the beginning.
882 	 */
883 	bzero(&mirror, sizeof(mirror));
884 	hammer_key_beg_init(&mirror.key_beg);
885 	hammer_get_cycle(&mirror.key_beg, &mirror.tid_beg);
886 
887 	if (mirror.key_beg.obj_id != (int64_t)HAMMER_MIN_OBJID) {
888 		if (VerboseOpt)
889 			fprintf(stderr, "%s: mirror-read: Resuming at object %016jx\n",
890 			    id, (uintmax_t)mirror.key_beg.obj_id);
891 	}
892 
893 	hammer_key_end_init(&mirror.key_end);
894 
895 	mirror.tid_beg = glob_pfs.ondisk->sync_beg_tid;
896 	mirror.tid_end = glob_pfs.ondisk->sync_end_tid;
897 	mirror.head.flags |= HAMMER_IOC_MIRROR_NODATA; /* we want only keys */
898 	mirror.ubuf = buf;
899 	mirror.size = DEDUP_BUF;
900 	mirror.pfs_id = glob_pfs.pfs_id;
901 	mirror.shared_uuid = glob_pfs.ondisk->shared_uuid;
902 
903 	if (VerboseOpt && DedupCrcStart == 0) {
904 		printf("%s %s: objspace %016jx:%04x %016jx:%04x\n",
905 			id, filesystem,
906 			(uintmax_t)mirror.key_beg.obj_id,
907 			mirror.key_beg.localization,
908 			(uintmax_t)mirror.key_end.obj_id,
909 			mirror.key_end.localization);
910 		printf("%s %s: pfs_id %d\n",
911 			id, filesystem, glob_pfs.pfs_id);
912 	}
913 	fflush(stdout);
914 	fflush(stderr);
915 
916 	do {
917 		mirror.count = 0;
918 		mirror.pfs_id = glob_pfs.pfs_id;
919 		mirror.shared_uuid = glob_pfs.ondisk->shared_uuid;
920 		if (ioctl(glob_fd, HAMMERIOC_MIRROR_READ, &mirror) < 0) {
921 			fprintf(stderr, "Mirror-read %s failed: %s\n",
922 				filesystem, strerror(errno));
923 			exit(1);
924 		}
925 		if (mirror.head.flags & HAMMER_IOC_HEAD_ERROR) {
926 			fprintf(stderr, "Mirror-read %s fatal error %d\n",
927 				filesystem, mirror.head.error);
928 			exit(1);
929 		}
930 		if (mirror.count) {
931 			offset = 0;
932 			while (offset < mirror.count) {
933 				mrec = (void *)((char *)buf + offset);
934 				bytes = HAMMER_HEAD_DOALIGN(mrec->head.rec_size);
935 				if (offset + bytes > mirror.count) {
936 					fprintf(stderr, "Misaligned record\n");
937 					exit(1);
938 				}
939 				assert((mrec->head.type &
940 				       HAMMER_MRECF_TYPE_MASK) ==
941 				       HAMMER_MREC_TYPE_REC);
942 				offset += bytes;
943 				elm = mrec->rec.leaf;
944 				if (elm.base.btype != HAMMER_BTREE_TYPE_RECORD)
945 					continue;
946 				if (elm.base.rec_type != HAMMER_RECTYPE_DATA)
947 					continue;
948 				++DedupCurrentRecords;
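				/*
				 * Restrict the callback to the CRC range of
				 * the current pass.  On the first pass both
				 * bounds are 0 and every record is passed
				 * through; DedupCrcEnd == 0 means "no upper
				 * bound".
				 */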
949 				if (DedupCrcStart != DedupCrcEnd) {
950 					if (elm.data_crc < DedupCrcStart)
951 						continue;
952 					if (DedupCrcEnd &&
953 					    elm.data_crc >= DedupCrcEnd) {
954 						continue;
955 					}
956 				}
957 				func(&elm, 0);
958 			}
959 		}
960 		mirror.key_beg = mirror.key_cur;
961 		if (DidInterrupt || SigAlrmFlag) {
962 			if (VerboseOpt)
963 				fprintf(stderr, "%s\n",
964 				    (DidInterrupt ? "Interrupted" : "Timeout"));
965 			hammer_set_cycle(&mirror.key_cur, mirror.tid_beg);
966 			if (VerboseOpt)
967 				fprintf(stderr, "Cyclefile %s updated for "
968 				    "continuation\n", CyclePath);
969 			exit(1);
970 		}
971 		if (SigInfoFlag) {
972 			if (DedupTotalRecords) {
973 				humanize_unsigned(buf1, sizeof(buf1),
974 						  DedupDataReads,
975 						  "B", 1024);
976 				humanize_unsigned(buf2, sizeof(buf2),
977 						  dedup_successes_bytes,
978 						  "B", 1024);
979 				fprintf(stderr, "%s count %7jd/%jd "
980 						"(%02d.%02d%%) "
981 						"ioread %s newddup %s\n",
982 					id,
983 					(intmax_t)DedupCurrentRecords,
984 					(intmax_t)DedupTotalRecords,
985 					(int)(DedupCurrentRecords * 100 /
986 						DedupTotalRecords),
987 					(int)(DedupCurrentRecords * 10000 /
988 						DedupTotalRecords % 100),
989 					buf1, buf2);
990 			} else {
991 				fprintf(stderr, "%s count %-7jd\n",
992 					id,
993 					(intmax_t)DedupCurrentRecords);
994 			}
995 			SigInfoFlag = 0;
996 		}
997 	} while (mirror.count != 0);
998 
999 	signal(SIGINFO, SIG_IGN);
1000 	signal(SIGALRM, SIG_IGN);
1001 
1002 	free(buf);
1003 }
1004 
1005 static void
1006 dump_simulated_dedup(void)
1007 {
1008 	struct sim_dedup_entry *sim_de;
1009 
1010 	printf("=== Dumping simulated dedup entries:\n");
1011 	RB_FOREACH(sim_de, sim_dedup_entry_rb_tree, &sim_dedup_tree) {
1012 		printf("\tcrc=%08x cnt=%ju size=%ju\n",
1013 			sim_de->crc,
1014 			(uintmax_t)sim_de->ref_blks,
1015 			(uintmax_t)sim_de->ref_size);
1016 	}
1017 	printf("end of dump ===\n");
1018 }
1019 
1020 static void
1021 dump_real_dedup(void)
1022 {
1023 	struct dedup_entry *de;
1024 	struct sha_dedup_entry *sha_de;
1025 	int i;
1026 
1027 	printf("=== Dumping dedup entries:\n");
1028 	RB_FOREACH(de, dedup_entry_rb_tree, &dedup_tree) {
1029 		if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
1030 			printf("\tcrc=%08x fictitious\n", de->leaf.data_crc);
1031 
1032 			RB_FOREACH(sha_de, sha_dedup_entry_rb_tree, &de->u.fict_root) {
1033 				printf("\t\tcrc=%08x cnt=%ju size=%ju\n\t"
1034 				       "\t\tsha=",
1035 				       sha_de->leaf.data_crc,
1036 				       (uintmax_t)sha_de->ref_blks,
1037 				       (uintmax_t)sha_de->ref_size);
1038 				for (i = 0; i < SHA256_DIGEST_LENGTH; ++i)
1039 					printf("%02x", sha_de->sha_hash[i]);
1040 				printf("\n");
1041 			}
1042 		} else {
1043 			printf("\tcrc=%08x cnt=%ju size=%ju\n",
1044 			       de->leaf.data_crc,
1045 			       (uintmax_t)de->u.de.ref_blks,
1046 			       (uintmax_t)de->u.de.ref_size);
1047 		}
1048 	}
1049 	printf("end of dump ===\n");
1050 }
1051 
1052 static void
1053 dedup_usage(int code)
1054 {
1055 	fprintf(stderr,
1056 		"hammer dedup-simulate <filesystem>\n"
1057 		"hammer dedup <filesystem>\n"
1058 	);
1059 	exit(code);
1060 }
1061