xref: /dragonfly/sbin/hammer/cmd_dedup.c (revision dd491ed2)
1 /*
2  * Copyright (c) 2010 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Ilya Dryomov <idryomov@gmail.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 
35 #include "hammer.h"
36 
37 #include <sys/tree.h>
38 #include <libutil.h>
39 #include <crypto/sha2/sha2.h>
40 
41 #define DEDUP_BUF (64 * 1024)
42 
43 /* Sorted list of block CRCs - light version for dedup-simulate */
44 struct sim_dedup_entry_rb_tree;
45 RB_HEAD(sim_dedup_entry_rb_tree, sim_dedup_entry) sim_dedup_tree =
46 					RB_INITIALIZER(&sim_dedup_tree);
47 RB_PROTOTYPE2(sim_dedup_entry_rb_tree, sim_dedup_entry, rb_entry,
48 		rb_sim_dedup_entry_compare, hammer_crc_t);
49 
50 struct sim_dedup_entry {
51 	hammer_crc_t	crc;
52 	uint64_t	ref_blks; /* number of blocks referenced */
53 	uint64_t	ref_size; /* size of data referenced */
54 	RB_ENTRY(sim_dedup_entry) rb_entry;
55 };
56 
57 struct dedup_entry {
58 	struct hammer_btree_leaf_elm leaf;
59 	union {
60 		struct {
61 			uint64_t ref_blks;
62 			uint64_t ref_size;
63 		} de;
64 		RB_HEAD(sha_dedup_entry_rb_tree, sha_dedup_entry) fict_root;
65 	} u;
66 	uint8_t flags;
67 	RB_ENTRY(dedup_entry) rb_entry;
68 };
69 
70 #define HAMMER_DEDUP_ENTRY_FICTITIOUS	0x0001
71 
72 struct sha_dedup_entry {
73 	struct hammer_btree_leaf_elm	leaf;
74 	uint64_t			ref_blks;
75 	uint64_t			ref_size;
76 	uint8_t				sha_hash[SHA256_DIGEST_LENGTH];
77 	RB_ENTRY(sha_dedup_entry)	fict_entry;
78 };
79 
80 /* Sorted list of HAMMER B-Tree keys */
81 struct dedup_entry_rb_tree;
82 struct sha_dedup_entry_rb_tree;
83 
84 RB_HEAD(dedup_entry_rb_tree, dedup_entry) dedup_tree =
85 					RB_INITIALIZER(&dedup_tree);
86 RB_PROTOTYPE2(dedup_entry_rb_tree, dedup_entry, rb_entry,
87 		rb_dedup_entry_compare, hammer_crc_t);
88 
89 RB_PROTOTYPE(sha_dedup_entry_rb_tree, sha_dedup_entry, fict_entry,
90 		rb_sha_dedup_entry_compare);
91 
92 /*
93  * Pass2 list - contains entries that were not dedup'ed because ioctl failed
94  */
95 STAILQ_HEAD(, pass2_dedup_entry) pass2_dedup_queue =
96 				STAILQ_HEAD_INITIALIZER(pass2_dedup_queue);
97 
98 struct pass2_dedup_entry {
99 	struct hammer_btree_leaf_elm	leaf;
100 	STAILQ_ENTRY(pass2_dedup_entry)	sq_entry;
101 };
102 
103 #define DEDUP_PASS2	0x0001 /* process_btree_elm() mode */
104 
105 static int SigInfoFlag;
106 static int SigAlrmFlag;
107 static int64_t DedupDataReads;
108 static int64_t DedupCurrentRecords;
109 static int64_t DedupTotalRecords;
110 static uint32_t DedupCrcStart;
111 static uint32_t DedupCrcEnd;
112 static uint64_t MemoryUse;
113 
114 /* PFS global ids - we deal with just one PFS at a run */
115 static int glob_fd;
116 static struct hammer_ioc_pseudofs_rw glob_pfs;
117 
118 /*
119  * Global accounting variables
120  *
121  * Last three don't have to be 64-bit, just to be safe..
122  */
123 static uint64_t dedup_alloc_size;
124 static uint64_t dedup_ref_size;
125 static uint64_t dedup_skipped_size;
126 static uint64_t dedup_crc_failures;
127 static uint64_t dedup_sha_failures;
128 static uint64_t dedup_underflows;
129 static uint64_t dedup_successes_count;
130 static uint64_t dedup_successes_bytes;
131 
132 static int rb_sim_dedup_entry_compare(struct sim_dedup_entry *sim_de1,
133 				struct sim_dedup_entry *sim_de2);
134 static int rb_dedup_entry_compare(struct dedup_entry *de1,
135 				struct dedup_entry *de2);
136 static int rb_sha_dedup_entry_compare(struct sha_dedup_entry *sha_de1,
137 				struct sha_dedup_entry *sha_de2);
138 typedef int (*scan_pfs_cb_t)(hammer_btree_leaf_elm_t scan_leaf, int flags);
139 static void scan_pfs(char *filesystem, scan_pfs_cb_t func, const char *id);
140 static int collect_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags);
141 static int count_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags);
142 static int process_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags);
143 static int upgrade_chksum(hammer_btree_leaf_elm_t leaf, uint8_t *sha_hash);
144 static void dump_simulated_dedup(void);
145 static void dump_real_dedup(void);
146 static void dedup_usage(int code);
147 
148 RB_GENERATE2(sim_dedup_entry_rb_tree, sim_dedup_entry, rb_entry,
149 		rb_sim_dedup_entry_compare, hammer_crc_t, crc);
150 RB_GENERATE2(dedup_entry_rb_tree, dedup_entry, rb_entry,
151 		rb_dedup_entry_compare, hammer_crc_t, leaf.data_crc);
152 RB_GENERATE(sha_dedup_entry_rb_tree, sha_dedup_entry, fict_entry,
153 		rb_sha_dedup_entry_compare);
154 
155 static
156 int
157 rb_sim_dedup_entry_compare(struct sim_dedup_entry *sim_de1,
158 			struct sim_dedup_entry *sim_de2)
159 {
160 	if (sim_de1->crc < sim_de2->crc)
161 		return (-1);
162 	if (sim_de1->crc > sim_de2->crc)
163 		return (1);
164 
165 	return (0);
166 }
167 
168 static
169 int
170 rb_dedup_entry_compare(struct dedup_entry *de1, struct dedup_entry *de2)
171 {
172 	if (de1->leaf.data_crc < de2->leaf.data_crc)
173 		return (-1);
174 	if (de1->leaf.data_crc > de2->leaf.data_crc)
175 		return (1);
176 
177 	return (0);
178 }
179 
180 static
181 int
182 rb_sha_dedup_entry_compare(struct sha_dedup_entry *sha_de1,
183 			struct sha_dedup_entry *sha_de2)
184 {
185 	unsigned long *h1 = (unsigned long *)&sha_de1->sha_hash;
186 	unsigned long *h2 = (unsigned long *)&sha_de2->sha_hash;
187 	int i;
188 
189 	for (i = 0; i < SHA256_DIGEST_LENGTH / (int)sizeof(unsigned long); ++i) {
190 		if (h1[i] < h2[i])
191 			return (-1);
192 		if (h1[i] > h2[i])
193 			return (1);
194 	}
195 
196 	return (0);
197 }
198 
199 /*
200  * dedup-simulate <filesystem>
201  */
202 void
203 hammer_cmd_dedup_simulate(char **av, int ac)
204 {
205 	struct sim_dedup_entry *sim_de;
206 
207 	if (ac != 1) {
208 		dedup_usage(1);
209 		/* not reached */
210 	}
211 
212 	glob_fd = getpfs(&glob_pfs, av[0]);
213 
214 	/*
215 	 * Collection passes (memory limited)
216 	 */
217 	printf("Dedup-simulate running\n");
218 	do {
219 		DedupCrcStart = DedupCrcEnd;
220 		DedupCrcEnd = 0;
221 		MemoryUse = 0;
222 
223 		if (VerboseOpt) {
224 			printf("B-Tree pass  crc-range %08x-max\n",
225 				DedupCrcStart);
226 			fflush(stdout);
227 		}
228 		scan_pfs(av[0], collect_btree_elm, "simu-pass");
229 
230 		if (VerboseOpt >= 2)
231 			dump_simulated_dedup();
232 
233 		/*
234 		 * Calculate simulated dedup ratio and get rid of the tree
235 		 */
236 		while ((sim_de = RB_ROOT(&sim_dedup_tree)) != NULL) {
237 			assert(sim_de->ref_blks != 0);
238 			dedup_ref_size += sim_de->ref_size;
239 			dedup_alloc_size += sim_de->ref_size / sim_de->ref_blks;
240 
241 			RB_REMOVE(sim_dedup_entry_rb_tree, &sim_dedup_tree, sim_de);
242 			free(sim_de);
243 		}
244 		if (DedupCrcEnd && VerboseOpt == 0)
245 			printf(".");
246 	} while (DedupCrcEnd);
247 
248 	printf("Dedup-simulate %s succeeded\n", av[0]);
249 	relpfs(glob_fd, &glob_pfs);
250 
251 	printf("Simulated dedup ratio = %.2f\n",
252 	    (dedup_alloc_size != 0) ?
253 		(double)dedup_ref_size / dedup_alloc_size : 0);
254 }
255 
256 /*
257  * dedup <filesystem>
258  */
259 void
260 hammer_cmd_dedup(char **av, int ac)
261 {
262 	struct dedup_entry *de;
263 	struct sha_dedup_entry *sha_de;
264 	struct pass2_dedup_entry *pass2_de;
265 	char *tmp;
266 	char buf[8];
267 	int needfree = 0;
268 
269 	if (TimeoutOpt > 0)
270 		alarm(TimeoutOpt);
271 
272 	if (ac != 1) {
273 		dedup_usage(1);
274 		/* not reached */
275 	}
276 
277 	STAILQ_INIT(&pass2_dedup_queue);
278 
279 	glob_fd = getpfs(&glob_pfs, av[0]);
280 
281 	/*
282 	 * A cycle file is _required_ for resuming dedup after the timeout
283 	 * specified with -t has expired. If no -c option, then place a
284 	 * .dedup.cycle file either in the PFS snapshots directory or in
285 	 * the default snapshots directory.
286 	 */
287 	if (!CyclePath) {
288 		if (glob_pfs.ondisk->snapshots[0] != '/') {
289 			asprintf(&tmp, "%s/%s/.dedup.cycle",
290 			    SNAPSHOTS_BASE, av[0]);
291 		} else {
292 			asprintf(&tmp, "%s/.dedup.cycle",
293 			    glob_pfs.ondisk->snapshots);
294 		}
295 		CyclePath = tmp;
296 		needfree = 1;
297 	}
298 
299 	/*
300 	 * Pre-pass to cache the btree
301 	 */
302 	scan_pfs(av[0], count_btree_elm, "pre-pass ");
303 	DedupTotalRecords = DedupCurrentRecords;
304 
305 	/*
306 	 * Collection passes (memory limited)
307 	 */
308 	printf("Dedup running\n");
309 	do {
310 		DedupCrcStart = DedupCrcEnd;
311 		DedupCrcEnd = 0;
312 		MemoryUse = 0;
313 
314 		if (VerboseOpt) {
315 			printf("B-Tree pass  crc-range %08x-max\n",
316 				DedupCrcStart);
317 			fflush(stdout);
318 		}
319 		scan_pfs(av[0], process_btree_elm, "main-pass");
320 
321 		while ((pass2_de = STAILQ_FIRST(&pass2_dedup_queue)) != NULL) {
322 			if (process_btree_elm(&pass2_de->leaf, DEDUP_PASS2))
323 				dedup_skipped_size -= pass2_de->leaf.data_len;
324 
325 			STAILQ_REMOVE_HEAD(&pass2_dedup_queue, sq_entry);
326 			free(pass2_de);
327 		}
328 		assert(STAILQ_EMPTY(&pass2_dedup_queue));
329 
330 		if (VerboseOpt >= 2)
331 			dump_real_dedup();
332 
333 		/*
334 		 * Calculate dedup ratio and get rid of the trees
335 		 */
336 		while ((de = RB_ROOT(&dedup_tree)) != NULL) {
337 			if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
338 				while ((sha_de = RB_ROOT(&de->u.fict_root)) != NULL) {
339 					assert(sha_de->ref_blks != 0);
340 					dedup_ref_size += sha_de->ref_size;
341 					dedup_alloc_size += sha_de->ref_size / sha_de->ref_blks;
342 
343 					RB_REMOVE(sha_dedup_entry_rb_tree,
344 							&de->u.fict_root, sha_de);
345 					free(sha_de);
346 				}
347 				assert(RB_EMPTY(&de->u.fict_root));
348 			} else {
349 				assert(de->u.de.ref_blks != 0);
350 				dedup_ref_size += de->u.de.ref_size;
351 				dedup_alloc_size += de->u.de.ref_size / de->u.de.ref_blks;
352 			}
353 
354 			RB_REMOVE(dedup_entry_rb_tree, &dedup_tree, de);
355 			free(de);
356 		}
357 		assert(RB_EMPTY(&dedup_tree));
358 		if (DedupCrcEnd && VerboseOpt == 0)
359 			printf(".");
360 	} while (DedupCrcEnd);
361 
362 	printf("Dedup %s succeeded\n", av[0]);
363 	relpfs(glob_fd, &glob_pfs);
364 
365 	humanize_unsigned(buf, sizeof(buf), dedup_ref_size, "B", 1024);
366 	printf("Dedup ratio = %.2f (in this run)\n"
367 	       "    %8s referenced\n",
368 	       ((dedup_alloc_size != 0) ?
369 			(double)dedup_ref_size / dedup_alloc_size : 0),
370 	       buf
371 	);
372 	humanize_unsigned(buf, sizeof(buf), dedup_alloc_size, "B", 1024);
373 	printf("    %8s allocated\n", buf);
374 	humanize_unsigned(buf, sizeof(buf), dedup_skipped_size, "B", 1024);
375 	printf("    %8s skipped\n", buf);
376 	printf("    %8jd CRC collisions\n"
377 	       "    %8jd SHA collisions\n"
378 	       "    %8jd big-block underflows\n"
379 	       "    %8jd new dedup records\n"
380 	       "    %8jd new dedup bytes\n",
381 	       (intmax_t)dedup_crc_failures,
382 	       (intmax_t)dedup_sha_failures,
383                (intmax_t)dedup_underflows,
384                (intmax_t)dedup_successes_count,
385                (intmax_t)dedup_successes_bytes
386 	);
387 
388 	/* Once completed remove cycle file */
389 	hammer_reset_cycle();
390 
391 	/* We don't want to mess up with other directives */
392 	if (needfree) {
393 		free(tmp);
394 		CyclePath = NULL;
395 	}
396 }
397 
398 static
399 int
400 count_btree_elm(hammer_btree_leaf_elm_t scan_leaf __unused, int flags __unused)
401 {
402 	return(1);
403 }
404 
405 static
406 int
407 collect_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags __unused)
408 {
409 	struct sim_dedup_entry *sim_de;
410 
411 	/*
412 	 * If we are using too much memory we have to clean some out, which
413 	 * will cause the run to use multiple passes.  Be careful of integer
414 	 * overflows!
415 	 */
416 	if (MemoryUse > MemoryLimit) {
417 		DedupCrcEnd = DedupCrcStart +
418 			      (uint32_t)(DedupCrcEnd - DedupCrcStart - 1) / 2;
419 		if (VerboseOpt) {
420 			printf("memory limit  crc-range %08x-%08x\n",
421 				DedupCrcStart, DedupCrcEnd);
422 			fflush(stdout);
423 		}
424 		for (;;) {
425 			sim_de = RB_MAX(sim_dedup_entry_rb_tree,
426 					&sim_dedup_tree);
427 			if (sim_de == NULL || sim_de->crc < DedupCrcEnd)
428 				break;
429 			RB_REMOVE(sim_dedup_entry_rb_tree,
430 				  &sim_dedup_tree, sim_de);
431 			MemoryUse -= sizeof(*sim_de);
432 			free(sim_de);
433 		}
434 	}
435 
436 	/*
437 	 * Collect statistics based on the CRC only, do not try to read
438 	 * any data blocks or run SHA hashes.
439 	 */
440 	sim_de = RB_LOOKUP(sim_dedup_entry_rb_tree, &sim_dedup_tree,
441 			   scan_leaf->data_crc);
442 
443 	if (sim_de == NULL) {
444 		sim_de = calloc(1, sizeof(*sim_de));
445 		sim_de->crc = scan_leaf->data_crc;
446 		RB_INSERT(sim_dedup_entry_rb_tree, &sim_dedup_tree, sim_de);
447 		MemoryUse += sizeof(*sim_de);
448 	}
449 
450 	sim_de->ref_blks += 1;
451 	sim_de->ref_size += scan_leaf->data_len;
452 	return (1);
453 }
454 
455 static __inline
456 int
457 validate_dedup_pair(hammer_btree_leaf_elm_t p, hammer_btree_leaf_elm_t q)
458 {
459 	if (HAMMER_ZONE(p->data_offset) != HAMMER_ZONE(q->data_offset))
460 		return (1);
461 	if (p->data_len != q->data_len)
462 		return (1);
463 
464 	return (0);
465 }
466 
467 #define DEDUP_TECH_FAILURE	1
468 #define DEDUP_CMP_FAILURE	2
469 #define DEDUP_INVALID_ZONE	3
470 #define DEDUP_UNDERFLOW		4
471 #define DEDUP_VERS_FAILURE	5
472 
473 static __inline
474 int
475 deduplicate(hammer_btree_leaf_elm_t p, hammer_btree_leaf_elm_t q)
476 {
477 	struct hammer_ioc_dedup dedup;
478 
479 	bzero(&dedup, sizeof(dedup));
480 
481 	/*
482 	 * If data_offset fields are the same there is no need to run ioctl,
483 	 * candidate is already dedup'ed.
484 	 */
485 	if (p->data_offset == q->data_offset)
486 		return (0);
487 
488 	dedup.elm1 = p->base;
489 	dedup.elm2 = q->base;
490 	RunningIoctl = 1;
491 	if (ioctl(glob_fd, HAMMERIOC_DEDUP, &dedup) < 0) {
492 		if (errno == EOPNOTSUPP)
493 			return (DEDUP_VERS_FAILURE); /* must be at least version 5 */
494 		/* Technical failure - locking or w/e */
495 		return (DEDUP_TECH_FAILURE);
496 	}
497 	if (dedup.head.flags & HAMMER_IOC_DEDUP_CMP_FAILURE)
498 		return (DEDUP_CMP_FAILURE);
499 	if (dedup.head.flags & HAMMER_IOC_DEDUP_INVALID_ZONE)
500 		return (DEDUP_INVALID_ZONE);
501 	if (dedup.head.flags & HAMMER_IOC_DEDUP_UNDERFLOW)
502 		return (DEDUP_UNDERFLOW);
503 	RunningIoctl = 0;
504 	++dedup_successes_count;
505 	dedup_successes_bytes += p->data_len;
506 	return (0);
507 }
508 
509 static
510 int
511 process_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags)
512 {
513 	struct dedup_entry *de;
514 	struct sha_dedup_entry *sha_de, temp;
515 	struct pass2_dedup_entry *pass2_de;
516 	int error;
517 
518 	/*
519 	 * If we are using too much memory we have to clean some out, which
520 	 * will cause the run to use multiple passes.  Be careful of integer
521 	 * overflows!
522 	 */
523 	while (MemoryUse > MemoryLimit) {
524 		DedupCrcEnd = DedupCrcStart +
525 			      (uint32_t)(DedupCrcEnd - DedupCrcStart - 1) / 2;
526 		if (VerboseOpt) {
527 			printf("memory limit  crc-range %08x-%08x\n",
528 				DedupCrcStart, DedupCrcEnd);
529 			fflush(stdout);
530 		}
531 
532 		for (;;) {
533 			de = RB_MAX(dedup_entry_rb_tree, &dedup_tree);
534 			if (de == NULL || de->leaf.data_crc < DedupCrcEnd)
535 				break;
536 			if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
537 				while ((sha_de = RB_ROOT(&de->u.fict_root)) !=
538 				       NULL) {
539 					RB_REMOVE(sha_dedup_entry_rb_tree,
540 						  &de->u.fict_root, sha_de);
541 					MemoryUse -= sizeof(*sha_de);
542 					free(sha_de);
543 				}
544 			}
545 			RB_REMOVE(dedup_entry_rb_tree, &dedup_tree, de);
546 			MemoryUse -= sizeof(*de);
547 			free(de);
548 		}
549 	}
550 
551 	/*
552 	 * Collect statistics based on the CRC.  Colliding CRCs usually
553 	 * cause a SHA sub-tree to be created under the de.
554 	 *
555 	 * Trivial case if de not found.
556 	 */
557 	de = RB_LOOKUP(dedup_entry_rb_tree, &dedup_tree, scan_leaf->data_crc);
558 	if (de == NULL) {
559 		de = calloc(1, sizeof(*de));
560 		de->leaf = *scan_leaf;
561 		RB_INSERT(dedup_entry_rb_tree, &dedup_tree, de);
562 		MemoryUse += sizeof(*de);
563 		goto upgrade_stats;
564 	}
565 
566 	/*
567 	 * Found entry in CRC tree
568 	 */
569 	if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
570 		/*
571 		 * Optimize the case where a CRC failure results in multiple
572 		 * SHA entries.  If we unconditionally issue a data-read a
573 		 * degenerate situation where a colliding CRC's second SHA
574 		 * entry contains the lion's share of the deduplication
575 		 * candidates will result in excessive data block reads.
576 		 *
577 		 * Deal with the degenerate case by looking for a matching
578 		 * data_offset/data_len in the SHA elements we already have
579 		 * before reading the data block and generating a new SHA.
580 		 */
581 		RB_FOREACH(sha_de, sha_dedup_entry_rb_tree, &de->u.fict_root) {
582 			if (sha_de->leaf.data_offset ==
583 						scan_leaf->data_offset &&
584 			    sha_de->leaf.data_len == scan_leaf->data_len) {
585 				memcpy(temp.sha_hash, sha_de->sha_hash,
586 					SHA256_DIGEST_LENGTH);
587 				break;
588 			}
589 		}
590 
591 		/*
592 		 * Entry in CRC tree is fictitious, so we already had problems
593 		 * with this CRC. Upgrade (compute SHA) the candidate and
594 		 * dive into SHA subtree. If upgrade fails insert the candidate
595 		 * into Pass2 list (it will be processed later).
596 		 */
597 		if (sha_de == NULL) {
598 			if (upgrade_chksum(scan_leaf, temp.sha_hash))
599 				goto pass2_insert;
600 
601 			sha_de = RB_FIND(sha_dedup_entry_rb_tree,
602 					 &de->u.fict_root, &temp);
603 		}
604 
605 		/*
606 		 * Nothing in SHA subtree so far, so this is a new
607 		 * 'dataset'. Insert new entry into SHA subtree.
608 		 */
609 		if (sha_de == NULL) {
610 			sha_de = calloc(1, sizeof(*sha_de));
611 			sha_de->leaf = *scan_leaf;
612 			memcpy(sha_de->sha_hash, temp.sha_hash,
613 			       SHA256_DIGEST_LENGTH);
614 			RB_INSERT(sha_dedup_entry_rb_tree, &de->u.fict_root,
615 				  sha_de);
616 			MemoryUse += sizeof(*sha_de);
617 			goto upgrade_stats_sha;
618 		}
619 
620 		/*
621 		 * Found entry in SHA subtree, it means we have a potential
622 		 * dedup pair. Validate it (zones have to match and data_len
623 		 * field have to be the same too. If validation fails, treat
624 		 * it as a SHA collision (jump to sha256_failure).
625 		 */
626 		if (validate_dedup_pair(&sha_de->leaf, scan_leaf))
627 			goto sha256_failure;
628 
629 		/*
630 		 * We have a valid dedup pair (SHA match, validated).
631 		 *
632 		 * In case of technical failure (dedup pair was good, but
633 		 * ioctl failed anyways) insert the candidate into Pass2 list
634 		 * (we will try to dedup it after we are done with the rest of
635 		 * the tree).
636 		 *
637 		 * If ioctl fails because either of blocks is in the non-dedup
638 		 * zone (we can dedup only in LARGE_DATA and SMALL_DATA) don't
639 		 * bother with the candidate and terminate early.
640 		 *
641 		 * If ioctl fails because of big-block underflow replace the
642 		 * leaf node that found dedup entry represents with scan_leaf.
643 		 */
644 		error = deduplicate(&sha_de->leaf, scan_leaf);
645 		switch(error) {
646 		case 0:
647 			goto upgrade_stats_sha;
648 		case DEDUP_TECH_FAILURE:
649 			goto pass2_insert;
650 		case DEDUP_CMP_FAILURE:
651 			goto sha256_failure;
652 		case DEDUP_INVALID_ZONE:
653 			goto terminate_early;
654 		case DEDUP_UNDERFLOW:
655 			++dedup_underflows;
656 			sha_de->leaf = *scan_leaf;
657 			memcpy(sha_de->sha_hash, temp.sha_hash,
658 				SHA256_DIGEST_LENGTH);
659 			goto upgrade_stats_sha;
660 		case DEDUP_VERS_FAILURE:
661 			errx(1, "HAMMER filesystem must be at least "
662 				"version 5 to dedup");
663 			/* not reached */
664 		default:
665 			fprintf(stderr, "Unknown error\n");
666 			goto terminate_early;
667 		}
668 
669 		/*
670 		 * Ooh la la.. SHA-256 collision. Terminate early, there's
671 		 * nothing we can do here.
672 		 */
673 sha256_failure:
674 		++dedup_sha_failures;
675 		goto terminate_early;
676 	} else {
677 		/*
678 		 * Candidate CRC is good for now (we found an entry in CRC
679 		 * tree and it's not fictitious). This means we have a
680 		 * potential dedup pair.
681 		 */
682 		if (validate_dedup_pair(&de->leaf, scan_leaf))
683 			goto crc_failure;
684 
685 		/*
686 		 * We have a valid dedup pair (CRC match, validated)
687 		 */
688 		error = deduplicate(&de->leaf, scan_leaf);
689 		switch(error) {
690 		case 0:
691 			goto upgrade_stats;
692 		case DEDUP_TECH_FAILURE:
693 			goto pass2_insert;
694 		case DEDUP_CMP_FAILURE:
695 			goto crc_failure;
696 		case DEDUP_INVALID_ZONE:
697 			goto terminate_early;
698 		case DEDUP_UNDERFLOW:
699 			++dedup_underflows;
700 			de->leaf = *scan_leaf;
701 			goto upgrade_stats;
702 		case DEDUP_VERS_FAILURE:
703 			errx(1, "HAMMER filesystem must be at least "
704 				"version 5 to dedup");
705 			/* not reached */
706 		default:
707 			fprintf(stderr, "Unknown error\n");
708 			goto terminate_early;
709 		}
710 
711 crc_failure:
712 		/*
713 		 * We got a CRC collision - either ioctl failed because of
714 		 * the comparison failure or validation of the potential
715 		 * dedup pair went bad. In all cases insert both blocks
716 		 * into SHA subtree (this requires checksum upgrade) and mark
717 		 * entry that corresponds to this CRC in the CRC tree
718 		 * fictitious, so that all futher operations with this CRC go
719 		 * through SHA subtree.
720 		 */
721 		++dedup_crc_failures;
722 
723 		/*
724 		 * Insert block that was represented by now fictitious dedup
725 		 * entry (create a new SHA entry and preserve stats of the
726 		 * old CRC one). If checksum upgrade fails insert the
727 		 * candidate into Pass2 list and return - keep both trees
728 		 * unmodified.
729 		 */
730 		sha_de = calloc(1, sizeof(*sha_de));
731 		sha_de->leaf = de->leaf;
732 		sha_de->ref_blks = de->u.de.ref_blks;
733 		sha_de->ref_size = de->u.de.ref_size;
734 		if (upgrade_chksum(&sha_de->leaf, sha_de->sha_hash)) {
735 			free(sha_de);
736 			goto pass2_insert;
737 		}
738 		MemoryUse += sizeof(*sha_de);
739 
740 		RB_INIT(&de->u.fict_root);
741 		/*
742 		 * Here we can insert without prior checking because the tree
743 		 * is empty at this point
744 		 */
745 		RB_INSERT(sha_dedup_entry_rb_tree, &de->u.fict_root, sha_de);
746 
747 		/*
748 		 * Mark entry in CRC tree fictitious
749 		 */
750 		de->flags |= HAMMER_DEDUP_ENTRY_FICTITIOUS;
751 
752 		/*
753 		 * Upgrade checksum of the candidate and insert it into
754 		 * SHA subtree. If upgrade fails insert the candidate into
755 		 * Pass2 list.
756 		 */
757 		if (upgrade_chksum(scan_leaf, temp.sha_hash))
758 			goto pass2_insert;
759 		sha_de = RB_FIND(sha_dedup_entry_rb_tree, &de->u.fict_root,
760 				 &temp);
761 		if (sha_de != NULL) {
762 			/* There is an entry with this SHA already, but the only
763 			 * RB-tree element at this point is that entry we just
764 			 * added. We know for sure these blocks are different
765 			 * (this is crc_failure branch) so treat it as SHA
766 			 * collision.
767 			 */
768 			goto sha256_failure;
769 		}
770 
771 		sha_de = calloc(1, sizeof(*sha_de));
772 		sha_de->leaf = *scan_leaf;
773 		memcpy(sha_de->sha_hash, temp.sha_hash, SHA256_DIGEST_LENGTH);
774 		RB_INSERT(sha_dedup_entry_rb_tree, &de->u.fict_root, sha_de);
775 		MemoryUse += sizeof(*sha_de);
776 		goto upgrade_stats_sha;
777 	}
778 
779 upgrade_stats:
780 	de->u.de.ref_blks += 1;
781 	de->u.de.ref_size += scan_leaf->data_len;
782 	return (1);
783 
784 upgrade_stats_sha:
785 	sha_de->ref_blks += 1;
786 	sha_de->ref_size += scan_leaf->data_len;
787 	return (1);
788 
789 pass2_insert:
790 	/*
791 	 * If in pass2 mode don't insert anything, fall through to
792 	 * terminate_early
793 	 */
794 	if ((flags & DEDUP_PASS2) == 0) {
795 		pass2_de = calloc(1, sizeof(*pass2_de));
796 		pass2_de->leaf = *scan_leaf;
797 		STAILQ_INSERT_TAIL(&pass2_dedup_queue, pass2_de, sq_entry);
798 		dedup_skipped_size += scan_leaf->data_len;
799 		return (1);
800 	}
801 
802 terminate_early:
803 	/*
804 	 * Early termination path. Fixup stats.
805 	 */
806 	dedup_alloc_size += scan_leaf->data_len;
807 	dedup_ref_size += scan_leaf->data_len;
808 	return (0);
809 }
810 
811 static
812 int
813 upgrade_chksum(hammer_btree_leaf_elm_t leaf, uint8_t *sha_hash)
814 {
815 	struct hammer_ioc_data data;
816 	char *buf = malloc(DEDUP_BUF);
817 	SHA256_CTX ctx;
818 	int error;
819 
820 	bzero(&data, sizeof(data));
821 	data.elm = leaf->base;
822 	data.ubuf = buf;
823 	data.size = DEDUP_BUF;
824 
825 	error = 0;
826 	if (ioctl(glob_fd, HAMMERIOC_GET_DATA, &data) < 0) {
827 		fprintf(stderr, "Get-data failed: %s\n", strerror(errno));
828 		error = 1;
829 		goto done;
830 	}
831 	DedupDataReads += leaf->data_len;
832 
833 	if (data.leaf.data_len != leaf->data_len) {
834 		error = 1;
835 		goto done;
836 	}
837 
838 	if (data.leaf.base.btype == HAMMER_BTREE_TYPE_RECORD &&
839 	    data.leaf.base.rec_type == HAMMER_RECTYPE_DATA) {
840 		SHA256_Init(&ctx);
841 		SHA256_Update(&ctx, (void *)buf, data.leaf.data_len);
842 		SHA256_Final(sha_hash, &ctx);
843 	}
844 
845 done:
846 	free(buf);
847 	return (error);
848 }
849 
850 static
851 void
852 sigAlrm(int signo __unused)
853 {
854 	SigAlrmFlag = 1;
855 }
856 
857 static
858 void
859 sigInfo(int signo __unused)
860 {
861 	SigInfoFlag = 1;
862 }
863 
864 static
865 void
866 scan_pfs(char *filesystem, scan_pfs_cb_t func, const char *id)
867 {
868 	struct hammer_ioc_mirror_rw mirror;
869 	hammer_ioc_mrecord_any_t mrec;
870 	struct hammer_btree_leaf_elm elm;
871 	char *buf = malloc(DEDUP_BUF);
872 	char buf1[8];
873 	char buf2[8];
874 	int offset, bytes;
875 
876 	SigInfoFlag = 0;
877 	DedupDataReads = 0;
878 	DedupCurrentRecords = 0;
879 	signal(SIGINFO, sigInfo);
880 	signal(SIGALRM, sigAlrm);
881 
882 	/*
883 	 * Deduplication happens per element so hammer(8) is in full
884 	 * control of the ioctl()s to actually perform it. SIGALRM
885 	 * needs to be handled within hammer(8) but a checkpoint
886 	 * is needed for resuming. Use cycle file for that.
887 	 *
888 	 * Try to obtain the previous obj_id from the cycle file and
889 	 * if not available just start from the beginning.
890 	 */
891 	bzero(&mirror, sizeof(mirror));
892 	hammer_key_beg_init(&mirror.key_beg);
893 	hammer_get_cycle(&mirror.key_beg, &mirror.tid_beg);
894 
895 	if (mirror.key_beg.obj_id != (int64_t)HAMMER_MIN_OBJID) {
896 		if (VerboseOpt) {
897 			fprintf(stderr, "%s: mirror-read: Resuming at object %016jx\n",
898 			    id, (uintmax_t)mirror.key_beg.obj_id);
899 		}
900 	}
901 
902 	hammer_key_end_init(&mirror.key_end);
903 
904 	mirror.tid_beg = glob_pfs.ondisk->sync_beg_tid;
905 	mirror.tid_end = glob_pfs.ondisk->sync_end_tid;
906 	mirror.head.flags |= HAMMER_IOC_MIRROR_NODATA; /* we want only keys */
907 	mirror.ubuf = buf;
908 	mirror.size = DEDUP_BUF;
909 	mirror.pfs_id = glob_pfs.pfs_id;
910 	mirror.shared_uuid = glob_pfs.ondisk->shared_uuid;
911 
912 	if (VerboseOpt && DedupCrcStart == 0) {
913 		printf("%s %s: objspace %016jx:%04x %016jx:%04x\n",
914 			id, filesystem,
915 			(uintmax_t)mirror.key_beg.obj_id,
916 			mirror.key_beg.localization,
917 			(uintmax_t)mirror.key_end.obj_id,
918 			mirror.key_end.localization);
919 		printf("%s %s: pfs_id %d\n",
920 			id, filesystem, glob_pfs.pfs_id);
921 	}
922 	fflush(stdout);
923 	fflush(stderr);
924 
925 	do {
926 		mirror.count = 0;
927 		mirror.pfs_id = glob_pfs.pfs_id;
928 		mirror.shared_uuid = glob_pfs.ondisk->shared_uuid;
929 		if (ioctl(glob_fd, HAMMERIOC_MIRROR_READ, &mirror) < 0) {
930 			err(1, "Mirror-read %s failed", filesystem);
931 			/* not reached */
932 		}
933 		if (mirror.head.flags & HAMMER_IOC_HEAD_ERROR) {
934 			errx(1, "Mirror-read %s fatal error %d",
935 				filesystem, mirror.head.error);
936 			/* not reached */
937 		}
938 		if (mirror.count) {
939 			offset = 0;
940 			while (offset < mirror.count) {
941 				mrec = (void *)((char *)buf + offset);
942 				bytes = HAMMER_HEAD_DOALIGN(mrec->head.rec_size);
943 				if (offset + bytes > mirror.count) {
944 					errx(1, "Misaligned record");
945 					/* not reached */
946 				}
947 				assert((mrec->head.type &
948 				       HAMMER_MRECF_TYPE_MASK) ==
949 				       HAMMER_MREC_TYPE_REC);
950 				offset += bytes;
951 				elm = mrec->rec.leaf;
952 				if (elm.base.btype != HAMMER_BTREE_TYPE_RECORD)
953 					continue;
954 				if (elm.base.rec_type != HAMMER_RECTYPE_DATA)
955 					continue;
956 				++DedupCurrentRecords;
957 				if (DedupCrcStart != DedupCrcEnd) {
958 					if (elm.data_crc < DedupCrcStart)
959 						continue;
960 					if (DedupCrcEnd &&
961 					    elm.data_crc >= DedupCrcEnd) {
962 						continue;
963 					}
964 				}
965 				func(&elm, 0);
966 			}
967 		}
968 		mirror.key_beg = mirror.key_cur;
969 		if (DidInterrupt || SigAlrmFlag) {
970 			if (VerboseOpt) {
971 				fprintf(stderr, "%s\n",
972 				    (DidInterrupt ? "Interrupted" : "Timeout"));
973 			}
974 			hammer_set_cycle(&mirror.key_cur, mirror.tid_beg);
975 			if (VerboseOpt) {
976 				fprintf(stderr, "Cyclefile %s updated for "
977 				    "continuation\n", CyclePath);
978 			}
979 			exit(1);
980 		}
981 		if (SigInfoFlag) {
982 			if (DedupTotalRecords) {
983 				humanize_unsigned(buf1, sizeof(buf1),
984 						  DedupDataReads,
985 						  "B", 1024);
986 				humanize_unsigned(buf2, sizeof(buf2),
987 						  dedup_successes_bytes,
988 						  "B", 1024);
989 				fprintf(stderr, "%s count %7jd/%jd "
990 						"(%02d.%02d%%) "
991 						"ioread %s newddup %s\n",
992 					id,
993 					(intmax_t)DedupCurrentRecords,
994 					(intmax_t)DedupTotalRecords,
995 					(int)(DedupCurrentRecords * 100 /
996 						DedupTotalRecords),
997 					(int)(DedupCurrentRecords * 10000 /
998 						DedupTotalRecords % 100),
999 					buf1, buf2);
1000 			} else {
1001 				fprintf(stderr, "%s count %-7jd\n",
1002 					id,
1003 					(intmax_t)DedupCurrentRecords);
1004 			}
1005 			SigInfoFlag = 0;
1006 		}
1007 	} while (mirror.count != 0);
1008 
1009 	signal(SIGINFO, SIG_IGN);
1010 	signal(SIGALRM, SIG_IGN);
1011 
1012 	free(buf);
1013 }
1014 
1015 static
1016 void
1017 dump_simulated_dedup(void)
1018 {
1019 	struct sim_dedup_entry *sim_de;
1020 
1021 	printf("=== Dumping simulated dedup entries:\n");
1022 	RB_FOREACH(sim_de, sim_dedup_entry_rb_tree, &sim_dedup_tree) {
1023 		printf("\tcrc=%08x cnt=%ju size=%ju\n",
1024 			sim_de->crc,
1025 			(intmax_t)sim_de->ref_blks,
1026 			(intmax_t)sim_de->ref_size);
1027 	}
1028 	printf("end of dump ===\n");
1029 }
1030 
1031 static
1032 void
1033 dump_real_dedup(void)
1034 {
1035 	struct dedup_entry *de;
1036 	struct sha_dedup_entry *sha_de;
1037 	int i;
1038 
1039 	printf("=== Dumping dedup entries:\n");
1040 	RB_FOREACH(de, dedup_entry_rb_tree, &dedup_tree) {
1041 		if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
1042 			printf("\tcrc=%08x fictitious\n", de->leaf.data_crc);
1043 
1044 			RB_FOREACH(sha_de, sha_dedup_entry_rb_tree, &de->u.fict_root) {
1045 				printf("\t\tcrc=%08x cnt=%ju size=%ju\n\t"
1046 				       "\t\tsha=",
1047 				       sha_de->leaf.data_crc,
1048 				       (intmax_t)sha_de->ref_blks,
1049 				       (intmax_t)sha_de->ref_size);
1050 				for (i = 0; i < SHA256_DIGEST_LENGTH; ++i)
1051 					printf("%02x", sha_de->sha_hash[i]);
1052 				printf("\n");
1053 			}
1054 		} else {
1055 			printf("\tcrc=%08x cnt=%ju size=%ju\n",
1056 			       de->leaf.data_crc,
1057 			       (intmax_t)de->u.de.ref_blks,
1058 			       (intmax_t)de->u.de.ref_size);
1059 		}
1060 	}
1061 	printf("end of dump ===\n");
1062 }
1063 
1064 static
1065 void
1066 dedup_usage(int code)
1067 {
1068 	fprintf(stderr,
1069 		"hammer dedup-simulate <filesystem>\n"
1070 		"hammer dedup <filesystem>\n"
1071 	);
1072 	exit(code);
1073 }
1074