xref: /dragonfly/sbin/hammer/cmd_dedup.c (revision 31524921)
1 /*
2  * Copyright (c) 2010 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Ilya Dryomov <idryomov@gmail.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 
35 #include <libutil.h>
36 #include <crypto/sha2/sha2.h>
37 
38 #include "hammer.h"
39 
40 #define DEDUP_BUF (64 * 1024)
41 
42 /* Sorted list of block CRCs - light version for dedup-simulate */
43 struct sim_dedup_entry_rb_tree;
44 RB_HEAD(sim_dedup_entry_rb_tree, sim_dedup_entry) sim_dedup_tree =
45 					RB_INITIALIZER(&sim_dedup_tree);
46 RB_PROTOTYPE2(sim_dedup_entry_rb_tree, sim_dedup_entry, rb_entry,
47 		rb_sim_dedup_entry_compare, hammer_crc_t);
48 
49 struct sim_dedup_entry {
50 	hammer_crc_t	crc;
51 	u_int64_t	ref_blks; /* number of blocks referenced */
52 	u_int64_t	ref_size; /* size of data referenced */
53 	RB_ENTRY(sim_dedup_entry) rb_entry;
54 };
55 
56 struct dedup_entry {
57 	struct hammer_btree_leaf_elm leaf;
58 	union {
59 		struct {
60 			u_int64_t ref_blks;
61 			u_int64_t ref_size;
62 		} de;
63 		RB_HEAD(sha_dedup_entry_rb_tree, sha_dedup_entry) fict_root;
64 	} u;
65 	u_int8_t flags;
66 	RB_ENTRY(dedup_entry) rb_entry;
67 };
68 
69 #define HAMMER_DEDUP_ENTRY_FICTITIOUS	0x0001
70 
71 struct sha_dedup_entry {
72 	struct hammer_btree_leaf_elm	leaf;
73 	u_int64_t			ref_blks;
74 	u_int64_t			ref_size;
75 	u_int8_t			sha_hash[SHA256_DIGEST_LENGTH];
76 	RB_ENTRY(sha_dedup_entry)	fict_entry;
77 };
78 
79 /* Sorted list of HAMMER B-Tree keys */
80 struct dedup_entry_rb_tree;
81 struct sha_dedup_entry_rb_tree;
82 
83 RB_HEAD(dedup_entry_rb_tree, dedup_entry) dedup_tree =
84 					RB_INITIALIZER(&dedup_tree);
85 RB_PROTOTYPE2(dedup_entry_rb_tree, dedup_entry, rb_entry,
86 		rb_dedup_entry_compare, hammer_crc_t);
87 
88 RB_PROTOTYPE(sha_dedup_entry_rb_tree, sha_dedup_entry, fict_entry,
89 		rb_sha_dedup_entry_compare);
90 
91 /*
92  * Pass2 list - contains entries that were not dedup'ed because ioctl failed
93  */
94 STAILQ_HEAD(, pass2_dedup_entry) pass2_dedup_queue =
95 				STAILQ_HEAD_INITIALIZER(pass2_dedup_queue);
96 
97 struct pass2_dedup_entry {
98 	struct hammer_btree_leaf_elm	leaf;
99 	STAILQ_ENTRY(pass2_dedup_entry)	sq_entry;
100 };
101 
102 #define DEDUP_PASS2	0x0001 /* process_btree_elm() mode */
103 
104 static int SigInfoFlag;
105 static int SigAlrmFlag;
106 static int64_t DedupDataReads;
107 static int64_t DedupCurrentRecords;
108 static int64_t DedupTotalRecords;
109 static u_int32_t DedupCrcStart;
110 static u_int32_t DedupCrcEnd;
111 static u_int64_t MemoryUse;
112 
113 /* PFS global ids - we deal with just one PFS at a run */
114 int glob_fd;
115 struct hammer_ioc_pseudofs_rw glob_pfs;
116 
117 /*
118  * Global accounting variables
119  *
120  * Last three don't have to be 64-bit, just to be safe..
121  */
122 u_int64_t dedup_alloc_size = 0;
123 u_int64_t dedup_ref_size = 0;
124 u_int64_t dedup_skipped_size = 0;
125 u_int64_t dedup_crc_failures = 0;
126 u_int64_t dedup_sha_failures = 0;
127 u_int64_t dedup_underflows = 0;
128 u_int64_t dedup_successes_count = 0;
129 u_int64_t dedup_successes_bytes = 0;
130 
131 static int rb_sim_dedup_entry_compare(struct sim_dedup_entry *sim_de1,
132 				struct sim_dedup_entry *sim_de2);
133 static int rb_dedup_entry_compare(struct dedup_entry *de1,
134 				struct dedup_entry *de2);
135 static int rb_sha_dedup_entry_compare(struct sha_dedup_entry *sha_de1,
136 				struct sha_dedup_entry *sha_de2);
137 typedef int (*scan_pfs_cb_t)(hammer_btree_leaf_elm_t scan_leaf, int flags);
138 static void scan_pfs(char *filesystem, scan_pfs_cb_t func, const char *id);
139 static int collect_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags);
140 static int count_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags);
141 static int process_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags);
142 static int upgrade_chksum(hammer_btree_leaf_elm_t leaf, u_int8_t *sha_hash);
143 static void dump_simulated_dedup(void);
144 static void dump_real_dedup(void);
145 static void dedup_usage(int code);
146 
147 RB_GENERATE2(sim_dedup_entry_rb_tree, sim_dedup_entry, rb_entry,
148 		rb_sim_dedup_entry_compare, hammer_crc_t, crc);
149 RB_GENERATE2(dedup_entry_rb_tree, dedup_entry, rb_entry,
150 		rb_dedup_entry_compare, hammer_crc_t, leaf.data_crc);
151 RB_GENERATE(sha_dedup_entry_rb_tree, sha_dedup_entry, fict_entry,
152 		rb_sha_dedup_entry_compare);
153 
154 static int
155 rb_sim_dedup_entry_compare(struct sim_dedup_entry *sim_de1,
156 			struct sim_dedup_entry *sim_de2)
157 {
158 	if (sim_de1->crc < sim_de2->crc)
159 		return (-1);
160 	if (sim_de1->crc > sim_de2->crc)
161 		return (1);
162 
163 	return (0);
164 }
165 
166 static int
167 rb_dedup_entry_compare(struct dedup_entry *de1, struct dedup_entry *de2)
168 {
169 	if (de1->leaf.data_crc < de2->leaf.data_crc)
170 		return (-1);
171 	if (de1->leaf.data_crc > de2->leaf.data_crc)
172 		return (1);
173 
174 	return (0);
175 }
176 
177 static int
178 rb_sha_dedup_entry_compare(struct sha_dedup_entry *sha_de1,
179 			struct sha_dedup_entry *sha_de2)
180 {
181 	unsigned long *h1 = (unsigned long *)&sha_de1->sha_hash;
182 	unsigned long *h2 = (unsigned long *)&sha_de2->sha_hash;
183 	int i;
184 
185 	for (i = 0; i < SHA256_DIGEST_LENGTH / (int)sizeof(unsigned long); ++i) {
186 		if (h1[i] < h2[i])
187 			return (-1);
188 		if (h1[i] > h2[i])
189 			return (1);
190 	}
191 
192 	return (0);
193 }
194 
195 /*
196  * dedup-simulate <filesystem>
197  */
198 void
199 hammer_cmd_dedup_simulate(char **av, int ac)
200 {
201 	struct sim_dedup_entry *sim_de;
202 
203 	if (ac != 1)
204 		dedup_usage(1);
205 
206 	glob_fd = getpfs(&glob_pfs, av[0]);
207 
208 	/*
209 	 * Collection passes (memory limited)
210 	 */
211 	printf("Dedup-simulate running\n");
212 	do {
213 		DedupCrcStart = DedupCrcEnd;
214 		DedupCrcEnd = 0;
215 		MemoryUse = 0;
216 
217 		if (VerboseOpt) {
218 			printf("B-Tree pass  crc-range %08x-max\n",
219 				DedupCrcStart);
220 			fflush(stdout);
221 		}
222 		scan_pfs(av[0], collect_btree_elm, "simu-pass");
223 
224 		if (VerboseOpt >= 2)
225 			dump_simulated_dedup();
226 
227 		/*
228 		 * Calculate simulated dedup ratio and get rid of the tree
229 		 */
230 		while ((sim_de = RB_ROOT(&sim_dedup_tree)) != NULL) {
231 			assert(sim_de->ref_blks != 0);
232 			dedup_ref_size += sim_de->ref_size;
233 			dedup_alloc_size += sim_de->ref_size / sim_de->ref_blks;
234 
235 			RB_REMOVE(sim_dedup_entry_rb_tree, &sim_dedup_tree, sim_de);
236 			free(sim_de);
237 		}
238 		if (DedupCrcEnd && VerboseOpt == 0)
239 			printf(".");
240 	} while (DedupCrcEnd);
241 
242 	printf("Dedup-simulate %s succeeded\n", av[0]);
243 	relpfs(glob_fd, &glob_pfs);
244 
245 	printf("Simulated dedup ratio = %.2f\n",
246 	    (dedup_alloc_size != 0) ?
247 		(double)dedup_ref_size / dedup_alloc_size : 0);
248 }
249 
250 /*
251  * dedup <filesystem>
252  */
253 void
254 hammer_cmd_dedup(char **av, int ac)
255 {
256 	struct dedup_entry *de;
257 	struct sha_dedup_entry *sha_de;
258 	struct pass2_dedup_entry *pass2_de;
259 	char *tmp;
260 	char buf[8];
261 	int needfree = 0;
262 
263 	if (TimeoutOpt > 0)
264 		alarm(TimeoutOpt);
265 
266 	if (ac != 1)
267 		dedup_usage(1);
268 
269 	STAILQ_INIT(&pass2_dedup_queue);
270 
271 	glob_fd = getpfs(&glob_pfs, av[0]);
272 
273 	/*
274 	 * A cycle file is _required_ for resuming dedup after the timeout
275 	 * specified with -t has expired. If no -c option, then place a
276 	 * .dedup.cycle file either in the PFS snapshots directory or in
277 	 * the default snapshots directory.
278 	 */
279 	if (!CyclePath) {
280 		if (glob_pfs.ondisk->snapshots[0] != '/')
281 			asprintf(&tmp, "%s/%s/.dedup.cycle",
282 			    SNAPSHOTS_BASE, av[0]);
283 		else
284 			asprintf(&tmp, "%s/.dedup.cycle",
285 			    glob_pfs.ondisk->snapshots);
286 		CyclePath = tmp;
287 		needfree = 1;
288 	}
289 
290 	/*
291 	 * Pre-pass to cache the btree
292 	 */
293 	scan_pfs(av[0], count_btree_elm, "pre-pass ");
294 	DedupTotalRecords = DedupCurrentRecords;
295 
296 	/*
297 	 * Collection passes (memory limited)
298 	 */
299 	printf("Dedup running\n");
300 	do {
301 		DedupCrcStart = DedupCrcEnd;
302 		DedupCrcEnd = 0;
303 		MemoryUse = 0;
304 
305 		if (VerboseOpt) {
306 			printf("B-Tree pass  crc-range %08x-max\n",
307 				DedupCrcStart);
308 			fflush(stdout);
309 		}
310 		scan_pfs(av[0], process_btree_elm, "main-pass");
311 
312 		while ((pass2_de = STAILQ_FIRST(&pass2_dedup_queue)) != NULL) {
313 			if (process_btree_elm(&pass2_de->leaf, DEDUP_PASS2))
314 				dedup_skipped_size -= pass2_de->leaf.data_len;
315 
316 			STAILQ_REMOVE_HEAD(&pass2_dedup_queue, sq_entry);
317 			free(pass2_de);
318 		}
319 		assert(STAILQ_EMPTY(&pass2_dedup_queue));
320 
321 		if (VerboseOpt >= 2)
322 			dump_real_dedup();
323 
324 		/*
325 		 * Calculate dedup ratio and get rid of the trees
326 		 */
327 		while ((de = RB_ROOT(&dedup_tree)) != NULL) {
328 			if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
329 				while ((sha_de = RB_ROOT(&de->u.fict_root)) != NULL) {
330 					assert(sha_de->ref_blks != 0);
331 					dedup_ref_size += sha_de->ref_size;
332 					dedup_alloc_size += sha_de->ref_size / sha_de->ref_blks;
333 
334 					RB_REMOVE(sha_dedup_entry_rb_tree,
335 							&de->u.fict_root, sha_de);
336 					free(sha_de);
337 				}
338 				assert(RB_EMPTY(&de->u.fict_root));
339 			} else {
340 				assert(de->u.de.ref_blks != 0);
341 				dedup_ref_size += de->u.de.ref_size;
342 				dedup_alloc_size += de->u.de.ref_size / de->u.de.ref_blks;
343 			}
344 
345 			RB_REMOVE(dedup_entry_rb_tree, &dedup_tree, de);
346 			free(de);
347 		}
348 		assert(RB_EMPTY(&dedup_tree));
349 		if (DedupCrcEnd && VerboseOpt == 0)
350 			printf(".");
351 	} while (DedupCrcEnd);
352 
353 	printf("Dedup %s succeeded\n", av[0]);
354 	relpfs(glob_fd, &glob_pfs);
355 
356 	humanize_unsigned(buf, sizeof(buf), dedup_ref_size, "B", 1024);
357 	printf("Dedup ratio = %.2f (in this run)\n"
358 	       "    %8s referenced\n",
359 	       ((dedup_alloc_size != 0) ?
360 			(double)dedup_ref_size / dedup_alloc_size : 0),
361 	       buf
362 	);
363 	humanize_unsigned(buf, sizeof(buf), dedup_alloc_size, "B", 1024);
364 	printf("    %8s allocated\n", buf);
365 	humanize_unsigned(buf, sizeof(buf), dedup_skipped_size, "B", 1024);
366 	printf("    %8s skipped\n", buf);
367 	printf("    %8jd CRC collisions\n"
368 	       "    %8jd SHA collisions\n"
369 	       "    %8jd big-block underflows\n"
370 	       "    %8jd new dedup records\n"
371 	       "    %8jd new dedup bytes\n",
372 	       (intmax_t)dedup_crc_failures,
373 	       (intmax_t)dedup_sha_failures,
374                (intmax_t)dedup_underflows,
375                (intmax_t)dedup_successes_count,
376                (intmax_t)dedup_successes_bytes
377 	);
378 
379 	/* Once completed remove cycle file */
380 	hammer_reset_cycle();
381 
382 	/* We don't want to mess up with other directives */
383 	if (needfree) {
384 		free(tmp);
385 		CyclePath = NULL;
386 	}
387 }
388 
389 static int
390 count_btree_elm(hammer_btree_leaf_elm_t scan_leaf __unused, int flags __unused)
391 {
392 	return(1);
393 }
394 
395 static int
396 collect_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags __unused)
397 {
398 	struct sim_dedup_entry *sim_de;
399 
400 	/*
401 	 * If we are using too much memory we have to clean some out, which
402 	 * will cause the run to use multiple passes.  Be careful of integer
403 	 * overflows!
404 	 */
405 	if (MemoryUse > MemoryLimit) {
406 		DedupCrcEnd = DedupCrcStart +
407 			      (u_int32_t)(DedupCrcEnd - DedupCrcStart - 1) / 2;
408 		if (VerboseOpt) {
409 			printf("memory limit  crc-range %08x-%08x\n",
410 				DedupCrcStart, DedupCrcEnd);
411 			fflush(stdout);
412 		}
413 		for (;;) {
414 			sim_de = RB_MAX(sim_dedup_entry_rb_tree,
415 					&sim_dedup_tree);
416 			if (sim_de == NULL || sim_de->crc < DedupCrcEnd)
417 				break;
418 			RB_REMOVE(sim_dedup_entry_rb_tree,
419 				  &sim_dedup_tree, sim_de);
420 			MemoryUse -= sizeof(*sim_de);
421 			free(sim_de);
422 		}
423 	}
424 
425 	/*
426 	 * Collect statistics based on the CRC only, do not try to read
427 	 * any data blocks or run SHA hashes.
428 	 */
429 	sim_de = RB_LOOKUP(sim_dedup_entry_rb_tree, &sim_dedup_tree,
430 			   scan_leaf->data_crc);
431 
432 	if (sim_de == NULL) {
433 		sim_de = calloc(sizeof(*sim_de), 1);
434 		sim_de->crc = scan_leaf->data_crc;
435 		RB_INSERT(sim_dedup_entry_rb_tree, &sim_dedup_tree, sim_de);
436 		MemoryUse += sizeof(*sim_de);
437 	}
438 
439 	sim_de->ref_blks += 1;
440 	sim_de->ref_size += scan_leaf->data_len;
441 	return (1);
442 }
443 
444 static __inline int
445 validate_dedup_pair(hammer_btree_leaf_elm_t p, hammer_btree_leaf_elm_t q)
446 {
447 	if ((p->data_offset & HAMMER_OFF_ZONE_MASK) !=
448 	    (q->data_offset & HAMMER_OFF_ZONE_MASK)) {
449 		return (1);
450 	}
451 	if (p->data_len != q->data_len) {
452 		return (1);
453 	}
454 
455 	return (0);
456 }
457 
458 #define DEDUP_TECH_FAILURE	1
459 #define DEDUP_CMP_FAILURE	2
460 #define DEDUP_INVALID_ZONE	3
461 #define DEDUP_UNDERFLOW		4
462 #define DEDUP_VERS_FAILURE	5
463 
464 static __inline int
465 deduplicate(hammer_btree_leaf_elm_t p, hammer_btree_leaf_elm_t q)
466 {
467 	struct hammer_ioc_dedup dedup;
468 
469 	bzero(&dedup, sizeof(dedup));
470 
471 	/*
472 	 * If data_offset fields are the same there is no need to run ioctl,
473 	 * candidate is already dedup'ed.
474 	 */
475 	if (p->data_offset == q->data_offset) {
476 		return (0);
477 	}
478 
479 	dedup.elm1 = p->base;
480 	dedup.elm2 = q->base;
481 	RunningIoctl = 1;
482 	if (ioctl(glob_fd, HAMMERIOC_DEDUP, &dedup) < 0) {
483 		if (errno == EOPNOTSUPP) {
484 			/* must be at least version 5 */
485 			return (DEDUP_VERS_FAILURE);
486 		}
487 		/* Technical failure - locking or w/e */
488 		return (DEDUP_TECH_FAILURE);
489 	}
490 	if (dedup.head.flags & HAMMER_IOC_DEDUP_CMP_FAILURE) {
491 		return (DEDUP_CMP_FAILURE);
492 	}
493 	if (dedup.head.flags & HAMMER_IOC_DEDUP_INVALID_ZONE) {
494 		return (DEDUP_INVALID_ZONE);
495 	}
496 	if (dedup.head.flags & HAMMER_IOC_DEDUP_UNDERFLOW) {
497 		return (DEDUP_UNDERFLOW);
498 	}
499 	RunningIoctl = 0;
500 	++dedup_successes_count;
501 	dedup_successes_bytes += p->data_len;
502 	return (0);
503 }
504 
505 static int
506 process_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags)
507 {
508 	struct dedup_entry *de;
509 	struct sha_dedup_entry *sha_de, temp;
510 	struct pass2_dedup_entry *pass2_de;
511 	int error;
512 
513 	/*
514 	 * If we are using too much memory we have to clean some out, which
515 	 * will cause the run to use multiple passes.  Be careful of integer
516 	 * overflows!
517 	 */
518 	while (MemoryUse > MemoryLimit) {
519 		DedupCrcEnd = DedupCrcStart +
520 			      (u_int32_t)(DedupCrcEnd - DedupCrcStart - 1) / 2;
521 		if (VerboseOpt) {
522 			printf("memory limit  crc-range %08x-%08x\n",
523 				DedupCrcStart, DedupCrcEnd);
524 			fflush(stdout);
525 		}
526 
527 		for (;;) {
528 			de = RB_MAX(dedup_entry_rb_tree, &dedup_tree);
529 			if (de == NULL || de->leaf.data_crc < DedupCrcEnd)
530 				break;
531 			if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
532 				while ((sha_de = RB_ROOT(&de->u.fict_root)) !=
533 				       NULL) {
534 					RB_REMOVE(sha_dedup_entry_rb_tree,
535 						  &de->u.fict_root, sha_de);
536 					MemoryUse -= sizeof(*sha_de);
537 					free(sha_de);
538 				}
539 			}
540 			RB_REMOVE(dedup_entry_rb_tree, &dedup_tree, de);
541 			MemoryUse -= sizeof(*de);
542 			free(de);
543 		}
544 	}
545 
546 	/*
547 	 * Collect statistics based on the CRC.  Colliding CRCs usually
548 	 * cause a SHA sub-tree to be created under the de.
549 	 *
550 	 * Trivial case if de not found.
551 	 */
552 	de = RB_LOOKUP(dedup_entry_rb_tree, &dedup_tree, scan_leaf->data_crc);
553 	if (de == NULL) {
554 		de = calloc(sizeof(*de), 1);
555 		de->leaf = *scan_leaf;
556 		RB_INSERT(dedup_entry_rb_tree, &dedup_tree, de);
557 		MemoryUse += sizeof(*de);
558 		goto upgrade_stats;
559 	}
560 
561 	/*
562 	 * Found entry in CRC tree
563 	 */
564 	if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
565 		/*
566 		 * Optimize the case where a CRC failure results in multiple
567 		 * SHA entries.  If we unconditionally issue a data-read a
568 		 * degenerate situation where a colliding CRC's second SHA
569 		 * entry contains the lion's share of the deduplication
570 		 * candidates will result in excessive data block reads.
571 		 *
572 		 * Deal with the degenerate case by looking for a matching
573 		 * data_offset/data_len in the SHA elements we already have
574 		 * before reading the data block and generating a new SHA.
575 		 */
576 		RB_FOREACH(sha_de, sha_dedup_entry_rb_tree, &de->u.fict_root) {
577 			if (sha_de->leaf.data_offset ==
578 						scan_leaf->data_offset &&
579 			    sha_de->leaf.data_len == scan_leaf->data_len) {
580 				memcpy(temp.sha_hash, sha_de->sha_hash,
581 					SHA256_DIGEST_LENGTH);
582 				break;
583 			}
584 		}
585 
586 		/*
587 		 * Entry in CRC tree is fictitious, so we already had problems
588 		 * with this CRC. Upgrade (compute SHA) the candidate and
589 		 * dive into SHA subtree. If upgrade fails insert the candidate
590 		 * into Pass2 list (it will be processed later).
591 		 */
592 		if (sha_de == NULL) {
593 			if (upgrade_chksum(scan_leaf, temp.sha_hash))
594 				goto pass2_insert;
595 
596 			sha_de = RB_FIND(sha_dedup_entry_rb_tree,
597 					 &de->u.fict_root, &temp);
598 		}
599 
600 		/*
601 		 * Nothing in SHA subtree so far, so this is a new
602 		 * 'dataset'. Insert new entry into SHA subtree.
603 		 */
604 		if (sha_de == NULL) {
605 			sha_de = calloc(sizeof(*sha_de), 1);
606 			sha_de->leaf = *scan_leaf;
607 			memcpy(sha_de->sha_hash, temp.sha_hash,
608 			       SHA256_DIGEST_LENGTH);
609 			RB_INSERT(sha_dedup_entry_rb_tree, &de->u.fict_root,
610 				  sha_de);
611 			MemoryUse += sizeof(*sha_de);
612 			goto upgrade_stats_sha;
613 		}
614 
615 		/*
616 		 * Found entry in SHA subtree, it means we have a potential
617 		 * dedup pair. Validate it (zones have to match and data_len
618 		 * field have to be the same too. If validation fails, treat
619 		 * it as a SHA collision (jump to sha256_failure).
620 		 */
621 		if (validate_dedup_pair(&sha_de->leaf, scan_leaf))
622 			goto sha256_failure;
623 
624 		/*
625 		 * We have a valid dedup pair (SHA match, validated).
626 		 *
627 		 * In case of technical failure (dedup pair was good, but
628 		 * ioctl failed anyways) insert the candidate into Pass2 list
629 		 * (we will try to dedup it after we are done with the rest of
630 		 * the tree).
631 		 *
632 		 * If ioctl fails because either of blocks is in the non-dedup
633 		 * zone (we can dedup only in LARGE_DATA and SMALL_DATA) don't
634 		 * bother with the candidate and terminate early.
635 		 *
636 		 * If ioctl fails because of big-block underflow replace the
637 		 * leaf node that found dedup entry represents with scan_leaf.
638 		 */
639 		error = deduplicate(&sha_de->leaf, scan_leaf);
640 		switch(error) {
641 		case 0:
642 			goto upgrade_stats_sha;
643 		case DEDUP_TECH_FAILURE:
644 			goto pass2_insert;
645 		case DEDUP_CMP_FAILURE:
646 			goto sha256_failure;
647 		case DEDUP_INVALID_ZONE:
648 			goto terminate_early;
649 		case DEDUP_UNDERFLOW:
650 			++dedup_underflows;
651 			sha_de->leaf = *scan_leaf;
652 			memcpy(sha_de->sha_hash, temp.sha_hash,
653 				SHA256_DIGEST_LENGTH);
654 			goto upgrade_stats_sha;
655 		case DEDUP_VERS_FAILURE:
656 			fprintf(stderr,
657 				"HAMMER filesystem must be at least "
658 				"version 5 to dedup\n");
659 			exit (1);
660 		default:
661 			fprintf(stderr, "Unknown error\n");
662 			goto terminate_early;
663 		}
664 
665 		/*
666 		 * Ooh la la.. SHA-256 collision. Terminate early, there's
667 		 * nothing we can do here.
668 		 */
669 sha256_failure:
670 		++dedup_sha_failures;
671 		goto terminate_early;
672 	} else {
673 		/*
674 		 * Candidate CRC is good for now (we found an entry in CRC
675 		 * tree and it's not fictitious). This means we have a
676 		 * potential dedup pair.
677 		 */
678 		if (validate_dedup_pair(&de->leaf, scan_leaf))
679 			goto crc_failure;
680 
681 		/*
682 		 * We have a valid dedup pair (CRC match, validated)
683 		 */
684 		error = deduplicate(&de->leaf, scan_leaf);
685 		switch(error) {
686 		case 0:
687 			goto upgrade_stats;
688 		case DEDUP_TECH_FAILURE:
689 			goto pass2_insert;
690 		case DEDUP_CMP_FAILURE:
691 			goto crc_failure;
692 		case DEDUP_INVALID_ZONE:
693 			goto terminate_early;
694 		case DEDUP_UNDERFLOW:
695 			++dedup_underflows;
696 			de->leaf = *scan_leaf;
697 			goto upgrade_stats;
698 		case DEDUP_VERS_FAILURE:
699 			fprintf(stderr,
700 				"HAMMER filesystem must be at least "
701 				"version 5 to dedup\n");
702 			exit (1);
703 		default:
704 			fprintf(stderr, "Unknown error\n");
705 			goto terminate_early;
706 		}
707 
708 crc_failure:
709 		/*
710 		 * We got a CRC collision - either ioctl failed because of
711 		 * the comparison failure or validation of the potential
712 		 * dedup pair went bad. In all cases insert both blocks
713 		 * into SHA subtree (this requires checksum upgrade) and mark
714 		 * entry that corresponds to this CRC in the CRC tree
715 		 * fictitious, so that all futher operations with this CRC go
716 		 * through SHA subtree.
717 		 */
718 		++dedup_crc_failures;
719 
720 		/*
721 		 * Insert block that was represented by now fictitious dedup
722 		 * entry (create a new SHA entry and preserve stats of the
723 		 * old CRC one). If checksum upgrade fails insert the
724 		 * candidate into Pass2 list and return - keep both trees
725 		 * unmodified.
726 		 */
727 		sha_de = calloc(sizeof(*sha_de), 1);
728 		sha_de->leaf = de->leaf;
729 		sha_de->ref_blks = de->u.de.ref_blks;
730 		sha_de->ref_size = de->u.de.ref_size;
731 		if (upgrade_chksum(&sha_de->leaf, sha_de->sha_hash)) {
732 			free(sha_de);
733 			goto pass2_insert;
734 		}
735 		MemoryUse += sizeof(*sha_de);
736 
737 		RB_INIT(&de->u.fict_root);
738 		/*
739 		 * Here we can insert without prior checking because the tree
740 		 * is empty at this point
741 		 */
742 		RB_INSERT(sha_dedup_entry_rb_tree, &de->u.fict_root, sha_de);
743 
744 		/*
745 		 * Mark entry in CRC tree fictitious
746 		 */
747 		de->flags |= HAMMER_DEDUP_ENTRY_FICTITIOUS;
748 
749 		/*
750 		 * Upgrade checksum of the candidate and insert it into
751 		 * SHA subtree. If upgrade fails insert the candidate into
752 		 * Pass2 list.
753 		 */
754 		if (upgrade_chksum(scan_leaf, temp.sha_hash)) {
755 			goto pass2_insert;
756 		}
757 		sha_de = RB_FIND(sha_dedup_entry_rb_tree, &de->u.fict_root,
758 				 &temp);
759 		if (sha_de != NULL)
760 			/* There is an entry with this SHA already, but the only
761 			 * RB-tree element at this point is that entry we just
762 			 * added. We know for sure these blocks are different
763 			 * (this is crc_failure branch) so treat it as SHA
764 			 * collision.
765 			 */
766 			goto sha256_failure;
767 
768 		sha_de = calloc(sizeof(*sha_de), 1);
769 		sha_de->leaf = *scan_leaf;
770 		memcpy(sha_de->sha_hash, temp.sha_hash, SHA256_DIGEST_LENGTH);
771 		RB_INSERT(sha_dedup_entry_rb_tree, &de->u.fict_root, sha_de);
772 		MemoryUse += sizeof(*sha_de);
773 		goto upgrade_stats_sha;
774 	}
775 
776 upgrade_stats:
777 	de->u.de.ref_blks += 1;
778 	de->u.de.ref_size += scan_leaf->data_len;
779 	return (1);
780 
781 upgrade_stats_sha:
782 	sha_de->ref_blks += 1;
783 	sha_de->ref_size += scan_leaf->data_len;
784 	return (1);
785 
786 pass2_insert:
787 	/*
788 	 * If in pass2 mode don't insert anything, fall through to
789 	 * terminate_early
790 	 */
791 	if ((flags & DEDUP_PASS2) == 0) {
792 		pass2_de = calloc(sizeof(*pass2_de), 1);
793 		pass2_de->leaf = *scan_leaf;
794 		STAILQ_INSERT_TAIL(&pass2_dedup_queue, pass2_de, sq_entry);
795 		dedup_skipped_size += scan_leaf->data_len;
796 		return (1);
797 	}
798 
799 terminate_early:
800 	/*
801 	 * Early termination path. Fixup stats.
802 	 */
803 	dedup_alloc_size += scan_leaf->data_len;
804 	dedup_ref_size += scan_leaf->data_len;
805 	return (0);
806 }
807 
808 static int
809 upgrade_chksum(hammer_btree_leaf_elm_t leaf, u_int8_t *sha_hash)
810 {
811 	struct hammer_ioc_data data;
812 	char *buf = malloc(DEDUP_BUF);
813 	SHA256_CTX ctx;
814 	int error;
815 
816 	bzero(&data, sizeof(data));
817 	data.elm = leaf->base;
818 	data.ubuf = buf;
819 	data.size = DEDUP_BUF;
820 
821 	error = 0;
822 	if (ioctl(glob_fd, HAMMERIOC_GET_DATA, &data) < 0) {
823 		fprintf(stderr, "Get-data failed: %s\n", strerror(errno));
824 		error = 1;
825 		goto done;
826 	}
827 	DedupDataReads += leaf->data_len;
828 
829 	if (data.leaf.data_len != leaf->data_len) {
830 		error = 1;
831 		goto done;
832 	}
833 
834 	if (data.leaf.base.btype == HAMMER_BTREE_TYPE_RECORD &&
835 	    data.leaf.base.rec_type == HAMMER_RECTYPE_DATA) {
836 		SHA256_Init(&ctx);
837 		SHA256_Update(&ctx, (void *)buf, data.leaf.data_len);
838 		SHA256_Final(sha_hash, &ctx);
839 	}
840 
841 done:
842 	free(buf);
843 	return (error);
844 }
845 
846 static void
847 sigAlrm(int signo __unused)
848 {
849 	SigAlrmFlag = 1;
850 }
851 
852 static void
853 sigInfo(int signo __unused)
854 {
855 	SigInfoFlag = 1;
856 }
857 
858 static void
859 scan_pfs(char *filesystem, scan_pfs_cb_t func, const char *id)
860 {
861 	struct hammer_ioc_mirror_rw mirror;
862 	hammer_ioc_mrecord_any_t mrec;
863 	struct hammer_btree_leaf_elm elm;
864 	char *buf = malloc(DEDUP_BUF);
865 	char buf1[8];
866 	char buf2[8];
867 	int offset, bytes;
868 
869 	SigInfoFlag = 0;
870 	DedupDataReads = 0;
871 	DedupCurrentRecords = 0;
872 	signal(SIGINFO, sigInfo);
873 	signal(SIGALRM, sigAlrm);
874 
875 	/*
876 	 * Deduplication happens per element so hammer(8) is in full
877 	 * control of the ioctl()s to actually perform it. SIGALRM
878 	 * needs to be handled within hammer(8) but a checkpoint
879 	 * is needed for resuming. Use cycle file for that.
880 	 *
881 	 * Try to obtain the previous obj_id from the cycle file and
882 	 * if not available just start from the beginning.
883 	 */
884 	bzero(&mirror, sizeof(mirror));
885 	hammer_key_beg_init(&mirror.key_beg);
886 	hammer_get_cycle(&mirror.key_beg, &mirror.tid_beg);
887 
888 	if (mirror.key_beg.obj_id != (int64_t)HAMMER_MIN_OBJID) {
889 		if (VerboseOpt)
890 			fprintf(stderr, "%s: mirror-read: Resuming at object %016jx\n",
891 			    id, (uintmax_t)mirror.key_beg.obj_id);
892 	}
893 
894 	hammer_key_end_init(&mirror.key_end);
895 
896 	mirror.tid_beg = glob_pfs.ondisk->sync_beg_tid;
897 	mirror.tid_end = glob_pfs.ondisk->sync_end_tid;
898 	mirror.head.flags |= HAMMER_IOC_MIRROR_NODATA; /* we want only keys */
899 	mirror.ubuf = buf;
900 	mirror.size = DEDUP_BUF;
901 	mirror.pfs_id = glob_pfs.pfs_id;
902 	mirror.shared_uuid = glob_pfs.ondisk->shared_uuid;
903 
904 	if (VerboseOpt && DedupCrcStart == 0) {
905 		printf("%s %s: objspace %016jx:%04x %016jx:%04x\n",
906 			id, filesystem,
907 			(uintmax_t)mirror.key_beg.obj_id,
908 			mirror.key_beg.localization,
909 			(uintmax_t)mirror.key_end.obj_id,
910 			mirror.key_end.localization);
911 		printf("%s %s: pfs_id %d\n",
912 			id, filesystem, glob_pfs.pfs_id);
913 	}
914 	fflush(stdout);
915 	fflush(stderr);
916 
917 	do {
918 		mirror.count = 0;
919 		mirror.pfs_id = glob_pfs.pfs_id;
920 		mirror.shared_uuid = glob_pfs.ondisk->shared_uuid;
921 		if (ioctl(glob_fd, HAMMERIOC_MIRROR_READ, &mirror) < 0) {
922 			fprintf(stderr, "Mirror-read %s failed: %s\n",
923 				filesystem, strerror(errno));
924 			exit(1);
925 		}
926 		if (mirror.head.flags & HAMMER_IOC_HEAD_ERROR) {
927 			fprintf(stderr, "Mirror-read %s fatal error %d\n",
928 				filesystem, mirror.head.error);
929 			exit(1);
930 		}
931 		if (mirror.count) {
932 			offset = 0;
933 			while (offset < mirror.count) {
934 				mrec = (void *)((char *)buf + offset);
935 				bytes = HAMMER_HEAD_DOALIGN(mrec->head.rec_size);
936 				if (offset + bytes > mirror.count) {
937 					fprintf(stderr, "Misaligned record\n");
938 					exit(1);
939 				}
940 				assert((mrec->head.type &
941 				       HAMMER_MRECF_TYPE_MASK) ==
942 				       HAMMER_MREC_TYPE_REC);
943 				offset += bytes;
944 				elm = mrec->rec.leaf;
945 				if (elm.base.btype != HAMMER_BTREE_TYPE_RECORD)
946 					continue;
947 				if (elm.base.rec_type != HAMMER_RECTYPE_DATA)
948 					continue;
949 				++DedupCurrentRecords;
950 				if (DedupCrcStart != DedupCrcEnd) {
951 					if (elm.data_crc < DedupCrcStart)
952 						continue;
953 					if (DedupCrcEnd &&
954 					    elm.data_crc >= DedupCrcEnd) {
955 						continue;
956 					}
957 				}
958 				func(&elm, 0);
959 			}
960 		}
961 		mirror.key_beg = mirror.key_cur;
962 		if (DidInterrupt || SigAlrmFlag) {
963 			if (VerboseOpt)
964 				fprintf(stderr, "%s\n",
965 				    (DidInterrupt ? "Interrupted" : "Timeout"));
966 			hammer_set_cycle(&mirror.key_cur, mirror.tid_beg);
967 			if (VerboseOpt)
968 				fprintf(stderr, "Cyclefile %s updated for "
969 				    "continuation\n", CyclePath);
970 			exit(1);
971 		}
972 		if (SigInfoFlag) {
973 			if (DedupTotalRecords) {
974 				humanize_unsigned(buf1, sizeof(buf1),
975 						  DedupDataReads,
976 						  "B", 1024);
977 				humanize_unsigned(buf2, sizeof(buf2),
978 						  dedup_successes_bytes,
979 						  "B", 1024);
980 				fprintf(stderr, "%s count %7jd/%jd "
981 						"(%02d.%02d%%) "
982 						"ioread %s newddup %s\n",
983 					id,
984 					(intmax_t)DedupCurrentRecords,
985 					(intmax_t)DedupTotalRecords,
986 					(int)(DedupCurrentRecords * 100 /
987 						DedupTotalRecords),
988 					(int)(DedupCurrentRecords * 10000 /
989 						DedupTotalRecords % 100),
990 					buf1, buf2);
991 			} else {
992 				fprintf(stderr, "%s count %-7jd\n",
993 					id,
994 					(intmax_t)DedupCurrentRecords);
995 			}
996 			SigInfoFlag = 0;
997 		}
998 	} while (mirror.count != 0);
999 
1000 	signal(SIGINFO, SIG_IGN);
1001 	signal(SIGALRM, SIG_IGN);
1002 
1003 	free(buf);
1004 }
1005 
1006 static void
1007 dump_simulated_dedup(void)
1008 {
1009 	struct sim_dedup_entry *sim_de;
1010 
1011 	printf("=== Dumping simulated dedup entries:\n");
1012 	RB_FOREACH(sim_de, sim_dedup_entry_rb_tree, &sim_dedup_tree) {
1013 		printf("\tcrc=%08x cnt=%ju size=%ju\n",
1014 			sim_de->crc,
1015 			(intmax_t)sim_de->ref_blks,
1016 			(intmax_t)sim_de->ref_size);
1017 	}
1018 	printf("end of dump ===\n");
1019 }
1020 
1021 static void
1022 dump_real_dedup(void)
1023 {
1024 	struct dedup_entry *de;
1025 	struct sha_dedup_entry *sha_de;
1026 	int i;
1027 
1028 	printf("=== Dumping dedup entries:\n");
1029 	RB_FOREACH(de, dedup_entry_rb_tree, &dedup_tree) {
1030 		if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
1031 			printf("\tcrc=%08x fictitious\n", de->leaf.data_crc);
1032 
1033 			RB_FOREACH(sha_de, sha_dedup_entry_rb_tree, &de->u.fict_root) {
1034 				printf("\t\tcrc=%08x cnt=%ju size=%ju\n\t"
1035 				       "\t\tsha=",
1036 				       sha_de->leaf.data_crc,
1037 				       (intmax_t)sha_de->ref_blks,
1038 				       (intmax_t)sha_de->ref_size);
1039 				for (i = 0; i < SHA256_DIGEST_LENGTH; ++i)
1040 					printf("%02x", sha_de->sha_hash[i]);
1041 				printf("\n");
1042 			}
1043 		} else {
1044 			printf("\tcrc=%08x cnt=%ju size=%ju\n",
1045 			       de->leaf.data_crc,
1046 			       (intmax_t)de->u.de.ref_blks,
1047 			       (intmax_t)de->u.de.ref_size);
1048 		}
1049 	}
1050 	printf("end of dump ===\n");
1051 }
1052 
1053 static void
1054 dedup_usage(int code)
1055 {
1056 	fprintf(stderr,
1057 		"hammer dedup-simulate <filesystem>\n"
1058 		"hammer dedup <filesystem>\n"
1059 	);
1060 	exit(code);
1061 }
1062