xref: /dragonfly/sbin/hammer/cmd_dedup.c (revision 61c0377f)
1 /*
2  * Copyright (c) 2010 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Ilya Dryomov <idryomov@gmail.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 
35 #include "hammer.h"
36 #include <libutil.h>
37 #include <crypto/sha2/sha2.h>
38 
39 #define DEDUP_BUF (64 * 1024)
40 
41 /* Sorted list of block CRCs - light version for dedup-simulate */
42 struct sim_dedup_entry_rb_tree;
43 RB_HEAD(sim_dedup_entry_rb_tree, sim_dedup_entry) sim_dedup_tree =
44 					RB_INITIALIZER(&sim_dedup_tree);
45 RB_PROTOTYPE2(sim_dedup_entry_rb_tree, sim_dedup_entry, rb_entry,
46 		rb_sim_dedup_entry_compare, hammer_crc_t);
47 
48 struct sim_dedup_entry {
49 	hammer_crc_t	crc;
50 	u_int64_t	ref_blks; /* number of blocks referenced */
51 	u_int64_t	ref_size; /* size of data referenced */
52 	RB_ENTRY(sim_dedup_entry) rb_entry;
53 };
54 
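/*
 * Dedup entry, keyed by data CRC.  Initially u.de accumulates reference
 * counts directly; once a CRC collision is detected the entry is marked
 * HAMMER_DEDUP_ENTRY_FICTITIOUS and u.fict_root carries a SHA-256 keyed
 * sub-tree of sha_dedup_entry's instead (see process_btree_elm()).
 */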
55 struct dedup_entry {
56 	struct hammer_btree_leaf_elm leaf;
57 	union {
58 		struct {
59 			u_int64_t ref_blks;
60 			u_int64_t ref_size;
61 		} de;
62 		RB_HEAD(sha_dedup_entry_rb_tree, sha_dedup_entry) fict_root;
63 	} u;
64 	u_int8_t flags;
65 	RB_ENTRY(dedup_entry) rb_entry;
66 };
67 
68 #define HAMMER_DEDUP_ENTRY_FICTITIOUS	0x0001
69 
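/* Per-SHA-256 entry hanging off a fictitious dedup_entry */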
70 struct sha_dedup_entry {
71 	struct hammer_btree_leaf_elm	leaf;
72 	u_int64_t			ref_blks;
73 	u_int64_t			ref_size;
74 	u_int8_t			sha_hash[SHA256_DIGEST_LENGTH];
75 	RB_ENTRY(sha_dedup_entry)	fict_entry;
76 };
77 
78 /* Sorted list of HAMMER B-Tree keys */
79 struct dedup_entry_rb_tree;
80 struct sha_dedup_entry_rb_tree;
81 
82 RB_HEAD(dedup_entry_rb_tree, dedup_entry) dedup_tree =
83 					RB_INITIALIZER(&dedup_tree);
84 RB_PROTOTYPE2(dedup_entry_rb_tree, dedup_entry, rb_entry,
85 		rb_dedup_entry_compare, hammer_crc_t);
86 
87 RB_PROTOTYPE(sha_dedup_entry_rb_tree, sha_dedup_entry, fict_entry,
88 		rb_sha_dedup_entry_compare);
89 
90 /*
91  * Pass2 list - contains entries that were not dedup'ed because ioctl failed
92  */
93 STAILQ_HEAD(, pass2_dedup_entry) pass2_dedup_queue =
94 				STAILQ_HEAD_INITIALIZER(pass2_dedup_queue);
95 
96 struct pass2_dedup_entry {
97 	struct hammer_btree_leaf_elm	leaf;
98 	STAILQ_ENTRY(pass2_dedup_entry)	sq_entry;
99 };
100 
101 #define DEDUP_PASS2	0x0001 /* process_btree_elm() mode */
102 
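/* Progress reporting and memory-limited pass state */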
103 static int SigInfoFlag;
104 static int64_t DedupDataReads;
105 static int64_t DedupCurrentRecords;
106 static int64_t DedupTotalRecords;
107 static u_int32_t DedupCrcStart;
108 static u_int32_t DedupCrcEnd;
109 static u_int64_t MemoryUse;
110 
111 /* PFS globals - we deal with just one PFS per run */
112 int glob_fd;
113 struct hammer_ioc_pseudofs_rw glob_pfs;
114 
115 /*
116  * Global accounting variables
117  *
118  * The last three don't have to be 64-bit; they are just to be safe.
119  */
120 u_int64_t dedup_alloc_size = 0;
121 u_int64_t dedup_ref_size = 0;
122 u_int64_t dedup_skipped_size = 0;
123 u_int64_t dedup_crc_failures = 0;
124 u_int64_t dedup_sha_failures = 0;
125 u_int64_t dedup_underflows = 0;
126 u_int64_t dedup_successes_count = 0;
127 u_int64_t dedup_successes_bytes = 0;
128 
129 static int rb_sim_dedup_entry_compare(struct sim_dedup_entry *sim_de1,
130 				struct sim_dedup_entry *sim_de2);
131 static int rb_dedup_entry_compare(struct dedup_entry *de1,
132 				struct dedup_entry *de2);
133 static int rb_sha_dedup_entry_compare(struct sha_dedup_entry *sha_de1,
134 				struct sha_dedup_entry *sha_de2);
135 typedef int (*scan_pfs_cb_t)(hammer_btree_leaf_elm_t scan_leaf, int flags);
136 static void scan_pfs(char *filesystem, scan_pfs_cb_t func, const char *id);
137 static int collect_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags);
138 static int count_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags);
139 static int process_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags);
140 static int upgrade_chksum(hammer_btree_leaf_elm_t leaf, u_int8_t *sha_hash);
141 static void dump_simulated_dedup(void);
142 static void dump_real_dedup(void);
143 static void dedup_usage(int code);
144 
145 RB_GENERATE2(sim_dedup_entry_rb_tree, sim_dedup_entry, rb_entry,
146 		rb_sim_dedup_entry_compare, hammer_crc_t, crc);
147 RB_GENERATE2(dedup_entry_rb_tree, dedup_entry, rb_entry,
148 		rb_dedup_entry_compare, hammer_crc_t, leaf.data_crc);
149 RB_GENERATE(sha_dedup_entry_rb_tree, sha_dedup_entry, fict_entry,
150 		rb_sha_dedup_entry_compare);
151 
152 static int
153 rb_sim_dedup_entry_compare(struct sim_dedup_entry *sim_de1,
154 			struct sim_dedup_entry *sim_de2)
155 {
156 	if (sim_de1->crc < sim_de2->crc)
157 		return (-1);
158 	if (sim_de1->crc > sim_de2->crc)
159 		return (1);
160 
161 	return (0);
162 }
163 
164 static int
165 rb_dedup_entry_compare(struct dedup_entry *de1, struct dedup_entry *de2)
166 {
167 	if (de1->leaf.data_crc < de2->leaf.data_crc)
168 		return (-1);
169 	if (de1->leaf.data_crc > de2->leaf.data_crc)
170 		return (1);
171 
172 	return (0);
173 }
174 
175 static int
176 rb_sha_dedup_entry_compare(struct sha_dedup_entry *sha_de1,
177 			struct sha_dedup_entry *sha_de2)
178 {
179 	unsigned long *h1 = (unsigned long *)&sha_de1->sha_hash;
180 	unsigned long *h2 = (unsigned long *)&sha_de2->sha_hash;
181 	int i;
182 
183 	for (i = 0; i < SHA256_DIGEST_LENGTH / (int)sizeof(unsigned long); ++i) {
184 		if (h1[i] < h2[i])
185 			return (-1);
186 		if (h1[i] > h2[i])
187 			return (1);
188 	}
189 
190 	return (0);
191 }
192 
193 /*
194  * dedup-simulate <filesystem>
195  */
196 void
197 hammer_cmd_dedup_simulate(char **av, int ac)
198 {
199 	struct sim_dedup_entry *sim_de;
200 
201 	if (ac != 1)
202 		dedup_usage(1);
203 
204 	glob_fd = getpfs(&glob_pfs, av[0]);
205 
206 	/*
207 	 * Collection passes (memory limited)
208 	 */
209 	printf("Dedup-simulate running\n");
210 	do {
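		/*
		 * Each pass covers the CRC range [DedupCrcStart, DedupCrcEnd),
		 * with DedupCrcEnd == 0 meaning "no upper bound".  The collect
		 * callback lowers DedupCrcEnd when MemoryLimit is exceeded and
		 * the loop repeats until a pass completes with it still 0.
		 */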
211 		DedupCrcStart = DedupCrcEnd;
212 		DedupCrcEnd = 0;
213 		MemoryUse = 0;
214 
215 		if (VerboseOpt) {
216 			printf("b-tree pass  crc-range %08x-max\n",
217 				DedupCrcStart);
218 			fflush(stdout);
219 		}
220 		scan_pfs(av[0], collect_btree_elm, "simu-pass");
221 
222 		if (VerboseOpt >= 2)
223 			dump_simulated_dedup();
224 
225 		/*
226 		 * Calculate simulated dedup ratio and get rid of the tree
227 		 */
228 		while ((sim_de = RB_ROOT(&sim_dedup_tree)) != NULL) {
229 			assert(sim_de->ref_blks != 0);
230 			dedup_ref_size += sim_de->ref_size;
231 			dedup_alloc_size += sim_de->ref_size / sim_de->ref_blks;
232 
233 			RB_REMOVE(sim_dedup_entry_rb_tree, &sim_dedup_tree, sim_de);
234 			free(sim_de);
235 		}
236 		if (DedupCrcEnd && VerboseOpt == 0)
237 			printf(".");
238 	} while (DedupCrcEnd);
239 
240 	printf("Dedup-simulate %s succeeded\n", av[0]);
241 	relpfs(glob_fd, &glob_pfs);
242 
243 	printf("Simulated dedup ratio = %.2f\n",
244 	    (dedup_alloc_size != 0) ?
245 		(double)dedup_ref_size / dedup_alloc_size : 0);
246 }
247 
248 /*
249  * dedup <filesystem>
250  */
251 void
252 hammer_cmd_dedup(char **av, int ac)
253 {
254 	struct dedup_entry *de;
255 	struct sha_dedup_entry *sha_de;
256 	struct pass2_dedup_entry *pass2_de;
257 	char buf[8];
258 
259 	if (TimeoutOpt > 0)
260 		alarm(TimeoutOpt);
261 
262 	if (ac != 1)
263 		dedup_usage(1);
264 
265 	STAILQ_INIT(&pass2_dedup_queue);
266 
267 	glob_fd = getpfs(&glob_pfs, av[0]);
268 
269 	/*
270 	 * Pre-pass to cache the btree
271 	 */
272 	scan_pfs(av[0], count_btree_elm, "pre-pass ");
273 	DedupTotalRecords = DedupCurrentRecords;
274 
275 	/*
276 	 * Collection passes (memory limited)
277 	 */
278 	printf("Dedup running\n");
279 	do {
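		/*
		 * Same CRC-windowed, memory-limited pass structure as in
		 * hammer_cmd_dedup_simulate() above.
		 */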
280 		DedupCrcStart = DedupCrcEnd;
281 		DedupCrcEnd = 0;
282 		MemoryUse = 0;
283 
284 		if (VerboseOpt) {
285 			printf("b-tree pass  crc-range %08x-max\n",
286 				DedupCrcStart);
287 			fflush(stdout);
288 		}
289 		scan_pfs(av[0], process_btree_elm, "main-pass");
290 
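		/*
		 * Retry records that previously failed only for technical
		 * reasons.  DEDUP_PASS2 prevents them from being re-queued;
		 * a successful retry takes the bytes back out of the
		 * "skipped" total.
		 */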
291 		while ((pass2_de = STAILQ_FIRST(&pass2_dedup_queue)) != NULL) {
292 			if (process_btree_elm(&pass2_de->leaf, DEDUP_PASS2))
293 				dedup_skipped_size -= pass2_de->leaf.data_len;
294 
295 			STAILQ_REMOVE_HEAD(&pass2_dedup_queue, sq_entry);
296 			free(pass2_de);
297 		}
298 		assert(STAILQ_EMPTY(&pass2_dedup_queue));
299 
300 		if (VerboseOpt >= 2)
301 			dump_real_dedup();
302 
303 		/*
304 		 * Calculate dedup ratio and get rid of the trees
305 		 */
306 		while ((de = RB_ROOT(&dedup_tree)) != NULL) {
307 			if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
308 				while ((sha_de = RB_ROOT(&de->u.fict_root)) != NULL) {
309 					assert(sha_de->ref_blks != 0);
310 					dedup_ref_size += sha_de->ref_size;
311 					dedup_alloc_size += sha_de->ref_size / sha_de->ref_blks;
312 
313 					RB_REMOVE(sha_dedup_entry_rb_tree,
314 							&de->u.fict_root, sha_de);
315 					free(sha_de);
316 				}
317 				assert(RB_EMPTY(&de->u.fict_root));
318 			} else {
319 				assert(de->u.de.ref_blks != 0);
320 				dedup_ref_size += de->u.de.ref_size;
321 				dedup_alloc_size += de->u.de.ref_size / de->u.de.ref_blks;
322 			}
323 
324 			RB_REMOVE(dedup_entry_rb_tree, &dedup_tree, de);
325 			free(de);
326 		}
327 		assert(RB_EMPTY(&dedup_tree));
328 		if (DedupCrcEnd && VerboseOpt == 0)
329 			printf(".");
330 	} while (DedupCrcEnd);
331 
332 	printf("Dedup %s succeeded\n", av[0]);
333 	relpfs(glob_fd, &glob_pfs);
334 
335 	humanize_unsigned(buf, sizeof(buf), dedup_ref_size, "B", 1024);
336 	printf("Dedup ratio = %.2f\n"
337 	       "    %8s referenced\n",
338 	       ((dedup_alloc_size != 0) ?
339 			(double)dedup_ref_size / dedup_alloc_size : 0),
340 	       buf
341 	);
342 	humanize_unsigned(buf, sizeof(buf), dedup_alloc_size, "B", 1024);
343 	printf("    %8s allocated\n", buf);
344 	humanize_unsigned(buf, sizeof(buf), dedup_skipped_size, "B", 1024);
345 	printf("    %8s skipped\n", buf);
346 	printf("    %8jd CRC collisions\n"
347 	       "    %8jd SHA collisions\n"
348 	       "    %8jd bigblock underflows\n"
349 	       "    %8jd new dedup records\n"
350 	       "    %8jd new dedup bytes\n",
351 	       (intmax_t)dedup_crc_failures,
352 	       (intmax_t)dedup_sha_failures,
353 	       (intmax_t)dedup_underflows,
354 	       (intmax_t)dedup_successes_count,
355 	       (intmax_t)dedup_successes_bytes
356 	);
357 }
358 
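/*
 * Pre-pass callback: intentionally a no-op per element.  The pre-pass walk
 * itself warms up the B-Tree while scan_pfs() counts the total number of
 * data records (DedupCurrentRecords) for later progress reporting.
 */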
359 static int
360 count_btree_elm(hammer_btree_leaf_elm_t scan_leaf __unused, int flags __unused)
361 {
362 	return(1);
363 }
364 
365 static int
366 collect_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags __unused)
367 {
368 	struct sim_dedup_entry *sim_de;
369 
370 	/*
371 	 * If we are using too much memory we have to clean some out, which
372 	 * will cause the run to use multiple passes.  Be careful of integer
373 	 * overflows!
374 	 */
375 	if (MemoryUse > MemoryLimit) {
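		/*
		 * Halve the remaining CRC window (DedupCrcEnd == 0 acts as
		 * 2^32 here thanks to unsigned wrap-around) and discard all
		 * entries at or above the new upper bound; a later pass will
		 * pick them up again.
		 */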
376 		DedupCrcEnd = DedupCrcStart +
377 			      (u_int32_t)(DedupCrcEnd - DedupCrcStart - 1) / 2;
378 		if (VerboseOpt) {
379 			printf("memory limit  crc-range %08x-%08x\n",
380 				DedupCrcStart, DedupCrcEnd);
381 			fflush(stdout);
382 		}
383 		for (;;) {
384 			sim_de = RB_MAX(sim_dedup_entry_rb_tree,
385 					&sim_dedup_tree);
386 			if (sim_de == NULL || sim_de->crc < DedupCrcEnd)
387 				break;
388 			RB_REMOVE(sim_dedup_entry_rb_tree,
389 				  &sim_dedup_tree, sim_de);
390 			MemoryUse -= sizeof(*sim_de);
391 			free(sim_de);
392 		}
393 	}
394 
395 	/*
396 	 * Collect statistics based on the CRC only, do not try to read
397 	 * any data blocks or run SHA hashes.
398 	 */
399 	sim_de = RB_LOOKUP(sim_dedup_entry_rb_tree, &sim_dedup_tree,
400 			   scan_leaf->data_crc);
401 
402 	if (sim_de == NULL) {
403 		sim_de = calloc(sizeof(*sim_de), 1);
404 		sim_de->crc = scan_leaf->data_crc;
405 		RB_INSERT(sim_dedup_entry_rb_tree, &sim_dedup_tree, sim_de);
406 		MemoryUse += sizeof(*sim_de);
407 	}
408 
409 	sim_de->ref_blks += 1;
410 	sim_de->ref_size += scan_leaf->data_len;
411 	return (1);
412 }
413 
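/*
 * A pair of leaves is a dedup candidate only if both data blocks live in
 * the same blockmap zone and have the same length.  Returns 0 if the pair
 * is acceptable, non-zero otherwise.
 */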
414 static __inline int
415 validate_dedup_pair(hammer_btree_leaf_elm_t p, hammer_btree_leaf_elm_t q)
416 {
417 	if ((p->data_offset & HAMMER_OFF_ZONE_MASK) !=
418 	    (q->data_offset & HAMMER_OFF_ZONE_MASK)) {
419 		return (1);
420 	}
421 	if (p->data_len != q->data_len) {
422 		return (1);
423 	}
424 
425 	return (0);
426 }
427 
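/* Failure codes returned by deduplicate(), handled in process_btree_elm() */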
428 #define DEDUP_TECH_FAILURE	1
429 #define DEDUP_CMP_FAILURE	2
430 #define DEDUP_INVALID_ZONE	3
431 #define DEDUP_UNDERFLOW		4
432 #define DEDUP_VERS_FAILURE	5
433 
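/*
 * Ask the kernel to deduplicate the pair via the HAMMERIOC_DEDUP ioctl and
 * map the outcome onto the DEDUP_* codes above.  Returns 0 on success, and
 * also when both leaves already point at the same data_offset.
 */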
434 static __inline int
435 deduplicate(hammer_btree_leaf_elm_t p, hammer_btree_leaf_elm_t q)
436 {
437 	struct hammer_ioc_dedup dedup;
438 
439 	bzero(&dedup, sizeof(dedup));
440 
441 	/*
442 	 * If the data_offset fields are the same there is no need to run the
443 	 * ioctl - the candidate is already dedup'ed.
444 	 */
445 	if (p->data_offset == q->data_offset) {
446 		return (0);
447 	}
448 
449 	dedup.elm1 = p->base;
450 	dedup.elm2 = q->base;
451 	RunningIoctl = 1;
452 	if (ioctl(glob_fd, HAMMERIOC_DEDUP, &dedup) < 0) {
453 		if (errno == EOPNOTSUPP) {
454 			/* must be at least version 5 */
455 			return (DEDUP_VERS_FAILURE);
456 		}
457 		/* Technical failure - locking or w/e */
458 		return (DEDUP_TECH_FAILURE);
459 	}
460 	if (dedup.head.flags & HAMMER_IOC_DEDUP_CMP_FAILURE) {
461 		return (DEDUP_CMP_FAILURE);
462 	}
463 	if (dedup.head.flags & HAMMER_IOC_DEDUP_INVALID_ZONE) {
464 		return (DEDUP_INVALID_ZONE);
465 	}
466 	if (dedup.head.flags & HAMMER_IOC_DEDUP_UNDERFLOW) {
467 		return (DEDUP_UNDERFLOW);
468 	}
469 	RunningIoctl = 0;
470 	++dedup_successes_count;
471 	dedup_successes_bytes += p->data_len;
472 	return (0);
473 }
474 
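/*
 * Main dedup callback, called for every data record found by scan_pfs().
 * Records are first matched by CRC; a CRC collision promotes the entry to a
 * SHA-256 keyed sub-tree.  Validated matches are handed to deduplicate() and
 * records that fail only for technical reasons (e.g. a failed ioctl or data
 * read) go onto the Pass2 queue.
 */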
475 static int
476 process_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags)
477 {
478 	struct dedup_entry *de;
479 	struct sha_dedup_entry *sha_de, temp;
480 	struct pass2_dedup_entry *pass2_de;
481 	int error;
482 
483 	/*
484 	 * If we are using too much memory we have to clean some out, which
485 	 * will cause the run to use multiple passes.  Be careful of integer
486 	 * overflows!
487 	 */
488 	while (MemoryUse > MemoryLimit) {
489 		DedupCrcEnd = DedupCrcStart +
490 			      (u_int32_t)(DedupCrcEnd - DedupCrcStart - 1) / 2;
491 		if (VerboseOpt) {
492 			printf("memory limit  crc-range %08x-%08x\n",
493 				DedupCrcStart, DedupCrcEnd);
494 			fflush(stdout);
495 		}
496 
497 		for (;;) {
498 			de = RB_MAX(dedup_entry_rb_tree, &dedup_tree);
499 			if (de == NULL || de->leaf.data_crc < DedupCrcEnd)
500 				break;
501 			if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
502 				while ((sha_de = RB_ROOT(&de->u.fict_root)) !=
503 				       NULL) {
504 					RB_REMOVE(sha_dedup_entry_rb_tree,
505 						  &de->u.fict_root, sha_de);
506 					MemoryUse -= sizeof(*sha_de);
507 					free(sha_de);
508 				}
509 			}
510 			RB_REMOVE(dedup_entry_rb_tree, &dedup_tree, de);
511 			MemoryUse -= sizeof(*de);
512 			free(de);
513 		}
514 	}
515 
516 	/*
517 	 * Collect statistics based on the CRC.  Colliding CRCs usually
518 	 * cause a SHA sub-tree to be created under the de.
519 	 *
520 	 * Trivial case if de not found.
521 	 */
522 	de = RB_LOOKUP(dedup_entry_rb_tree, &dedup_tree, scan_leaf->data_crc);
523 	if (de == NULL) {
524 		de = calloc(sizeof(*de), 1);
525 		de->leaf = *scan_leaf;
526 		RB_INSERT(dedup_entry_rb_tree, &dedup_tree, de);
527 		MemoryUse += sizeof(*de);
528 		goto upgrade_stats;
529 	}
530 
531 	/*
532 	 * Found entry in CRC tree
533 	 */
534 	if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
535 		/*
536 		 * Optimize the case where a CRC failure results in multiple
537 		 * SHA entries.  If we unconditionally issue a data-read, a
538 		 * degenerate situation where a colliding CRC's second SHA
539 		 * entry contains the lion's share of the deduplication
540 		 * candidates will result in excessive data block reads.
541 		 *
542 		 * Deal with the degenerate case by looking for a matching
543 		 * data_offset/data_len in the SHA elements we already have
544 		 * before reading the data block and generating a new SHA.
545 		 */
546 		RB_FOREACH(sha_de, sha_dedup_entry_rb_tree, &de->u.fict_root) {
547 			if (sha_de->leaf.data_offset ==
548 						scan_leaf->data_offset &&
549 			    sha_de->leaf.data_len == scan_leaf->data_len) {
550 				memcpy(temp.sha_hash, sha_de->sha_hash,
551 					SHA256_DIGEST_LENGTH);
552 				break;
553 			}
554 		}
555 
556 		/*
557 		 * Entry in CRC tree is fictitious, so we already had problems
558 		 * with this CRC. Upgrade (compute SHA) the candidate and
559 		 * dive into SHA subtree. If upgrade fails insert the candidate
560 		 * into Pass2 list (it will be processed later).
561 		 */
562 		if (sha_de == NULL) {
563 			if (upgrade_chksum(scan_leaf, temp.sha_hash))
564 				goto pass2_insert;
565 
566 			sha_de = RB_FIND(sha_dedup_entry_rb_tree,
567 					 &de->u.fict_root, &temp);
568 		}
569 
570 		/*
571 		 * Nothing in SHA subtree so far, so this is a new
572 		 * 'dataset'. Insert new entry into SHA subtree.
573 		 */
574 		if (sha_de == NULL) {
575 			sha_de = calloc(sizeof(*sha_de), 1);
576 			sha_de->leaf = *scan_leaf;
577 			memcpy(sha_de->sha_hash, temp.sha_hash,
578 			       SHA256_DIGEST_LENGTH);
579 			RB_INSERT(sha_dedup_entry_rb_tree, &de->u.fict_root,
580 				  sha_de);
581 			MemoryUse += sizeof(*sha_de);
582 			goto upgrade_stats_sha;
583 		}
584 
585 		/*
586 		 * Found entry in SHA subtree, it means we have a potential
587 		 * dedup pair. Validate it (zones have to match and data_len
588 		 * fields have to be the same too). If validation fails, treat
589 		 * it as a SHA collision (jump to sha256_failure).
590 		 */
591 		if (validate_dedup_pair(&sha_de->leaf, scan_leaf))
592 			goto sha256_failure;
593 
594 		/*
595 		 * We have a valid dedup pair (SHA match, validated).
596 		 *
597 		 * In case of a technical failure (dedup pair was good, but the
598 		 * ioctl failed anyway) insert the candidate into the Pass2 list
599 		 * (we will try to dedup it after we are done with the rest of
600 		 * the tree).
601 		 *
602 		 * If the ioctl fails because either of the blocks is in a non-dedup
603 		 * zone (we can dedup only in LARGE_DATA and SMALL_DATA), don't
604 		 * bother with the candidate and terminate early.
605 		 *
606 		 * If the ioctl fails because of a big-block underflow, replace the
607 		 * leaf node that the found dedup entry represents with scan_leaf.
608 		 */
609 		error = deduplicate(&sha_de->leaf, scan_leaf);
610 		switch(error) {
611 		case 0:
612 			goto upgrade_stats_sha;
613 		case DEDUP_TECH_FAILURE:
614 			goto pass2_insert;
615 		case DEDUP_CMP_FAILURE:
616 			goto sha256_failure;
617 		case DEDUP_INVALID_ZONE:
618 			goto terminate_early;
619 		case DEDUP_UNDERFLOW:
620 			++dedup_underflows;
621 			sha_de->leaf = *scan_leaf;
622 			memcpy(sha_de->sha_hash, temp.sha_hash,
623 				SHA256_DIGEST_LENGTH);
624 			goto upgrade_stats_sha;
625 		case DEDUP_VERS_FAILURE:
626 			fprintf(stderr,
627 				"HAMMER filesystem must be at least "
628 				"version 5 to dedup\n");
629 			exit (1);
630 		default:
631 			fprintf(stderr, "Unknown error\n");
632 			goto terminate_early;
633 		}
634 
635 		/*
636 		 * Ooh la la.. SHA-256 collision. Terminate early, there's
637 		 * nothing we can do here.
638 		 */
639 sha256_failure:
640 		++dedup_sha_failures;
641 		goto terminate_early;
642 	} else {
643 		/*
644 		 * Candidate CRC is good for now (we found an entry in CRC
645 		 * tree and it's not fictitious). This means we have a
646 		 * potential dedup pair.
647 		 */
648 		if (validate_dedup_pair(&de->leaf, scan_leaf))
649 			goto crc_failure;
650 
651 		/*
652 		 * We have a valid dedup pair (CRC match, validated)
653 		 */
654 		error = deduplicate(&de->leaf, scan_leaf);
655 		switch(error) {
656 		case 0:
657 			goto upgrade_stats;
658 		case DEDUP_TECH_FAILURE:
659 			goto pass2_insert;
660 		case DEDUP_CMP_FAILURE:
661 			goto crc_failure;
662 		case DEDUP_INVALID_ZONE:
663 			goto terminate_early;
664 		case DEDUP_UNDERFLOW:
665 			++dedup_underflows;
666 			de->leaf = *scan_leaf;
667 			goto upgrade_stats;
668 		case DEDUP_VERS_FAILURE:
669 			fprintf(stderr,
670 				"HAMMER filesystem must be at least "
671 				"version 5 to dedup\n");
672 			exit (1);
673 		default:
674 			fprintf(stderr, "Unknown error\n");
675 			goto terminate_early;
676 		}
677 
678 crc_failure:
679 		/*
680 		 * We got a CRC collision - either the ioctl failed because of
681 		 * a comparison failure or validation of the potential
682 		 * dedup pair went bad. In all cases insert both blocks
683 		 * into the SHA subtree (this requires a checksum upgrade) and
684 		 * mark the entry that corresponds to this CRC in the CRC tree
685 		 * as fictitious, so that all further operations with this CRC
686 		 * go through the SHA subtree.
687 		 */
688 		++dedup_crc_failures;
689 
690 		/*
691 		 * Insert the block that was represented by the now fictitious
692 		 * dedup entry (create a new SHA entry and preserve the stats
693 		 * of the old CRC one). If the checksum upgrade fails, insert
694 		 * the candidate into the Pass2 list and return - keep both
695 		 * trees unmodified.
696 		 */
697 		sha_de = calloc(sizeof(*sha_de), 1);
698 		sha_de->leaf = de->leaf;
699 		sha_de->ref_blks = de->u.de.ref_blks;
700 		sha_de->ref_size = de->u.de.ref_size;
701 		if (upgrade_chksum(&sha_de->leaf, sha_de->sha_hash)) {
702 			free(sha_de);
703 			goto pass2_insert;
704 		}
705 		MemoryUse += sizeof(*sha_de);
706 
707 		RB_INIT(&de->u.fict_root);
708 		/*
709 		 * Here we can insert without prior checking because the tree
710 		 * is empty at this point
711 		 */
712 		RB_INSERT(sha_dedup_entry_rb_tree, &de->u.fict_root, sha_de);
713 
714 		/*
715 		 * Mark entry in CRC tree fictitious
716 		 */
717 		de->flags |= HAMMER_DEDUP_ENTRY_FICTITIOUS;
718 
719 		/*
720 		 * Upgrade checksum of the candidate and insert it into
721 		 * SHA subtree. If upgrade fails insert the candidate into
722 		 * Pass2 list.
723 		 */
724 		if (upgrade_chksum(scan_leaf, temp.sha_hash)) {
725 			goto pass2_insert;
726 		}
727 		sha_de = RB_FIND(sha_dedup_entry_rb_tree, &de->u.fict_root,
728 				 &temp);
729 		/*
730 		 * There is an entry with this SHA already, but the only
731 		 * RB-tree element at this point is the entry we just added.
732 		 * We know for sure these blocks are different (this is the
733 		 * crc_failure branch) so treat it as a SHA collision.
734 		 */
735 		if (sha_de != NULL)
736 			goto sha256_failure;
737 
738 		sha_de = calloc(sizeof(*sha_de), 1);
739 		sha_de->leaf = *scan_leaf;
740 		memcpy(sha_de->sha_hash, temp.sha_hash, SHA256_DIGEST_LENGTH);
741 		RB_INSERT(sha_dedup_entry_rb_tree, &de->u.fict_root, sha_de);
742 		MemoryUse += sizeof(*sha_de);
743 		goto upgrade_stats_sha;
744 	}
745 
746 upgrade_stats:
747 	de->u.de.ref_blks += 1;
748 	de->u.de.ref_size += scan_leaf->data_len;
749 	return (1);
750 
751 upgrade_stats_sha:
752 	sha_de->ref_blks += 1;
753 	sha_de->ref_size += scan_leaf->data_len;
754 	return (1);
755 
756 pass2_insert:
757 	/*
758 	 * If in pass2 mode don't insert anything, fall through to
759 	 * terminate_early
760 	 */
761 	if ((flags & DEDUP_PASS2) == 0) {
762 		pass2_de = calloc(sizeof(*pass2_de), 1);
763 		pass2_de->leaf = *scan_leaf;
764 		STAILQ_INSERT_TAIL(&pass2_dedup_queue, pass2_de, sq_entry);
765 		dedup_skipped_size += scan_leaf->data_len;
766 		return (1);
767 	}
768 
769 terminate_early:
770 	/*
771 	 * Early termination path. Fixup stats.
772 	 */
773 	dedup_alloc_size += scan_leaf->data_len;
774 	dedup_ref_size += scan_leaf->data_len;
775 	return (0);
776 }
777 
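/*
 * "Checksum upgrade": read the record's data with HAMMERIOC_GET_DATA and
 * compute its SHA-256.  Returns non-zero if the data could not be read or
 * if its length no longer matches the leaf.
 */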
778 static int
779 upgrade_chksum(hammer_btree_leaf_elm_t leaf, u_int8_t *sha_hash)
780 {
781 	struct hammer_ioc_data data;
782 	char *buf = malloc(DEDUP_BUF);
783 	SHA256_CTX ctx;
784 	int error;
785 
786 	bzero(&data, sizeof(data));
787 	data.elm = leaf->base;
788 	data.ubuf = buf;
789 	data.size = DEDUP_BUF;
790 
791 	error = 0;
792 	if (ioctl(glob_fd, HAMMERIOC_GET_DATA, &data) < 0) {
793 		fprintf(stderr, "Get-data failed: %s\n", strerror(errno));
794 		error = 1;
795 		goto done;
796 	}
797 	DedupDataReads += leaf->data_len;
798 
799 	if (data.leaf.data_len != leaf->data_len) {
800 		error = 1;
801 		goto done;
802 	}
803 
804 	if (data.leaf.base.btype == HAMMER_BTREE_TYPE_RECORD &&
805 	    data.leaf.base.rec_type == HAMMER_RECTYPE_DATA) {
806 		SHA256_Init(&ctx);
807 		SHA256_Update(&ctx, (void *)buf, data.leaf.data_len);
808 		SHA256_Final(sha_hash, &ctx);
809 	}
810 
811 done:
812 	free(buf);
813 	return (error);
814 }
815 
816 static void
817 sigInfo(int signo __unused)
818 {
819 	SigInfoFlag = 1;
820 }
821 
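/*
 * Walk every record of the PFS with the mirror-read ioctl (keys only,
 * HAMMER_IOC_MIRROR_NODATA) and call 'func' for each DATA record whose CRC
 * falls inside the current CRC window (no restriction when DedupCrcStart
 * equals DedupCrcEnd).  SIGINFO prints a progress line.
 */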
822 static void
823 scan_pfs(char *filesystem, scan_pfs_cb_t func, const char *id)
824 {
825 	struct hammer_ioc_mirror_rw mirror;
826 	hammer_ioc_mrecord_any_t mrec;
827 	struct hammer_btree_leaf_elm elm;
828 	char *buf = malloc(DEDUP_BUF);
829 	char buf1[8];
830 	char buf2[8];
831 	int offset, bytes;
832 
833 	SigInfoFlag = 0;
834 	DedupDataReads = 0;
835 	DedupCurrentRecords = 0;
836 	signal(SIGINFO, sigInfo);
837 
838 	bzero(&mirror, sizeof(mirror));
839 	hammer_key_beg_init(&mirror.key_beg);
840 	hammer_key_end_init(&mirror.key_end);
841 
842 	mirror.tid_beg = glob_pfs.ondisk->sync_beg_tid;
843 	mirror.tid_end = glob_pfs.ondisk->sync_end_tid;
844 	mirror.head.flags |= HAMMER_IOC_MIRROR_NODATA; /* we want only keys */
845 	mirror.ubuf = buf;
846 	mirror.size = DEDUP_BUF;
847 	mirror.pfs_id = glob_pfs.pfs_id;
848 	mirror.shared_uuid = glob_pfs.ondisk->shared_uuid;
849 
850 	if (VerboseOpt && DedupCrcStart == 0) {
851 		printf("%s %s: objspace %016jx:%04x %016jx:%04x\n",
852 			id, filesystem,
853 			(uintmax_t)mirror.key_beg.obj_id,
854 			mirror.key_beg.localization,
855 			(uintmax_t)mirror.key_end.obj_id,
856 			mirror.key_end.localization);
857 		printf("%s %s: pfs_id %d\n",
858 			id, filesystem, glob_pfs.pfs_id);
859 	}
860 	fflush(stdout);
861 	fflush(stderr);
862 
863 	do {
864 		mirror.count = 0;
865 		mirror.pfs_id = glob_pfs.pfs_id;
866 		mirror.shared_uuid = glob_pfs.ondisk->shared_uuid;
867 		if (ioctl(glob_fd, HAMMERIOC_MIRROR_READ, &mirror) < 0) {
868 			fprintf(stderr, "Mirror-read %s failed: %s\n",
869 				filesystem, strerror(errno));
870 			exit(1);
871 		}
872 		if (mirror.head.flags & HAMMER_IOC_HEAD_ERROR) {
873 			fprintf(stderr, "Mirror-read %s fatal error %d\n",
874 				filesystem, mirror.head.error);
875 			exit(1);
876 		}
877 		if (mirror.count) {
878 			offset = 0;
879 			while (offset < mirror.count) {
880 				mrec = (void *)((char *)buf + offset);
881 				bytes = HAMMER_HEAD_DOALIGN(mrec->head.rec_size);
882 				if (offset + bytes > mirror.count) {
883 					fprintf(stderr, "Misaligned record\n");
884 					exit(1);
885 				}
886 				assert((mrec->head.type &
887 				       HAMMER_MRECF_TYPE_MASK) ==
888 				       HAMMER_MREC_TYPE_REC);
889 				offset += bytes;
890 				elm = mrec->rec.leaf;
891 				if (elm.base.btype != HAMMER_BTREE_TYPE_RECORD)
892 					continue;
893 				if (elm.base.rec_type != HAMMER_RECTYPE_DATA)
894 					continue;
895 				++DedupCurrentRecords;
896 				if (DedupCrcStart != DedupCrcEnd) {
897 					if (elm.data_crc < DedupCrcStart)
898 						continue;
899 					if (DedupCrcEnd &&
900 					    elm.data_crc >= DedupCrcEnd) {
901 						continue;
902 					}
903 				}
904 				func(&elm, 0);
905 			}
906 		}
907 		mirror.key_beg = mirror.key_cur;
908 		if (DidInterrupt) {
909 			fprintf(stderr, "Interrupted\n");
910 			exit(1);
911 		}
912 		if (SigInfoFlag) {
913 			if (DedupTotalRecords) {
914 				humanize_unsigned(buf1, sizeof(buf1),
915 						  DedupDataReads,
916 						  "B", 1024);
917 				humanize_unsigned(buf2, sizeof(buf2),
918 						  dedup_successes_bytes,
919 						  "B", 1024);
920 				fprintf(stderr, "%s count %7jd/%jd "
921 						"(%02d.%02d%%) "
922 						"ioread %s newddup %s\n",
923 					id,
924 					(intmax_t)DedupCurrentRecords,
925 					(intmax_t)DedupTotalRecords,
926 					(int)(DedupCurrentRecords * 100 /
927 						DedupTotalRecords),
928 					(int)(DedupCurrentRecords * 10000 /
929 						DedupTotalRecords % 100),
930 					buf1, buf2);
931 			} else {
932 				fprintf(stderr, "%s count %-7jd\n",
933 					id,
934 					(intmax_t)DedupCurrentRecords);
935 			}
936 			SigInfoFlag = 0;
937 		}
938 	} while (mirror.count != 0);
939 
940 	signal(SIGINFO, SIG_IGN);
941 
942 	free(buf);
943 }
944 
945 static void
946 dump_simulated_dedup(void)
947 {
948 	struct sim_dedup_entry *sim_de;
949 
950 	printf("=== Dumping simulated dedup entries:\n");
951 	RB_FOREACH(sim_de, sim_dedup_entry_rb_tree, &sim_dedup_tree) {
952 		printf("\tcrc=%08x cnt=%ju size=%ju\n",
953 			sim_de->crc,
954 			(uintmax_t)sim_de->ref_blks,
955 			(uintmax_t)sim_de->ref_size);
956 	}
957 	printf("end of dump ===\n");
958 }
959 
960 static void
961 dump_real_dedup(void)
962 {
963 	struct dedup_entry *de;
964 	struct sha_dedup_entry *sha_de;
965 	int i;
966 
967 	printf("=== Dumping dedup entries:\n");
968 	RB_FOREACH(de, dedup_entry_rb_tree, &dedup_tree) {
969 		if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
970 			printf("\tcrc=%08x fictitious\n", de->leaf.data_crc);
971 
972 			RB_FOREACH(sha_de, sha_dedup_entry_rb_tree, &de->u.fict_root) {
973 				printf("\t\tcrc=%08x cnt=%ju size=%ju\n\t"
974 				       "\t\tsha=",
975 				       sha_de->leaf.data_crc,
976 			       (uintmax_t)sha_de->ref_blks,
977 			       (uintmax_t)sha_de->ref_size);
978 				for (i = 0; i < SHA256_DIGEST_LENGTH; ++i)
979 					printf("%02x", sha_de->sha_hash[i]);
980 				printf("\n");
981 			}
982 		} else {
983 			printf("\tcrc=%08x cnt=%ju size=%ju\n",
984 			       de->leaf.data_crc,
985 			       (uintmax_t)de->u.de.ref_blks,
986 			       (uintmax_t)de->u.de.ref_size);
987 		}
988 	}
989 	printf("end of dump ===\n");
990 }
991 
992 static void
993 dedup_usage(int code)
994 {
995 	fprintf(stderr,
996 		"hammer dedup-simulate <filesystem>\n"
997 		"hammer dedup <filesystem>\n"
998 	);
999 	exit(code);
1000 }
1001