xref: /dragonfly/sbin/hammer2/cmd_recover.c (revision 2b3f93ea)
1 /*
2  * Copyright (c) 2023 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in
16  *    the documentation and/or other materials provided with the
17  *    distribution.
18  * 3. Neither the name of The DragonFly Project nor the names of its
19  *    contributors may be used to endorse or promote products derived
20  *    from this software without specific, prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
26  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 /*
36  * hammer2 recover <devpath> <path> <destdir>
37  *
38  * Recover files from corrupted media, recover deleted files from good
39  * media, generally ignoring the data structure topology outside of the
40  * structures that hang off of inodes and directory entries.  Files are
41  * validated during recovery and renamed to .corrupted when a version of
42  * a file cannot be completely recovered.
43  *
44  * This is a "try to get everything we can out of the filesystem"
45  * directive when you've done something terrible to the filesystem.
46  * The resulting <destdir> tree cannot directly replace the lost topology
47  * as many versions of the same file will be present, so filenames are
48  * suffixed.
49  *
50  * <path> may be a relative file, directory, directory path, or absolute
51  * (absolute is relative to the mount) file or directory path.
52  *
53  * For example, "hammer2 recover /dev/da0s1d /home/charlie /tmp/" will
54  * recover all possible versions of the /home/charlie sub-tree.  If
55  * you said "home/charlie" instead, then it would recover the same but
56  * also include any sub-directory paths that match home/charlie, not
57  * just root paths.  If you want a specific file, then e.g. ".cshrc"
58  * would recover every single .cshrc that can be found on the media.
59  *
60  * The command checks ALL PFSs and snapshots.  Redundant files with the same
61  * path are ignored.  You basically get everything that can possibly be
62  * recovered from the media.
63  */
64 #include "hammer2.h"
65 
66 #define HTABLE_SIZE	(4*1024*1024)
67 #define HTABLE_MASK	(HTABLE_SIZE - 1)
68 #define DISPMODULO	(HTABLE_SIZE / 32768)
69 
70 #include <openssl/sha.h>
71 
/*
 * A directory entry located on media: filename -> inode number, plus the
 * media offset of the bref that referenced it.  Chained per hash bucket.
 */
typedef struct dirent_entry {
	struct dirent_entry *next;
	char		*filename;
	size_t		len;
	hammer2_key_t	inum;
	hammer2_off_t	bref_off;
} dirent_entry_t;

/*
 * One (inum, data_off) version of an inode found on media.  Participates
 * in two hash chains: 'next' on the inum-keyed primary table and 'next2'
 * on the better-spread (inum ^ data_off) secondary table.  The inode
 * content itself is NOT cached (memory cost); only a crc so the recovery
 * pass can detect content changes when re-reading live media.
 */
typedef struct inode_entry {
	struct inode_entry *next;
	struct inode_entry *next2;	/* secondary (ino ^ data_off) hash */
	hammer2_off_t	data_off;
	hammer2_key_t	inum;		/* from bref or inode meta */
	//hammer2_inode_data_t inode;	/* (removed, too expensive) */
	char		*link_file_path;
	uint32_t	inode_crc;
	uint8_t		type;		/* from inode meta */
	uint8_t		encountered;	/* copies limit w/REPINODEDEPTH */
	uint8_t		loopcheck;	/* recursion loop check */
	uint8_t		unused03;
} inode_entry_t;

/*
 * One destination pathname.  'iterator' numbers the successive versions
 * of the same file recovered to that path (suffix .00001, .00002, ...).
 */
typedef struct topology_entry {
	struct topology_entry *next;
	char		*path;
	long		iterator;
} topology_entry_t;

/*
 * Negative cache entry: a bref already proven invalid, kept to avoid
 * re-issuing the same doomed media I/O.
 */
typedef struct neg_entry {
	struct neg_entry *next;
	hammer2_blockref_t bref;
} neg_entry_t;

/*
 * (topology, indirect-block data_off) pair already scanned; used to skip
 * COW-shared indirect blocks seen again at the same path.
 */
typedef struct topo_bref_entry {
	struct topo_bref_entry *next;
	topology_entry_t  *topo;
	hammer2_off_t	data_off;
} topo_bref_entry_t;

/*
 * (topology, inode) pair already scanned; used to skip duplicate inode
 * versions at the same path.
 */
typedef struct topo_inode_entry {
	struct topo_inode_entry *next;
	topology_entry_t  *topo;
	inode_entry_t *iscan;
} topo_inode_entry_t;

/* Global hash tables, each HTABLE_SIZE buckets, allocated in cmd_recover() */
static dirent_entry_t **DirHash;
static inode_entry_t **InodeHash;
static inode_entry_t **InodeHash2;	/* secondary multi-variable hash */
static topology_entry_t **TopologyHash;
static neg_entry_t **NegativeHash;
static topo_bref_entry_t **TopoBRefHash;
static topo_inode_entry_t **TopoInodeHash;
124 
125 /*static void resolve_topology(void);*/
126 static int check_filename(hammer2_blockref_t *bref,
127 			const char *filename, char *buf, size_t flen);
128 //static void enter_dirent(hammer2_blockref_t *bref, hammer2_off_t loff,
129 //			const char *filename, size_t flen);
130 static topology_entry_t *enter_topology(const char *path);
131 static int topology_check_duplicate_inode(topology_entry_t *topo,
132 			inode_entry_t *iscan);
133 static int topology_check_duplicate_indirect(topology_entry_t *topo,
134 			hammer2_blockref_t *bref);
135 static void enter_inode(hammer2_blockref_t *bref);
136 static void enter_inode_untested(hammer2_inode_data_t *ip, hammer2_off_t loff);
137 static inode_entry_t *find_first_inode(hammer2_key_t inum);
138 /*static dirent_entry_t *find_first_dirent(hammer2_key_t inum);*/
139 static int find_neg(hammer2_blockref_t *bref);
140 static void enter_neg(hammer2_blockref_t *bref);
141 static void dump_tree(inode_entry_t *iscan, const char *dest,
142 			const char *remain, int depth, int path_depth,
143 			int isafile);
144 static int dump_inum_file(inode_entry_t *iscan, hammer2_inode_data_t *inode,
145 			const char *path);
146 static int dump_inum_softlink(hammer2_inode_data_t *inode, const char *path);
147 static int dump_dir_data(const char *dest, const char *remain,
148 			hammer2_blockref_t *base, int count,
149 			int depth, int path_depth, int isafile);
150 static int dump_file_data(int wfd, hammer2_off_t fsize,
151 			hammer2_blockref_t *bref, int count);
152 static int validate_crc(hammer2_blockref_t *bref, void *data, size_t bytes);
153 static uint32_t hammer2_to_unix_xid(const uuid_t *uuid);
154 static void *hammer2_cache_read(hammer2_off_t data_off, size_t *bytesp);
155 
156 static long InodeCount;
157 static long TopoBRefCount;
158 static long TopoBRefDupCount;
159 static long TopoInodeCount;
160 static long TopoInodeDupCount;
161 static long NegativeCount;
162 static long NegativeHits;
163 static long MediaBytes;
164 static int StrictMode = 1;
165 
166 #define INODES_PER_BLOCK	\
167 	(sizeof(union hammer2_media_data) / sizeof(hammer2_inode_data_t))
168 #define REPINODEDEPTH		256
169 
170 /*
171  * Recover the specified file.
172  *
173  * Basically do a raw scan of the drive image looking for directory entries
174  * and inodes.  Index all inodes found, including copies, and filter
175  * directory entries for the requested filename to locate inode numbers.
176  *
177  * All copies that are located are written to destdir with a suffix .00001,
178  * .00002, etc.
179  */
int
cmd_recover(const char *devpath, const char *pathname,
	    const char *destdir, int strict, int isafile)
{
	hammer2_media_data_t data;
	hammer2_volume_t *vol;
	hammer2_off_t loff;		/* logical offset across all volumes */
	hammer2_off_t poff;		/* physical offset within one volume */
	size_t i;

	StrictMode = strict;
	hammer2_init_volumes(devpath, 1);

	/*
	 * Allocate all bucket arrays up front.
	 * NOTE(review): calloc() results are not checked; on a memory-starved
	 * system a NULL table would fault below — confirm acceptable.
	 */
	DirHash = calloc(HTABLE_SIZE, sizeof(dirent_entry_t *));
	InodeHash = calloc(HTABLE_SIZE, sizeof(inode_entry_t *));
	InodeHash2 = calloc(HTABLE_SIZE, sizeof(inode_entry_t *));
	TopologyHash = calloc(HTABLE_SIZE, sizeof(topology_entry_t *));
	NegativeHash = calloc(HTABLE_SIZE, sizeof(neg_entry_t *));
	TopoBRefHash = calloc(HTABLE_SIZE, sizeof(topo_bref_entry_t *));
	TopoInodeHash = calloc(HTABLE_SIZE, sizeof(topo_inode_entry_t *));

	/*
	 * Media Pass
	 *
	 * Look for blockrefs that point to inodes.  The blockrefs could
	 * be bogus since we aren't validating them, but the combination
	 * of a CRC that matches the inode content is fairly robust in
	 * finding actual inodes.
	 *
	 * We also enter unvalidated inodes for inode #1 (PFS roots),
	 * because there might not be any blockrefs pointing to some of
	 * them.  We need these to be able to locate directory entries
	 * under the roots.
	 *
	 * At the moment we do not try to enter unvalidated directory
	 * entries, since this will result in a massive number of false
	 * hits.
	 */
	printf("MEDIA PASS\n");

	loff = 0;
	while ((vol = hammer2_get_volume(loff)) != NULL) {
		int fd;
		int xdisp;
		hammer2_off_t vol_size;

		fd = vol->fd;
		poff = loff - vol->offset;
		/* used for progress display only; scan bound is vol->size */
		vol_size = lseek(fd, 0L, SEEK_END);
		xdisp = 0;

		/* raw 64KB-at-a-time scan of the volume */
		while (poff < vol->size) {
			if (pread(fd, &data, sizeof(data), poff) !=
			    sizeof(data))
			{
				/* try to skip possible I/O error */
				poff += sizeof(data);
				continue;
			}

			/*
			 * Interpret the block as an array of blockrefs and
			 * index anything that might be an inode bref.
			 */
			for (i = 0; i < HAMMER2_IND_COUNT_MAX; ++i) {
				hammer2_blockref_t *bref;
				// char filename_buf[1024+1];

				bref = &data.npdata[i];

				/*
				 * Found a possible inode
				 */
				switch(bref->type) {
				case HAMMER2_BREF_TYPE_INODE:
					/*
					 * Note: preliminary bref filter
					 * is inside enter_inode().
					 */
					enter_inode(bref);
					break;
				case HAMMER2_BREF_TYPE_DIRENT:
					/*
					 * Go overboard and try to index
					 * anything that looks like a
					 * directory entry.  This might find
					 * entries whose inodes are no longer
					 * available, but will also generate
					 * a lot of false files.
					 */
#if 0
					if (filename &&
					    flen != bref->embed.dirent.namlen)
					{
						/* single-file shortcut */
						break;
					}
					if (check_filename(bref, filename,
						   filename_buf,
						   bref->embed.dirent.namlen))
					{
						enter_dirent(bref,
						    poff + vol->offset +
						    (i *
						    sizeof(hammer2_blockref_t)),
						    filename_buf,
						    bref->embed.dirent.namlen);
					}
#endif
					break;
				default:
					break;
				}
			}

			/*
			 * Look for possible root inodes.  We generally can't
			 * find these by finding BREFs pointing to them because
			 * the BREFs often hang off the volume header.
			 *
			 * These "inodes" could be seriously corrupt, but if
			 * the bref tree is intact that is what we need to
			 * get top-level directory entries.
			 */
			for (i = 0; i < INODES_PER_BLOCK; ++i) {
				hammer2_inode_data_t *ip;

				ip = (void *)(data.buf + i * sizeof(*ip));
				/* inum 1, no parent, directory, PFS root */
				if (ip->meta.inum == 1 &&
				    ip->meta.iparent == 0 &&
				    ip->meta.type ==
					HAMMER2_OBJTYPE_DIRECTORY &&
				    ip->meta.op_flags & HAMMER2_OPFLAG_PFSROOT)
				{
					enter_inode_untested(ip,
						poff + vol->offset +
						(i * sizeof(*ip)));
				}
			}
			poff += sizeof(data);

			MediaBytes += sizeof(data);

			/*
			 * Update progress
			 */
			if (QuietOpt == 0 &&
			    (++xdisp == DISPMODULO ||
			     poff == vol->size - sizeof(data)))
			{
				xdisp = 0;
				printf("%ld inodes scanned, "
				       "media %6.2f/%-3.2fG\r",
					InodeCount,
					MediaBytes / 1e9,
					vol_size / 1e9);
				fflush(stdout);
			}
		}
		/* advance to the next volume's logical offset */
		loff = vol->offset + vol->size;
	}

	/*
	 * Restoration Pass
	 *
	 * Run through available directory inodes, which allows us to locate
	 * and validate (crc check) their directory entry blockrefs and
	 * construct absolute or relative paths through a recursion.
	 *
	 * When an absolute path is obtained the search is anchored on a
	 * root inode.  When a relative path is obtained the search is
	 * unanchored and will find all matching sub-paths.  For example,
	 * if you look for ".cshrc" it will find ALL .cshrc's.  If you
	 * look for "fubar/.cshsrc" it will find ALL .cshrc's residing
	 * in a directory called fubar, however many there are.  But if
	 * you look for "/fubar/srcs" it will only find the sub-tree
	 * "/fubar/srcs" relative to PFS roots.
	 *
	 * We may not have indexed the PFS roots themselves, because they
	 * often hang off of the volume header and might not have COW'd
	 * references to them, so we use the "iparent" field in the inode
	 * to detect top-level directories under those roots.
	 */
	printf("\nInodes=%ld, Invalid_brefs=%ld, Invalid_hits=%ld\n",
		InodeCount, NegativeCount, NegativeHits);
	printf("RESTORATION PASS\n");

	{
		int abspath = 0;
		long root_count = 0;
		long root_max = 0;

		/*
		 * Check for absolute path, else relative.  An absolute
		 * path is consumed down to its first non-'/' character.
		 */
		if (pathname[0] == '/') {
			abspath = 1;
			while (*pathname == '/')
				++pathname;
		}

		/*
		 * Count root inodes (inum 1 always hashes to bucket 1,
		 * see the hash computation in enter_inode()).
		 */
		{
			inode_entry_t *iscan;

			for (iscan = InodeHash[1];
			     iscan;
			     iscan = iscan->next)
			{
				if (iscan->inum == 1)
					++root_max;
			}
		}

		/*
		 * Run through all directory inodes to locate validated
		 * directory entries.  If an absolute path was specified
		 * we start at root inodes.
		 */
		for (i = 0; i < HTABLE_SIZE; ++i) {
			inode_entry_t *iscan;

			for (iscan = InodeHash[i];
			     iscan;
			     iscan = iscan->next)
			{
				/*
				 * Absolute paths always start at root inodes,
				 * otherwise we can start at any directory
				 * inode.
				 */
				if (abspath && iscan->inum != 1)
					continue;
				if (iscan->type != HAMMER2_OBJTYPE_DIRECTORY)
					continue;

				/*
				 * Progress down root inodes can be slow,
				 * so print progress for each root inode.
				 */
				if (i == 1 && iscan->inum == 1 &&
				    QuietOpt == 0)
				{
					printf("scan roots %p 0x%016jx "
					       "(count %ld/%ld)\r",
						iscan,
						iscan->data_off,
						++root_count, root_max);
					fflush(stdout);
				}

				/*
				 * Primary match/recover recursion
				 */
				dump_tree(iscan, destdir, pathname,
					  1, 1, isafile);
			}
			if (QuietOpt == 0 &&
			    (i & (DISPMODULO - 1)) == DISPMODULO - 1)
			{
				if (i == DISPMODULO - 1)
					printf("\n");
				printf("Progress %zd/%d\r", i, HTABLE_SIZE);
				fflush(stdout);
			}
		}
		printf("\n");
	}

	printf("CLEANUP\n");
	printf("TopoBRef stats: count=%ld dups=%ld\n",
	       TopoBRefCount, TopoBRefDupCount);
	printf("TopoInode stats: count=%ld dups=%ld\n",
	       TopoInodeCount, TopoInodeDupCount);

	/*
	 * Cleanup: release volumes, then drain and free every hash chain.
	 */
	hammer2_cleanup_volumes();

	for (i = 0; i < HTABLE_SIZE; ++i) {
		dirent_entry_t *dscan;
		inode_entry_t *iscan;
		topology_entry_t *top_scan;
		neg_entry_t *negscan;
		topo_bref_entry_t *topo_bref;
		topo_inode_entry_t *topo_inode;

		while ((dscan = DirHash[i]) != NULL) {
			DirHash[i] = dscan->next;
			free(dscan->filename);
			free(dscan);
		}

		while ((iscan = InodeHash[i]) != NULL) {
			InodeHash[i] = iscan->next;
			free(iscan);
		}
		/* InodeHash2[] indexes the same structures */

		while ((top_scan = TopologyHash[i]) != NULL) {
			TopologyHash[i] = top_scan->next;
			free(top_scan->path);
			free(top_scan);
		}

		while ((negscan = NegativeHash[i]) != NULL) {
			NegativeHash[i] = negscan->next;
			free(negscan);
		}

		while ((topo_bref = TopoBRefHash[i]) != NULL) {
			TopoBRefHash[i] = topo_bref->next;
			free(topo_bref);
		}

		while ((topo_inode = TopoInodeHash[i]) != NULL) {
			TopoInodeHash[i] = topo_inode->next;
			free(topo_inode);
		}
	}
	free(TopoInodeHash);
	free(TopoBRefHash);
	free(NegativeHash);
	free(TopologyHash);
	free(DirHash);
	free(InodeHash);
	free(InodeHash2);

	return 0;
}
507 
508 /*
509  * Check for a matching filename, Directory entries can directly-embed
510  * filenames <= 64 bytes.  Otherwise the directory entry has a data
511  * reference to the location of the filename.
512  *
513  * If filename is NULL, check for a valid filename, and copy it into buf.
514  */
515 static int
516 check_filename(hammer2_blockref_t *bref, const char *filename, char *buf,
517 	       size_t flen)
518 {
519 	/* filename too long */
520 	if (flen > 1024)
521 		return 0;
522 
523 	if (flen <= 64) {
524 		/*
525 		 * Filename is embedded in bref
526 		 */
527 		if (buf)
528 			bcopy(bref->check.buf, buf, flen);
529 		buf[flen] = 0;
530 		if (filename == NULL)
531 			return 1;
532 		if (bcmp(filename, bref->check.buf, flen) == 0)
533 			return 1;
534 	} else {
535 		/*
536 		 * Filename requires media access
537 		 */
538 		hammer2_media_data_t data;
539 		hammer2_volume_t *vol;
540 		hammer2_off_t poff;
541 		hammer2_off_t psize;
542 		int vfd;
543 
544 		/*
545 		 * bref must represent a data reference to a 1KB block or
546 		 * smaller.
547 		 */
548 		if ((bref->data_off & 0x1F) == 0 ||
549 		    (bref->data_off & 0x1F) > 10)
550 		{
551 			return 0;
552 		}
553 
554 		/*
555 		 * Indirect block containing filename must be large enough
556 		 * to contain the filename.
557 		 */
558 		psize = 1 << (bref->data_off & 0x1F);
559 		if (flen > psize)
560 			return 0;
561 
562 		/*
563 		 * In strict mode we disallow bref's set to HAMMER2_CHECK_NONE
564 		 * or HAMMER2_CHECK_DISABLED.  Do this check before burning
565 		 * time on an I/O.
566 		 */
567 		if (StrictMode) {
568 			if (HAMMER2_DEC_CHECK(bref->methods) ==
569 				HAMMER2_CHECK_NONE ||
570 			    HAMMER2_DEC_CHECK(bref->methods) ==
571 				HAMMER2_CHECK_DISABLED)
572 			{
573 				return 0;
574 			}
575 		}
576 
577 		/*
578 		 * Read the data, check CRC and such
579 		 */
580 		vol = hammer2_get_volume(bref->data_off);
581 		if (vol == NULL)
582 			return 0;
583 
584 		vfd = vol->fd;
585 		poff = (bref->data_off - vol->offset) & ~0x1FL;
586 		if (pread(vfd, &data, psize, poff) != (ssize_t)psize)
587 			return 0;
588 
589 		if (validate_crc(bref, &data, psize) == 0)
590 			return 0;
591 
592 		if (buf)
593 			bcopy(data.buf, buf, flen);
594 		buf[flen] = 0;
595 		if (filename == NULL)
596 			return 1;
597 		if (bcmp(filename, data.buf, flen) == 0)
598 			return 1;
599 	}
600 	return 0;
601 }
602 
#if 0
/*
 * (Currently disabled, together with its media-pass call site.)
 *
 * Record a directory entry in DirHash, de-duplicating on
 * (inum, filename).  Kept for a possible future mode that indexes raw
 * directory entries despite the false-positive rate.
 */
static void
enter_dirent(hammer2_blockref_t *bref, hammer2_off_t loff,
	     const char *filename, size_t flen)
{
	hammer2_key_t inum = bref->embed.dirent.inum;
	dirent_entry_t *entry;
	uint32_t hv = (inum ^ (inum >> 16)) & HTABLE_MASK;

	/* ignore exact duplicates */
	for (entry = DirHash[hv]; entry; entry = entry->next) {
		if (entry->inum == inum &&
		    entry->len == flen &&
		    bcmp(entry->filename, filename, flen) == 0)
		{
			return;
		}
	}
	entry = malloc(sizeof(*entry));
	bzero(entry, sizeof(*entry));
	entry->inum = inum;
	entry->next = DirHash[hv];
	entry->filename = malloc(flen + 1);
	entry->len = flen;
	entry->bref_off = loff;
	bcopy(filename, entry->filename, flen);
	entry->filename[flen] = 0;
	DirHash[hv] = entry;
}
#endif
632 
633 /*
634  * Topology duplicate scan avoidance helpers.  We associate inodes and
635  * indirect block data offsets, allowing us to avoid re-scanning any
636  * duplicates that we see.  And there will be many due to how the COW
637  * process occurs.
638  *
639  * For example, when a large directory is modified the content update to
640  * the directory entries will cause the directory inode to be COWd, along
641  * with whatever is holding the bref(s) blocks that have undergone
642  * adjustment.  More likely than not, there will be MANY shared indirect
643  * blocks.
644  */
645 static topology_entry_t *
646 enter_topology(const char *path)
647 {
648 	topology_entry_t *topo;
649 	uint32_t hv = 0;
650 	size_t i;
651 
652 	for (i = 0; path[i]; ++i)
653 		hv = (hv << 5) ^ path[i] ^ (hv >> 24);
654 	hv = (hv ^ (hv >> 16)) & HTABLE_MASK;
655 	for (topo = TopologyHash[hv]; topo; topo = topo->next) {
656 		if (strcmp(path, topo->path) == 0)
657 			return topo;
658 	}
659 	topo = malloc(sizeof(*topo));
660 	bzero(topo, sizeof(*topo));
661 
662 	topo->next = TopologyHash[hv];
663 	TopologyHash[hv] = topo;
664 	topo->path = strdup(path);
665 	topo->iterator = 1;
666 
667 	return topo;
668 }
669 
670 /*
671  * Determine if an inode at the current topology location is one that we
672  * have already dealt with.
673  */
674 static int
675 topology_check_duplicate_inode(topology_entry_t *topo, inode_entry_t *iscan)
676 {
677 	int hv = (((intptr_t)topo ^ (intptr_t)iscan) >> 6) & HTABLE_MASK;
678 	topo_inode_entry_t *scan;
679 
680 	for (scan = TopoInodeHash[hv]; scan; scan = scan->next) {
681 		if (scan->topo == topo &&
682 		    scan->iscan == iscan)
683 		{
684 			++TopoInodeDupCount;
685 			return 1;
686 		}
687 	}
688 	scan = malloc(sizeof(*scan));
689 	bzero(scan, sizeof(*scan));
690 	scan->iscan = iscan;
691 	scan->topo = topo;
692 	scan->next = TopoInodeHash[hv];
693 	TopoInodeHash[hv] = scan;
694 	++TopoInodeCount;
695 
696 	return 0;
697 }
698 
699 /*
700  * Determine if an indirect block (represented by the bref) at the current
701  * topology level is one that we have already dealt with.
702  */
703 static int
704 topology_check_duplicate_indirect(topology_entry_t *topo,
705 				  hammer2_blockref_t *bref)
706 {
707 	int hv = ((intptr_t)topo ^ (bref->data_off >> 8)) & HTABLE_MASK;
708 	topo_bref_entry_t *scan;
709 
710 	for (scan = TopoBRefHash[hv]; scan; scan = scan->next) {
711 		if (scan->topo == topo &&
712 		    scan->data_off == bref->data_off)
713 		{
714 			++TopoBRefDupCount;
715 			return 1;
716 		}
717 	}
718 	scan = malloc(sizeof(*scan));
719 	bzero(scan, sizeof(*scan));
720 	scan->data_off = bref->data_off;
721 	scan->topo = topo;
722 	scan->next = TopoBRefHash[hv];
723 	TopoBRefHash[hv] = scan;
724 	++TopoBRefCount;
725 
726 	return 0;
727 }
728 
729 /*
730  * Valid and record an inode found on media.  There can be many versions
731  * of the same inode number present on the media.
732  */
/*
 * Validate and record an inode found on media.  There can be many versions
 * of the same inode number present on the media; each distinct
 * (inum, data_off) pair is recorded once.
 */
static void
enter_inode(hammer2_blockref_t *bref)
{
	uint32_t hv;
	uint32_t hv2;
	inode_entry_t *scan;
	hammer2_inode_data_t *inode;
	size_t psize;

	hv = (bref->key ^ (bref->key >> 16)) & HTABLE_MASK;
	hv2 = (bref->key ^ (bref->key >> 16) ^ (bref->data_off >> 10)) &
	      HTABLE_MASK;

	/*
	 * Ignore duplicate inodes, use the secondary inode hash table's
	 * better spread to reduce cpu consumption (there can be many
	 * copies of the same inode so the primary hash table can have
	 * very long chains in it).
	 */
	for (scan = InodeHash2[hv2]; scan; scan = scan->next2) {
		if (bref->key == scan->inum &&
		    bref->data_off == scan->data_off)
		{
			return;
		}
	}

	/*
	 * Ignore brefs which we have already determined to be bad
	 */
	if (find_neg(bref))
		return;

	/*
	 * Validate the potential blockref.  Note that this might not be a
	 * real blockref.  Don't trust anything, really.
	 *
	 * - Must be sized for an inode block
	 * - Must be properly aligned for an inode block
	 * - Keyspace is 1 (keybits == 0), i.e. a single inode number
	 */
	if ((1 << (bref->data_off & 0x1F)) != sizeof(*inode))
		return;
	if ((bref->data_off & ~0x1FL & (sizeof(*inode) - 1)) != 0)
		return;
	if (bref->keybits != 0)
		return;
	if (bref->key == 0)
		return;

	inode = hammer2_cache_read(bref->data_off, &psize);

	/*
	 * Failure prior to I/O being performed.
	 */
	if (psize == 0)
		return;

	/*
	 * Any failures which occur after the I/O has been performed
	 * should enter the bref in the negative cache to avoid unnecessary
	 * guaranteed-to-fail reissuances of the same (bref, data_off) combo.
	 *
	 * NOTE: the fail: label lives inside this if() so the two goto's
	 * below share its enter_neg()+return path.
	 */
	if (inode == NULL) {
fail:
		enter_neg(bref);
		return;
	}

	/*
	 * The blockref looks ok but the real test is whether the
	 * inode data it references passes the CRC check.  If it
	 * does, it is highly likely that we have a valid inode.
	 */
	if (validate_crc(bref, inode, sizeof(*inode)) == 0)
		goto fail;
	if (inode->meta.inum != bref->key)
		goto fail;

	/*
	 * Record the inode.  For now we do not record the actual content
	 * of the inode because if there are more than few million of them
	 * the memory consumption can get into the dozens of gigabytes.
	 *
	 * Instead, the inode will be re-read from media in the recovery
	 * pass (and revalidated against inode_crc).
	 */
	scan = malloc(sizeof(*scan));
	bzero(scan, sizeof(*scan));

	scan->inum = bref->key;
	scan->type = inode->meta.type;
	scan->data_off = bref->data_off;
	scan->inode_crc = hammer2_icrc32(inode, sizeof(*inode));
	//scan->inode = *inode;		/* removed, too expensive */

	/* link into both the primary (inum) and secondary hash chains */
	scan->next = InodeHash[hv];
	InodeHash[hv] = scan;
	scan->next2 = InodeHash2[hv2];
	InodeHash2[hv2] = scan;

	++InodeCount;
}
836 
837 /*
838  * This is used to enter possible root inodes.  Root inodes typically hang
839  * off the volume root and thus there might not be a bref reference to the
840  * many old copies of root inodes sitting around on the media.  Without a
841  * bref we can't really validate that the content is ok.  But we need
842  * these inodes as part of our path searches.
843  */
844 static void
845 enter_inode_untested(hammer2_inode_data_t *ip, hammer2_off_t loff)
846 {
847 	uint32_t hv;
848 	uint32_t hv2;
849 	inode_entry_t *scan;
850 
851 	hv = (ip->meta.inum ^ (ip->meta.inum >> 16)) & HTABLE_MASK;
852 	hv2 = (ip->meta.inum ^ (ip->meta.inum >> 16) ^ (loff >> 10)) &
853 	      HTABLE_MASK;
854 
855 	for (scan = InodeHash2[hv2]; scan; scan = scan->next2) {
856 		if (ip->meta.inum == scan->inum &&
857 		    loff == scan->data_off)
858 		{
859 			return;
860 		}
861 	}
862 
863 	/*
864 	 * Record the inode.  For now we do not record the actual content
865 	 * of the inode because if there are more than few million of them
866 	 * the memory consumption can get into the dozens of gigabytes.
867 	 *
868 	 * Instead, the inode will be re-read from media in the recovery
869 	 * pass.
870 	 */
871 	scan = malloc(sizeof(*scan));
872 	bzero(scan, sizeof(*scan));
873 
874 	scan->inum = ip->meta.inum;
875 	scan->type = ip->meta.type;
876 	scan->data_off = loff;
877 	scan->inode_crc = hammer2_icrc32(ip, sizeof(*ip));
878 	//scan->inode = *ip;		/* removed, too expensive */
879 
880 	scan->next = InodeHash[hv];
881 	InodeHash[hv] = scan;
882 	scan->next2 = InodeHash2[hv2];
883 	InodeHash2[hv2] = scan;
884 
885 	++InodeCount;
886 }
887 
888 static inode_entry_t *
889 find_first_inode(hammer2_key_t inum)
890 {
891 	inode_entry_t *entry;
892 	uint32_t hv = (inum ^ (inum >> 16)) & HTABLE_MASK;
893 
894 	for (entry = InodeHash[hv]; entry; entry = entry->next) {
895 		if (entry->inum == inum)
896 			return entry;
897 	}
898 	return NULL;
899 }
900 
901 /*
902  * Negative bref cache.  A cache of brefs that we have determined
903  * to be invalid.  Used to reduce unnecessary disk I/O.
904  *
905  * NOTE: Checks must be reasonable and at least encompass checks
906  *	 done in enter_inode() after it has decided to read the
907  *	 block at data_off.
908  *
909  *	 Adding a few more match fields in addition won't hurt either.
910  */
911 static int
912 find_neg(hammer2_blockref_t *bref)
913 {
914 	int hv = (bref->data_off >> 10) & HTABLE_MASK;
915 	neg_entry_t *neg;
916 
917 	for (neg = NegativeHash[hv]; neg; neg = neg->next) {
918 		if (bref->data_off == neg->bref.data_off &&
919 		    bref->type == neg->bref.type &&
920 		    bref->methods == neg->bref.methods &&
921 		    bref->key == neg->bref.key &&
922 		    bcmp(&bref->check, &neg->bref.check,
923 			 sizeof(bref->check)) == 0)
924 		{
925 			++NegativeHits;
926 			return 1;
927 		}
928 	}
929 	return 0;
930 }
931 
932 static void
933 enter_neg(hammer2_blockref_t *bref)
934 {
935 	int hv = (bref->data_off >> 10) & HTABLE_MASK;
936 	neg_entry_t *neg;
937 
938 	neg = malloc(sizeof(*neg));
939 	bzero(neg, sizeof(*neg));
940 	neg->next = NegativeHash[hv];
941 	neg->bref = *bref;
942 	NegativeHash[hv] = neg;
943 	++NegativeCount;
944 }
945 
946 /*
947  * Dump the specified inode (file or directory)
948  *
949  * This function recurses via dump_dir_data().
950  */
951 static void
952 dump_tree(inode_entry_t *iscan, const char *dest, const char *remain,
953 	  int depth, int path_depth, int isafile)
954 {
955 	topology_entry_t *topo;
956 	hammer2_inode_data_t *inode;
957 	hammer2_inode_data_t inode_copy;
958 	struct stat st;
959 	size_t psize;
960 	char *path;
961 
962 	/*
963 	 * Re-read the already-validated inode instead of saving it in
964 	 * memory from the media pass.  Even though we already validated
965 	 * it, the content may have changed if scanning live media, so
966 	 * check against a simple crc we recorded earlier.
967 	 */
968 	inode = hammer2_cache_read(iscan->data_off, &psize);
969 	if (psize == 0)
970 		return;
971 	if (inode == NULL || psize != sizeof(*inode))
972 		return;
973 	if (iscan->inode_crc != hammer2_icrc32(inode, sizeof(*inode)))
974 		return;
975 	inode_copy = *inode;
976 	inode = &inode_copy;
977 
978 	/*
979 	 * Try to limit potential infinite loops
980 	 */
981 	if (depth > REPINODEDEPTH && iscan->encountered)
982 		return;
983 
984 	/*
985 	 * Get rid of any dividing slashes
986 	 */
987 	while (*remain == '/')
988 		++remain;
989 
990 	/*
991 	 * Create/lookup destination path (without the iterator), to acquire
992 	 * an iterator for different versions of the same file.
993 	 *
994 	 * Due to the COW mechanism, a lot of repeated snapshot-like
995 	 * directories may be encountered so we use the topology tree
996 	 * to weed out duplicates that attempt to use the same pathname.
997 	 *
998 	 * We also don't iterate copies of directories since this would
999 	 * create a major mess due to the many versions that might be
1000 	 * laying around.  Directories use unextended names.
1001 	 */
1002 	topo = enter_topology(dest);
1003 	if (topology_check_duplicate_inode(topo, iscan))
1004 		return;
1005 
1006 	switch(iscan->type) {
1007 	case HAMMER2_OBJTYPE_DIRECTORY:
1008 		/*
1009 		 * If we have exhausted the path and isafile is TRUE,
1010 		 * stop.
1011 		 */
1012 		if (isafile && *remain == 0)
1013 			return;
1014 
1015 		/*
1016 		 * Create / make usable the target directory.  Note that
1017 		 * it might already exist.
1018 		 *
1019 		 * Do not do this for the destination base directory
1020 		 * (depth 1).
1021 		 */
1022 		if (depth != 1) {
1023 			if (mkdir(dest, 0755) == 0)
1024 				iscan->encountered = 1;
1025 			if (stat(dest, &st) == 0) {
1026 				if (st.st_flags)
1027 					chflags(dest, 0);
1028 				if ((st.st_mode & 0700) != 0700)
1029 					chmod(dest, 0755);
1030 			}
1031 		}
1032 
1033 		/*
1034 		 * Dump directory contents (scan the directory)
1035 		 */
1036 		(void)dump_dir_data(dest, remain,
1037 				    &inode->u.blockset.blockref[0],
1038 				    HAMMER2_SET_COUNT, depth, path_depth + 1,
1039 				    isafile);
1040 
1041 		/*
1042 		 * Final adjustment to directory inode
1043 		 */
1044 		if (depth != 1) {
1045 			struct timeval tvs[2];
1046 
1047 			tvs[0].tv_sec = inode->meta.atime / 1000000;
1048 			tvs[0].tv_usec = inode->meta.atime % 1000000;
1049 			tvs[1].tv_sec = inode->meta.mtime / 1000000;
1050 			tvs[1].tv_usec = inode->meta.mtime % 1000000;
1051 
1052 			if (lutimes(dest, tvs) < 0)
1053 				perror("futimes");
1054 			lchown(dest,
1055 			       hammer2_to_unix_xid(&inode->meta.uid),
1056 			       hammer2_to_unix_xid(&inode->meta.gid));
1057 
1058 			lchmod(dest, inode->meta.mode);
1059 			lchflags(dest, inode->meta.uflags);
1060 		}
1061 		break;
1062 	case HAMMER2_OBJTYPE_REGFILE:
1063 		/*
1064 		 * If no more path to match, dump the file contents
1065 		 */
1066 		if (*remain == 0) {
1067 			asprintf(&path, "%s.%05ld", dest, topo->iterator);
1068 			++topo->iterator;
1069 
1070 			if (stat(path, &st) == 0) {
1071 				if (st.st_flags)
1072 					chflags(path, 0);
1073 				if ((st.st_mode & 0600) != 0600)
1074 					chmod(path, 0644);
1075 			}
1076 			iscan->encountered = 1;
1077 			(void)dump_inum_file(iscan, inode, path);
1078 			free(path);
1079 		}
1080 		break;
1081 	case HAMMER2_OBJTYPE_SOFTLINK:
1082 		/*
1083 		 * If no more path to match, dump the file contents
1084 		 */
1085 		if (*remain == 0) {
1086 			asprintf(&path, "%s.%05ld", dest, topo->iterator);
1087 			++topo->iterator;
1088 
1089 			if (stat(path, &st) == 0) {
1090 				if (st.st_flags)
1091 					chflags(path, 0);
1092 				if ((st.st_mode & 0600) != 0600)
1093 					chmod(path, 0644);
1094 			}
1095 			(void)dump_inum_softlink(inode, path);
1096 			free(path);
1097 		}
1098 		break;
1099 	}
1100 }
1101 
1102 /*
 * [re]create a regular file and attempt to restore the original perms,
1104  * modes, flags, times, uid, and gid if successful.
1105  *
1106  * If the data block recursion fails the file will be renamed
1107  * .corrupted.
1108  */
1109 static int
1110 dump_inum_file(inode_entry_t *iscan, hammer2_inode_data_t *inode,
1111 	       const char *path1)
1112 {
1113 	char *path2;
1114 	int wfd;
1115 	int res;
1116 
1117 	/*
1118 	 * If this specific inode has already been generated, try to
1119 	 * hardlink it instead of regenerating the same file again.
1120 	 */
1121 	if (iscan->link_file_path) {
1122 		if (link(iscan->link_file_path, path1) == 0)
1123 			return 1;
1124 		chflags(iscan->link_file_path, 0);
1125 		chmod(iscan->link_file_path, 0600);
1126 		if (link(iscan->link_file_path, path1) == 0) {
1127 			chmod(iscan->link_file_path, inode->meta.mode);
1128 			chflags(iscan->link_file_path, inode->meta.uflags);
1129 			return 1;
1130 		}
1131 	}
1132 
1133 	/*
1134 	 * Cleanup potential flags and modes to allow us to write out a
1135 	 * new file.
1136 	 */
1137 	chflags(path1, 0);
1138 	chmod(path1, 0600);
1139 	wfd = open(path1, O_RDWR|O_CREAT|O_TRUNC, 0600);
1140 
1141 	if (inode->meta.op_flags & HAMMER2_OPFLAG_DIRECTDATA) {
1142 		/*
1143 		 * Direct data case
1144 		 */
1145 		if (inode->meta.size > 0 &&
1146 		    inode->meta.size <= sizeof(inode->u.data))
1147 		{
1148 			write(wfd, inode->u.data, inode->meta.size);
1149 		}
1150 		res = 1;
1151 	} else {
1152 		/*
1153 		 * file content, indirect blockrefs
1154 		 */
1155 		res = dump_file_data(wfd, inode->meta.size,
1156 				     &inode->u.blockset.blockref[0],
1157 				     HAMMER2_SET_COUNT);
1158 	}
1159 
1160 	/*
1161 	 * On success, set perms, mtime, flags, etc
1162 	 * On failure, rename file to .corrupted
1163 	 */
1164 	if (res) {
1165 		struct timeval tvs[2];
1166 
1167 		tvs[0].tv_sec = inode->meta.atime / 1000000;
1168 		tvs[0].tv_usec = inode->meta.atime % 1000000;
1169 		tvs[1].tv_sec = inode->meta.mtime / 1000000;
1170 		tvs[1].tv_usec = inode->meta.mtime % 1000000;
1171 
1172 		ftruncate(wfd, inode->meta.size);
1173 		if (futimes(wfd, tvs) < 0)
1174 			perror("futimes");
1175 		fchown(wfd,
1176 		       hammer2_to_unix_xid(&inode->meta.uid),
1177 		       hammer2_to_unix_xid(&inode->meta.gid));
1178 
1179 		fchmod(wfd, inode->meta.mode);
1180 		fchflags(wfd, inode->meta.uflags);
1181 		iscan->link_file_path = strdup(path1);
1182 	} else {
1183 		struct timeval tvs[2];
1184 
1185 		tvs[0].tv_sec = inode->meta.atime / 1000000;
1186 		tvs[0].tv_usec = inode->meta.atime % 1000000;
1187 		tvs[1].tv_sec = inode->meta.mtime / 1000000;
1188 		tvs[1].tv_usec = inode->meta.mtime % 1000000;
1189 
1190 		ftruncate(wfd, inode->meta.size);
1191 		if (futimes(wfd, tvs) < 0)
1192 			perror("futimes");
1193 		fchown(wfd,
1194 		       hammer2_to_unix_xid(&inode->meta.uid),
1195 		       hammer2_to_unix_xid(&inode->meta.gid));
1196 
1197 		asprintf(&path2, "%s.corrupted", path1);
1198 		rename(path1, path2);
1199 		iscan->link_file_path = path2;
1200 	}
1201 
1202 	/*
1203 	 * Cleanup
1204 	 */
1205 	close(wfd);
1206 
1207 	return res;
1208 }
1209 
1210 /*
1211  * TODO XXX
1212  */
static int
dump_inum_softlink(hammer2_inode_data_t *inode __unused,
		   const char *path __unused)
{
	/* softlink restoration not yet implemented; 0 indicates failure */
	return 0;
}
1219 
1220 /*
1221  * Scan the directory for a match against the next component in
1222  * the (remain) path.
1223  *
1224  * This function is part of the dump_tree() recursion mechanism.
1225  */
1226 static int
1227 dump_dir_data(const char *dest, const char *remain,
1228 	      hammer2_blockref_t *base, int count,
1229 	      int depth, int path_depth, int isafile)
1230 {
1231 	hammer2_media_data_t data;
1232 	hammer2_volume_t *vol;
1233 	hammer2_off_t poff;
1234 	size_t psize;
1235 	int res = 1;
1236 	int rtmp;
1237 	int n;
1238 	size_t flen;
1239 
1240 	/*
1241 	 * Calculate length of next path component to match against.
1242 	 * A length of zero matches against the entire remaining
1243 	 * directory sub-tree.
1244 	 */
1245 	for (flen = 0; remain[flen] && remain[flen] != '/'; ++flen)
1246 		;
1247 
1248 	/*
1249 	 * Scan the brefs associated with the directory
1250 	 */
1251 	for (n = 0; n < count; ++n) {
1252 		hammer2_blockref_t *bref;
1253 		char filename_buf[1024+1];
1254 		topology_entry_t *topo;
1255 
1256 		bref = &base[n];
1257 
1258 		if (bref->type == HAMMER2_BREF_TYPE_EMPTY)
1259 			continue;
1260 
1261 		vol = NULL;
1262 		poff = 0;
1263 		psize = 0;
1264 		if (bref->data_off & 0x1F) {
1265 			vol = hammer2_get_volume(bref->data_off);
1266 			if (vol == NULL) {
1267 				res = 0;
1268 				continue;
1269 			}
1270 			poff = (bref->data_off - vol->offset) & ~0x1FL;
1271 			psize = 1 << (bref->data_off & 0x1F);
1272 			if (psize > sizeof(data)) {
1273 				res = 0;
1274 				continue;
1275 			}
1276 		}
1277 
1278 		switch(bref->type) {
1279 		case HAMMER2_BREF_TYPE_DIRENT:
1280 			/*
1281 			 * Match the path element or match all directory
1282 			 * entries if we have exhausted (remain).
1283 			 *
1284 			 * Locate all matching inodes and continue the
1285 			 * traversal.
1286 			 *
1287 			 * Avoid traversal loops by recording path_depth
1288 			 * on the way down and clearing it on the way back
1289 			 * up.
1290 			 */
1291 			if ((flen == 0 &&
1292 			    check_filename(bref, NULL, filename_buf,
1293 					   bref->embed.dirent.namlen)) ||
1294 			    (flen &&
1295 			     flen == bref->embed.dirent.namlen &&
1296 			     check_filename(bref, remain, filename_buf, flen)))
1297 			{
1298 				inode_entry_t *iscan;
1299 				hammer2_key_t inum;
1300 				char *path;
1301 
1302 				inum = bref->embed.dirent.inum;
1303 
1304 				asprintf(&path, "%s/%s", dest, filename_buf);
1305 				iscan = find_first_inode(inum);
1306 				while (iscan) {
1307 					if (iscan->inum == inum &&
1308 					    iscan->loopcheck == 0 &&
1309 					    iscan->type ==
1310 					     bref->embed.dirent.type)
1311 					{
1312 						iscan->loopcheck = 1;
1313 						dump_tree(iscan, path,
1314 							  remain + flen,
1315 							  depth + 1,
1316 							  path_depth,
1317 							  isafile);
1318 
1319 						/*
1320 						 * Clear loop check
1321 						 */
1322 						iscan->loopcheck = 0;
1323 					}
1324 					iscan = iscan->next;
1325 				}
1326 				free(path);
1327 			}
1328 			break;
1329 		case HAMMER2_BREF_TYPE_INDIRECT:
1330 			if (psize == 0) {
1331 				res = 0;
1332 				break;
1333 			}
1334 
1335 			/*
1336 			 * Due to COW operations, even if the inode is
1337 			 * replicated, some of the indirect brefs might
1338 			 * still be shared and allow us to reject duplicate
1339 			 * scans.
1340 			 */
1341 			topo = enter_topology(dest);
1342 			if (topology_check_duplicate_indirect(topo, bref))
1343 				break;
1344 
1345 			if (pread(vol->fd, &data, psize, poff) !=
1346 			    (ssize_t)psize)
1347 			{
1348 				res = 0;
1349 				break;
1350 			}
1351 
1352 			if (validate_crc(bref, &data, psize) == 0) {
1353 				res = 0;
1354 				break;
1355 			}
1356 
1357 			rtmp = dump_dir_data(dest, remain,
1358 					  &data.npdata[0],
1359 					  psize / sizeof(hammer2_blockref_t),
1360 					  depth, path_depth, isafile);
1361 			if (res)
1362 				res = rtmp;
1363 			break;
1364 		}
1365 	}
1366 	return res;
1367 }
1368 
1369 /*
1370  * Dumps the data records for an inode to the target file (wfd), returns
1371  * TRUE on success, FALSE if corruption was detected.
1372  */
1373 static int
1374 dump_file_data(int wfd, hammer2_off_t fsize,
1375 	       hammer2_blockref_t *base, int count)
1376 {
1377 	hammer2_media_data_t data;
1378 	hammer2_blockref_t *bref;
1379 	hammer2_volume_t *vol;
1380 	hammer2_off_t poff;
1381 	hammer2_off_t psize;
1382 	hammer2_off_t nsize;
1383 	int res = 1;
1384 	int rtmp;
1385 	int n;
1386 
1387 	for (n = 0; n < count; ++n) {
1388 		char *dptr;
1389 		int res2;
1390 
1391 		bref = &base[n];
1392 
1393 		if (bref->type == HAMMER2_BREF_TYPE_EMPTY ||
1394 		    bref->data_off == 0)
1395 		{
1396 			continue;
1397 		}
1398 		vol = hammer2_get_volume(bref->data_off);
1399 		if (vol == NULL)
1400 			continue;
1401 		if (bref->type == HAMMER2_BREF_TYPE_EMPTY ||
1402 		    bref->data_off == 0)
1403 		{
1404 			continue;
1405 		}
1406 
1407 		poff = (bref->data_off - vol->offset) & ~0x1FL;
1408 		psize = 1 << (bref->data_off & 0x1F);
1409 		if (psize > sizeof(data)) {
1410 			res = 0;
1411 			continue;
1412 		}
1413 
1414 		if (pread(vol->fd, &data, psize, poff) != (ssize_t)psize) {
1415 			res = 0;
1416 			continue;
1417 		}
1418 
1419 		if (validate_crc(bref, &data, psize) == 0) {
1420 			res = 0;
1421 			continue;
1422 		}
1423 
1424 		switch(bref->type) {
1425 		case HAMMER2_BREF_TYPE_DATA:
1426 			dptr = (void *)&data;
1427 
1428 			nsize = 1L << bref->keybits;
1429 			if (nsize > sizeof(data)) {
1430 				res = 0;
1431 				break;
1432 			}
1433 
1434 			switch (HAMMER2_DEC_COMP(bref->methods)) {
1435 			case HAMMER2_COMP_LZ4:
1436 				dptr = hammer2_decompress_LZ4(dptr, psize,
1437 							      nsize, &res2);
1438 				if (res)
1439 					res = res2;
1440 				psize = nsize;
1441 				break;
1442 			case HAMMER2_COMP_ZLIB:
1443 				dptr = hammer2_decompress_ZLIB(dptr, psize,
1444 							       nsize, &res2);
1445 				if (res)
1446 					res = res2;
1447 				psize = nsize;
1448 				break;
1449 			case HAMMER2_COMP_NONE:
1450 			default:
1451 				/* leave in current form */
1452 				break;
1453 			}
1454 
1455 			if (bref->key + psize > fsize)
1456 				pwrite(wfd, dptr, fsize - bref->key,
1457 				       bref->key);
1458 			else
1459 				pwrite(wfd, dptr, psize, bref->key);
1460 			break;
1461 		case HAMMER2_BREF_TYPE_INDIRECT:
1462 			rtmp = dump_file_data(wfd, fsize,
1463 					  &data.npdata[0],
1464 					  psize / sizeof(hammer2_blockref_t));
1465 			if (res)
1466 				res = rtmp;
1467 			break;
1468 		}
1469 	}
1470 	return res;
1471 }
1472 
1473 /*
1474  * Validate the bref data target.  The recovery scan will often attempt to
1475  * validate invalid elements, so don't spew errors to stderr on failure.
1476  */
1477 static int
1478 validate_crc(hammer2_blockref_t *bref, void *data, size_t bytes)
1479 {
1480 	uint32_t cv;
1481 	uint64_t cv64;
1482 	SHA256_CTX hash_ctx;
1483 	int success = 0;
1484 	union {
1485 		uint8_t digest[SHA256_DIGEST_LENGTH];
1486 		uint64_t digest64[SHA256_DIGEST_LENGTH/8];
1487 	} u;
1488 
1489 	switch(HAMMER2_DEC_CHECK(bref->methods)) {
1490 	case HAMMER2_CHECK_NONE:
1491 		if (StrictMode == 0)
1492 			success = 1;
1493 		break;
1494 	case HAMMER2_CHECK_DISABLED:
1495 		if (StrictMode == 0)
1496 			success = 1;
1497 		break;
1498 	case HAMMER2_CHECK_ISCSI32:
1499 		cv = hammer2_icrc32(data, bytes);
1500 		if (bref->check.iscsi32.value == cv) {
1501 			success = 1;
1502 		}
1503 		break;
1504 	case HAMMER2_CHECK_XXHASH64:
1505 		cv64 = XXH64(data, bytes, XXH_HAMMER2_SEED);
1506 		if (bref->check.xxhash64.value == cv64) {
1507 			success = 1;
1508 		}
1509 		break;
1510 	case HAMMER2_CHECK_SHA192:
1511 		SHA256_Init(&hash_ctx);
1512 		SHA256_Update(&hash_ctx, data, bytes);
1513 		SHA256_Final(u.digest, &hash_ctx);
1514 		u.digest64[2] ^= u.digest64[3];
1515 		if (memcmp(u.digest, bref->check.sha192.data,
1516 			   sizeof(bref->check.sha192.data)) == 0)
1517 		{
1518 			success = 1;
1519 		}
1520 		break;
1521 	case HAMMER2_CHECK_FREEMAP:
1522 		cv = hammer2_icrc32(data, bytes);
1523 		if (bref->check.freemap.icrc32 == cv) {
1524 			success = 1;
1525 		}
1526 		break;
1527 	}
1528 	return success;
1529 }
1530 
1531 /*
1532  * Convert a hammer2 uuid to a uid or gid.
1533  */
1534 static uint32_t
1535 hammer2_to_unix_xid(const uuid_t *uuid)
1536 {
1537         return(*(const uint32_t *)&uuid->node[2]);
1538 }
1539 
1540 /*
1541  * Read from disk image, with caching to improve performance.
1542  *
1543  * Use a very simple LRU algo with 16 entries, linearly checked.
1544  */
#define SDCCOUNT	16		/* cache slots, power of 2 */
#define SDCMASK		(SDCCOUNT - 1)

/*
 * One cache slot: a full-sized physical hammer2 block plus the volume
 * and block-aligned offset it was read from.  (last) is the LRU stamp.
 */
typedef struct sdccache {
	char		buf[HAMMER2_PBUFSIZE];
	hammer2_off_t	offset;
	hammer2_volume_t *vol;
	int64_t		last;
} sdccache_t;

static sdccache_t SDCCache[SDCCOUNT];
static int64_t SDCLast;			/* monotonic counter for LRU stamps */
1557 
1558 static void *
1559 hammer2_cache_read(hammer2_off_t data_off, size_t *bytesp)
1560 {
1561 	hammer2_off_t poff;
1562 	hammer2_off_t pbase;
1563 	hammer2_volume_t *vol;
1564 	sdccache_t *sdc;
1565 	sdccache_t *sdc_worst;
1566 	int i;
1567 
1568 	*bytesp = 0;
1569 
1570 	/*
1571 	 * Translate logical offset to volume and physical offset.
1572 	 * Return NULL with *bytesp set to 0 to indicate pre-I/O
1573 	 * sanity check failure.
1574 	 */
1575 	vol = hammer2_get_volume(data_off);
1576         if (vol == NULL)
1577                 return NULL;
1578         poff = (data_off - vol->offset) & ~0x1FL;
1579 	*bytesp = 1 << (data_off & 0x1F);
1580 
1581 	pbase = poff & ~HAMMER2_PBUFMASK64;
1582 
1583 	/*
1584 	 * Must not straddle two full-sized hammer2 blocks
1585 	 */
1586 	if ((poff ^ (poff + *bytesp - 1)) & ~HAMMER2_PBUFMASK64) {
1587 		*bytesp = 0;
1588 		return NULL;
1589 	}
1590 
1591 	/*
1592 	 * I/Os are powers of 2 in size and must be aligned to at least
1593 	 * the I/O size.
1594 	 */
1595 	if (poff & ~0x1FL & (*bytesp - 1)) {
1596 		*bytesp = 0;
1597 		return NULL;
1598 	}
1599 
1600 	/*
1601 	 * LRU match lookup
1602 	 */
1603 	sdc_worst = NULL;
1604 	for (i = 0; i < SDCCOUNT; ++i) {
1605 		sdc = &SDCCache[i];
1606 
1607 		if (sdc->vol == vol && sdc->offset == pbase) {
1608 			sdc->last = ++SDCLast;
1609 			return (&sdc->buf[poff - pbase]);
1610 		}
1611 		if (sdc_worst == NULL || sdc_worst->last > sdc->last)
1612 			sdc_worst = sdc;
1613 	}
1614 
1615 	/*
1616 	 * Fallback to I/O if not found, using oldest entry
1617 	 *
1618 	 * On failure we leave (*bytesp) intact to indicate that an I/O
1619 	 * was attempted.
1620 	 */
1621 	sdc = sdc_worst;
1622 	sdc->vol = vol;
1623 	sdc->offset = pbase;
1624 	sdc->last = ++SDCLast;
1625 
1626 	if (pread(vol->fd, sdc->buf, HAMMER2_PBUFSIZE, pbase) !=
1627 	    HAMMER2_PBUFSIZE)
1628 	{
1629 		sdc->offset = (hammer2_off_t)-1;
1630 		sdc->last = 0;
1631 		return NULL;
1632 	}
1633 	return (&sdc->buf[poff - pbase]);
1634 }
1635