xref: /dragonfly/sys/vfs/hammer2/hammer2_disk.h (revision 4d0c54c1)
1 /*
2  * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in
16  *    the documentation and/or other materials provided with the
17  *    distribution.
18  * 3. Neither the name of The DragonFly Project nor the names of its
19  *    contributors may be used to endorse or promote products derived
20  *    from this software without specific, prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
26  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 #ifndef VFS_HAMMER2_DISK_H_
36 #define VFS_HAMMER2_DISK_H_
37 
38 #ifndef _SYS_UUID_H_
39 #include <sys/uuid.h>
40 #endif
41 #ifndef _SYS_DMSG_H_
42 #include <sys/dmsg.h>
43 #endif
44 
45 /*
46  * The structures below represent the on-disk media structures for the HAMMER2
47  * filesystem.  Note that all fields for on-disk structures are naturally
48  * aligned.  The host endian format is typically used - compatibility is
49  * possible if the implementation detects reversed endian and adjusts accesses
50  * accordingly.
51  *
52  * HAMMER2 primarily revolves around the directory topology:  inodes,
53  * directory entries, and block tables.  Block device buffer cache buffers
54  * are always 64KB.  Logical file buffers are typically 16KB.  All data
55  * references utilize 64-bit byte offsets.
56  *
57  * Free block management is handled independently using blocks reserved by
58  * the media topology.
59  */
60 
61 /*
62  * The data at the end of a file or directory may be a fragment in order
63  * to optimize storage efficiency.  The minimum fragment size is 64 bytes.
64  * Since allocations are in powers of 2 fragments must also be sized in
65  * powers of 2 (64, 128, 256, ... 65536).
66  *
67  * For the moment the maximum allocation size is HAMMER2_PBUFSIZE (64K),
68  * which is 2^16.  Larger extents may be supported in the future.
69  *
70  * A full indirect block supports 1024 x 64-byte blockrefs.
71  *
72  * A maximally sized file (2^64-1 bytes) requires 5 indirect block levels.
73  * The hammer2_blockset in the volume header or file inode has another 8
74  * entries, giving us 66+3 = 69 bits of address space.  However, some bits
75  * are taken up by (potentially) requests for redundant copies.  HAMMER2
76  * currently supports up to 8 copies, which brings the address space down
77  * to 66 bits and gives us 2 bits of leeway.
78  */
#define HAMMER2_MIN_ALLOC	64	/* minimum allocation size (bytes) */
#define HAMMER2_MIN_RADIX	6	/* minimum allocation size as 2^N */
#define HAMMER2_MAX_RADIX	16	/* maximum allocation size as 2^N */
#define HAMMER2_KEY_RADIX	64	/* number of bits in a key */
83 
84 /*
85  * MINALLOCSIZE		- The minimum allocation size.  This can be smaller
86  *		  	  or larger than the minimum physical IO size.
87  *
88  *			  NOTE: Should not be larger than 1K since inodes
89  *				are 1K.
90  *
91  * MINIOSIZE		- The minimum IO size.  This must be less than
92  *			  or equal to HAMMER2_PBUFSIZE.
93  *
94  *			  XXX currently must be set to MINALLOCSIZE until/if
95  *			      we deal with recursive buffer cache locks.
96  *
97  * HAMMER2_PBUFSIZE	- Topological block size used by files for all
98  *			  blocks except the block straddling EOF.
99  *
100  * HAMMER2_SEGSIZE	- Allocation map segment size, nominally 2MB (the
 *			  define below is currently 65536 * 8 = 512KB)
101  */
102 
#define HAMMER2_SEGSIZE		(65536 * 8)	/* 512KB */

#define HAMMER2_PBUFRADIX	16	/* physical buf (1<<16) bytes */
#define HAMMER2_PBUFSIZE	65536
#define HAMMER2_LBUFRADIX	14	/* logical buf (1<<14) bytes */
#define HAMMER2_LBUFSIZE	16384

#define HAMMER2_MINALLOCRADIX	10	/* minimum block allocation size */
#define HAMMER2_MINALLOCSIZE	1024

/*
 * The minimum physical IO size currently aliases the minimum allocation
 * size.  The disabled alternative (one full physical buffer) is kept for
 * reference.
 */
#if 0
#define HAMMER2_MINIORADIX	16	/* minimum physical IO size */
#define HAMMER2_MINIOSIZE	65536
#endif
#define HAMMER2_MINIORADIX	HAMMER2_MINALLOCRADIX
#define HAMMER2_MINIOSIZE	HAMMER2_MINALLOCSIZE

/*
 * Indirect block sizing in bytes and in blockref entries.
 */
#define HAMMER2_IND_BYTES_MIN	4096	/* first indirect layer only */
#define HAMMER2_IND_BYTES_MAX	HAMMER2_PBUFSIZE
#define HAMMER2_IND_COUNT_MIN	(HAMMER2_IND_BYTES_MIN / \
				 sizeof(hammer2_blockref_t))
#define HAMMER2_IND_COUNT_MAX	(HAMMER2_IND_BYTES_MAX / \
				 sizeof(hammer2_blockref_t))
125 
126 /*
127  * HAMMER2 processes blockrefs in sets of 8.  The set is fully associative,
128  * is not sorted, and may contain holes.
129  *
130  * A full indirect block supports 1024 blockrefs.
131  *
132  * An inode embeds one set of blockrefs but may also use the data area for
133  * up to 512 bytes of direct data.
134  */
#define HAMMER2_SET_COUNT	8	/* direct entries & associativity */
#define HAMMER2_SET_RADIX	3	/* log2(HAMMER2_SET_COUNT) */
#define HAMMER2_EMBEDDED_BYTES	512	/* direct data area in an inode */
#define HAMMER2_EMBEDDED_RADIX	9	/* log2(HAMMER2_EMBEDDED_BYTES) */

/* Low-bit masks for the power-of-2 buffer and segment sizes. */
#define HAMMER2_PBUFMASK	(HAMMER2_PBUFSIZE - 1)
#define HAMMER2_LBUFMASK	(HAMMER2_LBUFSIZE - 1)
#define HAMMER2_SEGMASK		(HAMMER2_SEGSIZE - 1)

/* The same sizes and masks widened to hammer2_off_t. */
#define HAMMER2_LBUFMASK64	((hammer2_off_t)HAMMER2_LBUFMASK)
#define HAMMER2_PBUFSIZE64	((hammer2_off_t)HAMMER2_PBUFSIZE)
#define HAMMER2_PBUFMASK64	((hammer2_off_t)HAMMER2_PBUFMASK)
#define HAMMER2_SEGSIZE64	((hammer2_off_t)HAMMER2_SEGSIZE)
#define HAMMER2_SEGMASK64	((hammer2_off_t)HAMMER2_SEGMASK)
149 
/* Textual UUID constant for HAMMER2 (36 characters plus NUL). */
#define HAMMER2_UUID_STRING	"5cbb9ad1-862d-11dc-a94d-01301bb8a9f5"
151 
152 /*
153  * A HAMMER2 filesystem is always sized in multiples of 8MB.
154  *
155  * A 4MB segment is reserved at the beginning of each 2GB zone.  This segment
156  * contains the volume header, the free block table, and possibly other
157  * information in the future.  4MB = 64 x 64K blocks.
158  */
/* Volume size granularity (8MB) and its mask. */
#define HAMMER2_VOLUME_ALIGN		(8 * 1024 * 1024)
#define HAMMER2_VOLUME_ALIGN64		((hammer2_off_t)HAMMER2_VOLUME_ALIGN)
#define HAMMER2_VOLUME_ALIGNMASK	(HAMMER2_VOLUME_ALIGN - 1)
#define HAMMER2_VOLUME_ALIGNMASK64     ((hammer2_off_t)HAMMER2_VOLUME_ALIGNMASK)

/* newfs uses the same alignment as the volume. */
#define HAMMER2_NEWFS_ALIGN		(HAMMER2_VOLUME_ALIGN)
#define HAMMER2_NEWFS_ALIGN64		((hammer2_off_t)HAMMER2_VOLUME_ALIGN)
#define HAMMER2_NEWFS_ALIGNMASK		(HAMMER2_VOLUME_ALIGN - 1)
#define HAMMER2_NEWFS_ALIGNMASK64	((hammer2_off_t)HAMMER2_NEWFS_ALIGNMASK)

/* 2GB zones, each starting with a 4MB reserved segment. */
#define HAMMER2_ZONE_BYTES64		(2LLU * 1024 * 1024 * 1024)
#define HAMMER2_ZONE_MASK64		(HAMMER2_ZONE_BYTES64 - 1)
#define HAMMER2_ZONE_SEG		(4 * 1024 * 1024)
#define HAMMER2_ZONE_SEG64		((hammer2_off_t)HAMMER2_ZONE_SEG)
#define HAMMER2_ZONE_BLOCKS_SEG		(HAMMER2_ZONE_SEG / HAMMER2_PBUFSIZE)
174 
175 /*
176  * Two linear areas can be reserved after the initial 4MB segment in the base
177  * zone (the one starting at offset 0).  These areas are NOT managed by the
178  * block allocator and do not fall under HAMMER2 crc checking rules based
179  * at the volume header (but can be self-CRCd internally, depending).
180  */
/* Boot area size limits: minimum, nominal, maximum. */
#define HAMMER2_BOOT_MIN_BYTES		HAMMER2_VOLUME_ALIGN
#define HAMMER2_BOOT_NOM_BYTES		(64*1024*1024)
#define HAMMER2_BOOT_MAX_BYTES		(256*1024*1024)

/* Redo area size limits: minimum, nominal, maximum. */
#define HAMMER2_REDO_MIN_BYTES		HAMMER2_VOLUME_ALIGN
#define HAMMER2_REDO_NOM_BYTES		(256*1024*1024)
#define HAMMER2_REDO_MAX_BYTES		(1024*1024*1024)
188 
189 /*
190  * Most HAMMER2 types are implemented as unsigned 64-bit integers.
191  * Transaction ids are monotonic.
192  *
193  * We utilize 32-bit iSCSI CRCs.
194  */
typedef uint64_t hammer2_tid_t;		/* transaction id */
typedef uint64_t hammer2_off_t;		/* media byte offset */
typedef uint64_t hammer2_key_t;		/* lookup key */
typedef uint32_t hammer2_crc32_t;	/* 32-bit CRC value */
199 
200 /*
201  * Miscellaneous ranges (all are unsigned).
202  */
#define HAMMER2_MIN_TID		1ULL
#define HAMMER2_MAX_TID		0xFFFFFFFFFFFFFFFFULL
#define HAMMER2_MIN_KEY		0ULL
#define HAMMER2_MAX_KEY		0xFFFFFFFFFFFFFFFFULL
#define HAMMER2_MIN_OFFSET	0ULL
#define HAMMER2_MAX_OFFSET	0xFFFFFFFFFFFFFFFFULL
209 
210 /*
211  * HAMMER2 data offset special cases and masking.
212  *
213  * All HAMMER2 data offsets have to be broken down into a 64K buffer base
214  * offset (HAMMER2_OFF_MASK_HI) and a 64K buffer index (HAMMER2_OFF_MASK_LO).
215  *
216  * Indexes into physical buffers are always 64-byte aligned.  The low 6 bits
217  * of the data offset field specifies how large the data chunk being pointed
218  * to as a power of 2.  This value typically ranges from HAMMER2_MIN_RADIX
219  * to HAMMER2_MAX_RADIX (6-16).  Larger values may be supported in the future
220  * to support file extents.
221  */
#define HAMMER2_OFF_BAD		((hammer2_off_t)-1)
#define HAMMER2_OFF_MASK	0xFFFFFFFFFFFFFFC0ULL	/* offset bits */
#define HAMMER2_OFF_MASK_LO	(HAMMER2_OFF_MASK & HAMMER2_PBUFMASK64)
#define HAMMER2_OFF_MASK_HI	(~HAMMER2_PBUFMASK64)
#define HAMMER2_OFF_MASK_RADIX	0x000000000000003FULL	/* low 6 bits */
#define HAMMER2_MAX_COPIES	6
228 
229 /*
230  * HAMMER2 directory support and pre-defined keys
231  */
#define HAMMER2_DIRHASH_VISIBLE	0x8000000000000000ULL	/* bit 63 */
#define HAMMER2_DIRHASH_USERMSK	0x7FFFFFFFFFFFFFFFULL
#define HAMMER2_DIRHASH_LOMASK	0x0000000000007FFFULL
#define HAMMER2_DIRHASH_HIMASK	0xFFFFFFFFFFFF0000ULL
#define HAMMER2_DIRHASH_FORCED	0x0000000000008000ULL	/* bit forced on */

#define HAMMER2_SROOT_KEY	0x0000000000000000ULL	/* volume to sroot */
239 
240 /*
241  * The media block reference structure.  This forms the core of the HAMMER2
242  * media topology recursion.  This 64-byte data structure is embedded in the
243  * volume header, in inodes (which are also directory entries), and in
244  * indirect blocks.
245  *
246  * A blockref references a single media item, which typically can be a
247  * directory entry (aka inode), indirect block, or data block.
248  *
249  * The primary feature a blockref represents is the ability to validate
250  * the entire tree underneath it via its check code.  Any modification to
251  * anything propagates up the blockref tree all the way to the root, replacing
252  * the related blocks.  Propagations can shortcut to the volume root to
253  * implement the 'fast syncing' feature but this only delays the eventual
254  * propagation.
255  *
256  * The check code can be a simple 32-bit iscsi code, a 64-bit crc,
257  * or as complex as a 192 bit cryptographic hash.  192 bits is the maximum
258  * supported check code size, which is not sufficient for unverified dedup
259  * UNLESS one doesn't mind once-in-a-blue-moon data corruption (such as when
260  * farming web data).  HAMMER2 has an unverified dedup feature for just this
261  * purpose.
262  */
263 struct hammer2_blockref {		/* MUST BE EXACTLY 64 BYTES */
264 	uint8_t		type;		/* type of underlying item */
265 	uint8_t		methods;	/* check method & compression method */
266 	uint8_t		copyid;		/* specify which copy this is */
267 	uint8_t		keybits;	/* #of keybits masked off 0=leaf */
268 	uint8_t		vradix;		/* virtual data/meta-data size */
269 	uint8_t		flags;		/* blockref flags */
270 	uint8_t		reserved06;
271 	uint8_t		reserved07;
272 	hammer2_key_t	key;		/* key specification */
273 	hammer2_tid_t	mirror_tid;	/* propagate for mirror scan */
274 	hammer2_tid_t	modify_tid;	/* modifications sans propagation */
275 	hammer2_off_t	data_off;	/* low 6 bits is phys size (radix)*/
276 	union {				/* check info */
277 		char	buf[24];
278 		struct {
279 			uint32_t value;
280 			uint32_t unused[5];
281 		} iscsi32;
282 		struct {
283 			uint64_t value;
284 			uint64_t unused[2];
285 		} crc64;
286 		struct {
287 			char data[24];
288 		} sha192;
289 	} check;
290 };
291 
292 typedef struct hammer2_blockref hammer2_blockref_t;
293 
/* Flag bits (see the blockref 'flags' field). */
#define HAMMER2_BREF_SYNC1		0x01	/* modification synchronized */
#define HAMMER2_BREF_SYNC2		0x02	/* modification committed */
#define HAMMER2_BREF_DESYNCCHLD		0x04	/* desynchronize children */
#define HAMMER2_BREF_DELETED		0x80	/* indicates a deletion */

#define HAMMER2_BLOCKREF_BYTES		64	/* blockref struct in bytes */

/* Item types (see the blockref 'type' field). */
#define HAMMER2_BREF_TYPE_EMPTY		0
#define HAMMER2_BREF_TYPE_INODE		1
#define HAMMER2_BREF_TYPE_INDIRECT	2
#define HAMMER2_BREF_TYPE_DATA		3
#define HAMMER2_BREF_TYPE_VOLUME	255	/* pseudo-type */
306 
/*
 * Encode/decode two 4-bit method fields sharing one byte: compression
 * method in the low nibble, check method in the high nibble.  The
 * encoders mask their argument to 4 bits so an out-of-range value cannot
 * spill into the other field; in-range values (0-15) encode as before.
 */
#define HAMMER2_ENC_COMPMETHOD(n)	((n) & 15)
#define HAMMER2_ENC_CHECKMETHOD(n)	(((n) & 15) << 4)
#define HAMMER2_DEC_COMPMETHOD(n)	((n) & 15)
#define HAMMER2_DEC_CHECKMETHOD(n)	(((n) >> 4) & 15)
311 
312 /*
313  * HAMMER2 block references are collected into sets of 8 blockrefs.  These
314  * sets are fully associative, meaning the elements making up a set are
315  * not sorted in any way and may contain duplicate entries, holes, or
316  * entries which shortcut multiple levels of indirection.  Sets are used
317  * in various ways:
318  *
319  * (1) When redundancy is desired a set may contain several duplicate
320  *     entries pointing to different copies of the same data.  Up to 8 copies
321  *     are supported but the set structure becomes a bit inefficient once
322  *     you go over 4.
323  *
324  * (2) The blockrefs in a set can shortcut multiple levels of indirections
325  *     within the bounds imposed by the parent of set.
326  *
327  * When a set fills up another level of indirection is inserted, moving
328  * some or all of the set's contents into indirect blocks placed under the
329  * set.  This is a top-down approach in that indirect blocks are not created
330  * until the set actually becomes full (that is, the entries in the set can
331  * shortcut the indirect blocks when the set is not full).  Depending on how
332  * things are filled multiple indirect blocks will eventually be created.
333  */
334 struct hammer2_blockset {
335 	hammer2_blockref_t	blockref[HAMMER2_SET_COUNT];
336 };
337 
338 typedef struct hammer2_blockset hammer2_blockset_t;
339 
340 /*
341  * Catch programmer snafus
342  */
343 #if (1 << HAMMER2_SET_RADIX) != HAMMER2_SET_COUNT
344 #error "hammer2 direct radix is incorrect"
345 #endif
346 #if (1 << HAMMER2_PBUFRADIX) != HAMMER2_PBUFSIZE
347 #error "HAMMER2_PBUFRADIX and HAMMER2_PBUFSIZE are inconsistent"
348 #endif
349 #if (1 << HAMMER2_MIN_RADIX) != HAMMER2_MIN_ALLOC
350 #error "HAMMER2_MIN_RADIX and HAMMER2_MIN_ALLOC are inconsistent"
351 #endif
352 
353 /*
354  * The media indirect block structure.
355  */
356 struct hammer2_indblock_data {
357 	hammer2_blockref_t blockref[HAMMER2_IND_COUNT_MAX];
358 };
359 
360 typedef struct hammer2_indblock_data hammer2_indblock_data_t;
361 
362 /*
363  * In HAMMER2 inodes ARE directory entries, with a special exception for
364  * hardlinks.  The inode number is stored in the inode rather than being
365  * based on the location of the inode (since the location moves every time
366  * the inode or anything underneath the inode is modified).
367  *
368  * The inode is 1024 bytes, made up of 256 bytes of meta-data, 256 bytes
369  * for the filename, and 512 bytes worth of direct file data OR an embedded
370  * blockset.
371  *
372  * Directories represent one inode per blockref.  Inodes are not laid out
373  * as a file but instead are represented by the related blockrefs.  The
374  * blockrefs, in turn, are indexed by the 64-bit directory hash key.  Remember
375  * that blocksets are fully associative, so a certain degree efficiency is
376  * achieved just from that.
377  *
378  * Up to 512 bytes of direct data can be embedded in an inode, and since
379  * inodes are essentially directory entries this also means that small data
380  * files end up simply being laid out linearly in the directory, resulting
381  * in fewer seeks and highly optimal access.
382  *
383  * The compression mode can be changed at any time in the inode and is
384  * recorded on a blockref-by-blockref basis.
385  *
386  * Hardlinks are supported via the inode map.  Essentially the way a hardlink
387  * works is that all individual directory entries representing the same file
388  * are special cased and specify the same inode number.  The actual file
389  * is placed in the nearest parent directory that is parent to all instances
390  * of the hardlink.  If all hardlinks to a file are in the same directory
391  * the actual file will also be placed in that directory.  This file uses
392  * the inode number as the directory entry key and is invisible to normal
393  * directory scans.  Real directory entry keys are differentiated from the
394  * inode number key via bit 63.  Access to the hardlink silently looks up
395  * the real file and forwards all operations to that file.  Removal of the
396  * last hardlink also removes the real file.
397  *
398  * (attr_tid) is only updated when the inode's specific attributes or regular
399  * file size has changed, and affects path lookups and stat.  (attr_tid)
400  * represents a special cache coherency lock under the inode.  The inode
401  * blockref's modify_tid will always cover it.
402  *
403  * (dirent_tid) is only updated when an entry under a directory inode has
404  * been created, deleted, renamed, or had its attributes change, and affects
405  * directory lookups and scans.  (dirent_tid) represents another special cache
406  * coherency lock under the inode.  The inode blockref's modify_tid will
407  * always cover it.
408  */
#define HAMMER2_INODE_BYTES		1024	/* (asserted by code) */
#define HAMMER2_INODE_MAXNAME		256	/* maximum name in bytes */
#define HAMMER2_INODE_VERSION_ONE	1
412 
413 struct hammer2_inode_data {
414 	uint16_t	version;	/* 0000 inode data version */
415 	uint16_t	reserved02;	/* 0002 */
416 
417 	/*
418 	 * core inode attributes, inode type, misc flags
419 	 */
420 	uint32_t	uflags;		/* 0004 chflags */
421 	uint32_t	rmajor;		/* 0008 available for device nodes */
422 	uint32_t	rminor;		/* 000C available for device nodes */
423 	uint64_t	ctime;		/* 0010 inode change time */
424 	uint64_t	mtime;		/* 0018 modified time */
425 	uint64_t	atime;		/* 0020 access time (unsupported) */
426 	uint64_t	btime;		/* 0028 birth time */
427 	uuid_t		uid;		/* 0030 uid / degenerate unix uid */
428 	uuid_t		gid;		/* 0040 gid / degenerate unix gid */
429 
430 	uint8_t		type;		/* 0050 object type */
431 	uint8_t		op_flags;	/* 0051 operational flags */
432 	uint16_t	cap_flags;	/* 0052 capability flags */
433 	uint32_t	mode;		/* 0054 unix modes (typ low 16 bits) */
434 
435 	/*
436 	 * inode size, identification, localized recursive configuration
437 	 * for compression and backup copies.
438 	 */
439 	hammer2_tid_t	inum;		/* 0058 inode number */
440 	hammer2_off_t	size;		/* 0060 size of file */
441 	uint64_t	nlinks;		/* 0068 hard links (typ only dirs) */
442 	hammer2_tid_t	iparent;	/* 0070 parent inum (recovery only) */
443 	hammer2_key_t	name_key;	/* 0078 full filename key */
444 	uint16_t	name_len;	/* 0080 filename length */
445 	uint8_t		ncopies;	/* 0082 ncopies to local media */
446 	uint8_t		comp_algo;	/* 0083 compression request & algo */
447 
448 	/*
449 	 * These fields are currently only applicable to PFSROOTs.
450 	 *
451 	 * NOTE: We can't use {volume_data->fsid, pfs_clid} to uniquely
452 	 *	 identify an instance of a PFS in the cluster because
453 	 *	 a mount may contain more than one copy of the PFS as
454 	 *	 a separate node.  {pfs_clid, pfs_fsid} must be used for
455 	 *	 registration in the cluster.
456 	 */
457 	uint8_t		target_type;	/* 0084 hardlink target type */
458 	uint8_t		reserved85;	/* 0085 */
459 	uint8_t		reserved86;	/* 0086 */
460 	uint8_t		pfs_type;	/* 0087 (if PFSROOT) node type */
461 	uint64_t	pfs_inum;	/* 0088 (if PFSROOT) inum allocator */
462 	uuid_t		pfs_clid;	/* 0090 (if PFSROOT) cluster uuid */
463 	uuid_t		pfs_fsid;	/* 00A0 (if PFSROOT) unique uuid */
464 
465 	/*
466 	 * Quotas and cumulative sub-tree counters.
467 	 */
468 	hammer2_off_t	data_quota;	/* 00B0 subtree quota in bytes */
469 	hammer2_off_t	data_count;	/* 00B8 subtree byte count */
470 	hammer2_off_t	inode_quota;	/* 00C0 subtree quota inode count */
471 	hammer2_off_t	inode_count;	/* 00C8 subtree inode count */
472 	hammer2_tid_t	attr_tid;	/* 00D0 attributes changed */
473 	hammer2_tid_t	dirent_tid;	/* 00D8 directory/attr changed */
474 	uint64_t	reservedE0;	/* 00E0 */
475 	uint64_t	reservedE8;	/* 00E8 */
476 	uint64_t	reservedF0;	/* 00F0 */
477 	uint64_t	reservedF8;	/* 00F8 */
478 
479 	unsigned char	filename[HAMMER2_INODE_MAXNAME];
480 					/* 0100-01FF (256 char, unterminated) */
481 	union {				/* 0200-03FF (64x8 = 512 bytes) */
482 		struct hammer2_blockset blockset;
483 		char data[HAMMER2_EMBEDDED_BYTES];
484 	} u;
485 };
486 
487 typedef struct hammer2_inode_data hammer2_inode_data_t;
488 
/* Operational flag bits (see the inode 'op_flags' field). */
#define HAMMER2_OPFLAG_DIRECTDATA	0x01
#define HAMMER2_OPFLAG_PFSROOT		0x02
#define HAMMER2_OPFLAG_COPYIDS		0x04	/* copyids override parent */

/* Object types (see the inode 'type' field); value 3 is not assigned. */
#define HAMMER2_OBJTYPE_UNKNOWN		0
#define HAMMER2_OBJTYPE_DIRECTORY	1
#define HAMMER2_OBJTYPE_REGFILE		2
#define HAMMER2_OBJTYPE_FIFO		4
#define HAMMER2_OBJTYPE_CDEV		5
#define HAMMER2_OBJTYPE_BDEV		6
#define HAMMER2_OBJTYPE_SOFTLINK	7
#define HAMMER2_OBJTYPE_HARDLINK	8	/* dummy entry for hardlink */
#define HAMMER2_OBJTYPE_SOCKET		9
#define HAMMER2_OBJTYPE_WHITEOUT	10

/* Reserved copyid values. */
#define HAMMER2_COPYID_NONE		0
#define HAMMER2_COPYID_LOCAL		((uint8_t)-1)

/* Compression methods. */
#define HAMMER2_COMP_NONE		0
#define HAMMER2_COMP_AUTOZERO		1

/* Check methods. */
#define HAMMER2_CHECK_NONE		0
#define HAMMER2_CHECK_ICRC		1
512 
513 /*
514  * PEER types identify connections and help cluster controller filter
515  * out unwanted SPANs.
516  */
/* HAMMER2 names for the DMSG peer types. */
#define HAMMER2_PEER_NONE		DMSG_PEER_NONE
#define HAMMER2_PEER_CLUSTER		DMSG_PEER_CLUSTER
#define HAMMER2_PEER_BLOCK		DMSG_PEER_BLOCK
#define HAMMER2_PEER_HAMMER2		DMSG_PEER_HAMMER2

#define HAMMER2_COPYID_COUNT		DMSG_COPYID_COUNT

/*
 * PFS types identify a PFS on media and in LNK_SPAN messages.
 */
#define HAMMER2_PFSTYPE_NONE		DMSG_PFSTYPE_NONE
#define HAMMER2_PFSTYPE_ADMIN		DMSG_PFSTYPE_ADMIN
#define HAMMER2_PFSTYPE_CLIENT		DMSG_PFSTYPE_CLIENT
#define HAMMER2_PFSTYPE_CACHE		DMSG_PFSTYPE_CACHE
#define HAMMER2_PFSTYPE_COPY		DMSG_PFSTYPE_COPY
#define HAMMER2_PFSTYPE_SLAVE		DMSG_PFSTYPE_SLAVE
#define HAMMER2_PFSTYPE_SOFT_SLAVE	DMSG_PFSTYPE_SOFT_SLAVE
#define HAMMER2_PFSTYPE_SOFT_MASTER	DMSG_PFSTYPE_SOFT_MASTER
#define HAMMER2_PFSTYPE_MASTER		DMSG_PFSTYPE_MASTER
#define HAMMER2_PFSTYPE_MAX		DMSG_PFSTYPE_MAX
537 
538 /*
539  * The allocref structure represents the allocation table.  One 64K block
540  * is broken down into 4096 x 16 byte entries.  Each indirect block chops
541  * 12 bits off the 64-bit storage space, with leaf entries representing
542  * 64KB blocks.  So:  (12, 12, 12, 12, 16) = 64 bit storage space.
543  *
544  * Each 64K freemap block breaks the 4096 entries into a 64x64 tree with
545  * big_hint1 representing the top level every 64th entry and big_hint2
546  * representing the lower level in each entry.  These fields specify the
547  * largest contiguous radix (1-63) available for allocation in the related
548  * sub-tree.  The largest contiguous radix available for the entire block
549  * is saved in the parent (for the root this will be alloc_blockref in the
550  * volume header).  The hints may be larger than actual and will be corrected
551  * on the fly but must not be smaller.  The allocator uses the hints to
552  * very quickly locate nearby blocks of the desired size.
553  *
554  * In indirect blocks the 64-bit free[_or_mask] field stores the total free
555  * space for each of the 4096 sub-nodes in bytes.  The total free space
556  * represented by the indirect block is stored in its parent.
557  *
558  * Each leaf element represents a 64K block.  A bitmap replaces the free space
559  * count, giving us a 1KB allocation resolution.  A micro-allocation append
560  * offset replaces the icrc field.  The micro-allocation feature is not
561  * currently implemented and the field will be set to 65536.
562  *
563  * The allocation map uses reserved blocks so no data block reference is
564  * required, only a bit in the flags field to specify which of two possible
565  * reserved blocks to use.  This allows the allocation map to be flushed to
566  * disk with minimal synchronization.
567  */
struct hammer2_allocref {		/* 16 bytes */
	uint32_t	icrc_or_app;	/* node: icrc, leaf: append offset */
	uint16_t	flags;		/* HAMMER2_ALLOCREF_xxx */
	uint8_t		big_hint1;	/* upper level hint */
	uint8_t		big_hint2;	/* lower level hint */
	uint64_t	free_or_mask;	/* node: free bytes, leaf: bitmask */
};

typedef struct hammer2_allocref hammer2_allocref_t;
577 
578 /*
579  * WARNING - allocref size x entries must equate to the hammer buffer size,
580  *	     and 12 bits per recursion is assumed by the allocator.
581  *
582  * ALTA-D	Since no data_offset is specified flags are needed to select
583  *		which sub-block to recurse down into for root & internal nodes.
584  *		(only ALTA and ALTB is currently supported).
585  *
586  * LEAF		Terminal entry, always set for leafs.  May be used to support
587  *		4MB extent allocations and early termination in the future.
588  *		(not required to shortcut allocation scans as the big_hint1/2
589  *		fields are used for this).
590  */
#define HAMMER2_ALLOCREF_BYTES		16	/* structure size */
#define HAMMER2_ALLOCREF_ENTRIES	4096	/* entries per block */
#define HAMMER2_ALLOCREF_RADIX		12	/* log2(entries) */
594 
595 #if (HAMMER2_ALLOCREF_BYTES * HAMMER2_ALLOCREF_ENTRIES) != HAMMER2_PBUFSIZE
596 #error "allocref parameters do not fit in hammer buffer"
597 #endif
598 #if (1 << HAMMER2_ALLOCREF_RADIX) != HAMMER2_ALLOCREF_ENTRIES
599 #error "allocref parameters are inconsistent"
600 #endif
601 
#define HAMMER2_ALLOCREF_ALTMASK	0x0003	/* select block for recurse */
#define HAMMER2_ALLOCREF_ALTA		0x0000
#define HAMMER2_ALLOCREF_ALTB		0x0001
#define HAMMER2_ALLOCREF_ALTC		0x0002	/* unsupported */
#define HAMMER2_ALLOCREF_ALTD		0x0003	/* unsupported */
#define HAMMER2_ALLOCREF_LEAF		0x0004	/* terminal entry */
608 
609 /*
610  * The volume header eats a 64K block.  There is currently an issue where
611  * we want to try to fit all nominal filesystem updates in a 512-byte section
612  * but it may be a lost cause due to the need for a blockset.
613  *
614  * All information is stored in host byte order.  The volume header's magic
615  * number may be checked to determine the byte order.  If you wish to mount
616  * between machines w/ different endian modes you'll need filesystem code
617  * which acts on the media data consistently (either all one way or all the
618  * other).  Our code currently does not do that.
619  *
620  * A read-write mount may have to recover missing allocations by doing an
621  * incremental mirror scan looking for modifications made after alloc_tid.
622  * If alloc_tid == last_tid then no recovery operation is needed.  Recovery
623  * operations are usually very, very fast.
624  *
625  * Read-only mounts do not need to do any recovery, access to the filesystem
626  * topology is always consistent after a crash (is always consistent, period).
627  * However, there may be shortcutted blockref updates present from deep in
628  * the tree which are stored in the volume header and must be tracked on
629  * the fly.
630  *
631  * NOTE: The copyinfo[] array contains the configuration for both the
632  *	 cluster connections and any local media copies.  The volume
633  *	 header will be replicated for each local media copy.
634  *
635  *	 The mount command may specify multiple medias or just one and
636  *	 allow HAMMER2 to pick up the others when it checks the copyinfo[]
637  *	 array on mount.
638  *
639  * NOTE: root_blockref points to the super-root directory, not the root
640  *	 directory.  The root directory will be a subdirectory under the
641  *	 super-root.
642  *
643  *	 The super-root directory contains all root directories and all
644  *	 snapshots (readonly or writable).  It is possible to do a
645  *	 null-mount of the super-root using special path constructions
646  *	 relative to your mounted root.
647  *
648  * NOTE: HAMMER2 allows any subdirectory tree to be managed as if it were
649  *	 a PFS, including mirroring and storage quota operations, and this is
650  *	 preferred over creating discrete PFSs in the super-root.  Instead
651  *	 the super-root is most typically used to create writable snapshots,
652  *	 alternative roots, and so forth.  The super-root is also used by
653  *	 the automatic snapshotting mechanism.
654  */
/* Volume magic: _HBO is the value itself, _ABO is its byte-reversed form. */
#define HAMMER2_VOLUME_ID_HBO	0x48414d3205172011LLU
#define HAMMER2_VOLUME_ID_ABO	0x11201705324d4148LLU
657 
658 struct hammer2_volume_data {
659 	/*
660 	 * sector #0 - 512 bytes
661 	 */
662 	uint64_t	magic;			/* 0000 Signature */
663 	hammer2_off_t	boot_beg;		/* 0008 Boot area (future) */
664 	hammer2_off_t	boot_end;		/* 0010 (size = end - beg) */
665 	hammer2_off_t	aux_beg;		/* 0018 Aux area (future) */
666 	hammer2_off_t	aux_end;		/* 0020 (size = end - beg) */
667 	hammer2_off_t	volu_size;		/* 0028 Volume size, bytes */
668 
669 	uint32_t	version;		/* 0030 */
670 	uint32_t	flags;			/* 0034 */
671 	uint8_t		copyid;			/* 0038 copyid of phys vol */
672 	uint8_t		freemap_version;	/* 0039 freemap algorithm */
673 	uint8_t		peer_type;		/* 003A HAMMER2_PEER_xxx */
674 	uint8_t		reserved003B;		/* 003B */
675 	uint32_t	reserved003C;		/* 003C */
676 
677 	uuid_t		fsid;			/* 0040 */
678 	uuid_t		fstype;			/* 0050 */
679 
680 	/*
681 	 * allocator_size is precalculated at newfs time and does not include
682 	 * reserved blocks, boot, or redo areas.
683 	 *
684 	 * Initial non-reserved-area allocations do not use the allocation
685 	 * map but instead adjust alloc_iterator.  Dynamic allocations take
686 	 * over starting at (allocator_beg).  This makes newfs_hammer2's
687 	 * job a lot easier and can also serve as a testing jig.
688 	 */
689 	hammer2_off_t	allocator_size;		/* 0060 Total data space */
690 	hammer2_off_t   allocator_free;		/* 0068	Free space */
691 	hammer2_off_t	allocator_beg;		/* 0070 Initial allocations */
692 	hammer2_tid_t	mirror_tid;		/* 0078 best committed tid */
693 	hammer2_tid_t	alloc_tid;		/* 0080 Alloctable modify tid */
694 	hammer2_blockref_t alloc_blockref;	/* 0088-00C7 */
695 
696 	/*
697 	 * Copyids are allocated dynamically from the copyexists bitmap.
698 	 * An id from the active copies set (up to 8, see copyinfo later on)
699 	 * may still exist after the copy set has been removed from the
700 	 * volume header and its bit will remain active in the bitmap and
701 	 * cannot be reused until it is 100% removed from the hierarchy.
702 	 */
703 	uint32_t	copyexists[8];		/* 00C8-00E7 copy exists bmap */
704 	char		reserved0140[248];	/* 00E8-01DF */
705 
706 	/*
707 	 * 32 bit CRC array at the end of the first 512 byte sector.
708 	 *
709 	 * icrc_sects[7] - First 512-4 bytes of volume header (including all
710 	 *		   the other icrc's except the last one).
711 	 *
712 	 * icrc_sects[6] - Second 512-4 bytes of volume header, which is
713 	 *		   the blockset for the root.
714 	 */
715 	hammer2_crc32_t	icrc_sects[8];		/* 01E0-01FF */
716 
717 	/*
718 	 * sector #1 - 512 bytes
719 	 *
720 	 * The entire sector is used by a blockset.
721 	 */
722 	hammer2_blockset_t sroot_blockset;	/* 0200-03FF Superroot dir */
723 
724 	/*
725 	 * sector #2-7
726 	 */
727 	char	sector2[512];			/* 0400-05FF reserved */
728 	char	sector3[512];			/* 0600-07FF reserved */
729 	char	sector4[512];			/* 0800-09FF reserved */
730 	char	sector5[512];			/* 0A00-0BFF reserved */
731 	char	sector6[512];			/* 0C00-0DFF reserved */
732 	char	sector7[512];			/* 0E00-0FFF reserved */
733 
734 	/*
735 	 * sector #8-71	- 32768 bytes
736 	 *
737 	 * Contains the configuration for up to 256 copyinfo targets.  These
738 	 * specify local and remote copies operating as masters or slaves.
739 	 * copyid's 0 and 255 are reserved (0 indicates an empty slot and 255
740 	 * indicates the local media).
741 	 *
742 	 * Each inode contains a set of up to 8 copyids, either inherited
743 	 * from its parent or explicitly specified in the inode, which
744 	 * indexes into this array.
745 	 */
746 						/* 1000-8FFF copyinfo config */
747 	dmsg_vol_data_t	copyinfo[HAMMER2_COPYID_COUNT];
748 
749 	/*
750 	 * Remaining sections are reserved for future use.
751 	 */
752 	char		reserved0400[0x6FFC];	/* 9000-FFFB reserved */
753 
754 	/*
755 	 * icrc on entire volume header
756 	 */
757 	hammer2_crc32_t	icrc_volheader;		/* FFFC-FFFF full volume icrc*/
758 };
759 
760 typedef struct hammer2_volume_data hammer2_volume_data_t;
761 
762 /*
763  * Various parts of the volume header have their own iCRCs.
764  *
765  * The first 512 bytes has its own iCRC stored at the end of the 512 bytes
766  * and is not included in the icrc calculation.
767  *
768  * The second 512 bytes also has its own iCRC but it is stored in the first
769  * 512 bytes so it covers the entire second 512 bytes.
770  *
771  * The whole volume block (64KB) has an iCRC covering all but the last 4 bytes,
772  * which is where the iCRC for the whole volume is stored.  This is currently
773  * a catch-all for anything not individually iCRCd.
774  */
/* Indexes into the volume header's icrc_sects[] array. */
#define HAMMER2_VOL_ICRC_SECT0		7
#define HAMMER2_VOL_ICRC_SECT1		6

#define HAMMER2_VOLUME_BYTES		65536

/* Byte offset where each iCRC's covered range starts. */
#define HAMMER2_VOLUME_ICRC0_OFF	0
#define HAMMER2_VOLUME_ICRC1_OFF	512
#define HAMMER2_VOLUME_ICRCVH_OFF	0

/* Byte length of each iCRC's covered range. */
#define HAMMER2_VOLUME_ICRC0_SIZE	(512 - 4)
#define HAMMER2_VOLUME_ICRC1_SIZE	(512)
#define HAMMER2_VOLUME_ICRCVH_SIZE	(65536 - 4)

#define HAMMER2_VOL_VERSION_MIN		1
#define HAMMER2_VOL_VERSION_DEFAULT	1
#define HAMMER2_VOL_VERSION_WIP 	2

#define HAMMER2_NUM_VOLHDRS		4
793 
794 union hammer2_media_data {
795 	hammer2_volume_data_t	voldata;
796         hammer2_inode_data_t    ipdata;
797 	hammer2_indblock_data_t npdata;
798 	char			buf[HAMMER2_PBUFSIZE];
799 };
800 
801 typedef union hammer2_media_data hammer2_media_data_t;
802 
803 #endif
804