xref: /dragonfly/sys/vfs/hammer2/hammer2.h (revision 38b930d0)
1 /*
2  * Copyright (c) 2011-2013 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in
16  *    the documentation and/or other materials provided with the
17  *    distribution.
18  * 3. Neither the name of The DragonFly Project nor the names of its
19  *    contributors may be used to endorse or promote products derived
20  *    from this software without specific, prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
26  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 
36 /*
37  * This header file contains structures used internally by the HAMMER2
38  * implementation.  See hammer2_disk.h for on-disk structures.
39  */
40 
41 #ifndef _VFS_HAMMER2_HAMMER2_H_
42 #define _VFS_HAMMER2_HAMMER2_H_
43 
44 #include <sys/param.h>
45 #include <sys/types.h>
46 #include <sys/kernel.h>
47 #include <sys/conf.h>
48 #include <sys/systm.h>
49 #include <sys/tree.h>
50 #include <sys/malloc.h>
51 #include <sys/mount.h>
52 #include <sys/vnode.h>
53 #include <sys/proc.h>
54 #include <sys/mountctl.h>
55 #include <sys/priv.h>
56 #include <sys/stat.h>
57 #include <sys/thread.h>
58 #include <sys/globaldata.h>
59 #include <sys/lockf.h>
60 #include <sys/buf.h>
61 #include <sys/queue.h>
62 #include <sys/limits.h>
63 #include <sys/buf2.h>
64 #include <sys/signal2.h>
65 #include <sys/dmsg.h>
66 #include <sys/mutex.h>
67 #include <sys/mutex2.h>
68 
69 #include "hammer2_disk.h"
70 #include "hammer2_mount.h"
71 #include "hammer2_ioctl.h"
72 #include "hammer2_ccms.h"
73 
74 struct hammer2_chain;
75 struct hammer2_inode;
76 struct hammer2_mount;
77 struct hammer2_pfsmount;
78 struct hammer2_span;
79 struct hammer2_state;
80 struct hammer2_msg;
81 
82 /*
83  * The chain structure tracks a portion of the media topology from the
84  * root (volume) down.  Chains represent volumes, inodes, indirect blocks,
85  * data blocks, and freemap nodes and leafs.
86  *
87  * The chain structure can be multi-homed and its topological recursion
88  * (chain->core) can be shared amongst several chains.  Chain structures
89  * are topologically stable once placed in the in-memory topology (they
90  * don't move around).  Modifications which cross flush synchronization
91  * boundaries, renames, resizing, or any move of the chain to elsewhere
92  * in the topology is accomplished via the DELETE-DUPLICATE mechanism.
93  *
94  * DELETE-DUPLICATE allows HAMMER2 to track work across flush synchronization
95  * points without stalling the filesystem or corrupting the flush
96  * synchronization point.  When necessary a chain will be marked DELETED
97  * and a new, duplicate chain will be allocated.
98  *
99  * This mechanism necessarily requires that we be able to overload chains
100  * at any given layer in the topology.  Overloading is accomplished via a
101  * RBTREE recursion through chain->rbtree.
102  *
103  * Advantages:
104  *
105  *	(1) Fully coherent snapshots can be taken without requiring
106  *	    a pre-flush, resulting in extremely fast (sub-millisecond)
107  *	    snapshots.
108  *
109  *	(2) Multiple synchronization points can be in-flight at the same
110  *	    time, representing multiple snapshots or flushes.
111  *
112  *	(3) The algorithms needed to keep track of everything are actually
113  *	    not that complex.
114  *
115  * Special Considerations:
116  *
117  *	A chain is ref-counted on a per-chain basis, but the chain's lock
118  *	is associated with the shared chain_core and is not per-chain.
119  *
120  *	The power-of-2 nature of the media radix tree ensures that there
121  *	will be no overlaps which straddle edges.
122  */
123 RB_HEAD(hammer2_chain_tree, hammer2_chain);
124 TAILQ_HEAD(h2_flush_deferral_list, hammer2_chain);
125 TAILQ_HEAD(h2_core_list, hammer2_chain);
126 TAILQ_HEAD(h2_layer_list, hammer2_chain_layer);
127 
/*
 * One layer of chains.  Layers are queued on a list of type h2_layer_list
 * (see hammer2_chain_core.layerq) and each layer carries its own red-black
 * tree of chains.
 */
128 struct hammer2_chain_layer {
129 	int		good;		/* NOTE(review): meaning not visible in this header; looks like a sanity marker -- confirm */
130 	TAILQ_ENTRY(hammer2_chain_layer) entry;	/* linkage for an h2_layer_list */
131 	struct hammer2_chain_tree rbtree;	/* chains held by this layer */
132 	int		refs;		/* prevent destruction */
133 };
134 
135 typedef struct hammer2_chain_layer hammer2_chain_layer_t;
136 
/*
 * Shared core of a chain.  A core can be referenced by several chains at
 * once (they are queued on ownerq) and holds the layered sub-chain
 * topology (layerq) together with the CCMS state (cst).
 */
137 struct hammer2_chain_core {
138 	int		good;		/* NOTE(review): meaning not visible in this header; looks like a sanity marker -- confirm */
139 	struct ccms_cst	cst;		/* presumably the lock shared by all owning chains -- confirm */
140 	struct h2_core_list ownerq;	/* all chains sharing this core */
141 	struct h2_layer_list layerq;	/* queue of hammer2_chain_layer */
142 	int		live_zero;	/* blockref array opt */
143 	hammer2_tid_t	update_lo;	/* check update against parent */
144 	hammer2_tid_t	update_hi;	/* check update against parent */
145 	u_int		chain_count;	/* total chains in layers */
146 	u_int		sharecnt;	/* presumably the number of chains sharing this core -- confirm */
147 	u_int		flags;		/* presumably HAMMER2_CORE_* bits -- confirm */
148 	u_int		live_count;	/* live (not deleted) chains in tree */
149 	int		generation;	/* generation number (inserts only) */
150 };
151 
152 typedef struct hammer2_chain_core hammer2_chain_core_t;
153 
/*
 * HAMMER2_CORE_* bit values (presumably for hammer2_chain_core.flags --
 * confirm against the users in hammer2_chain.c).
 */
154 #define HAMMER2_CORE_UNUSED0001		0x0001
155 #define HAMMER2_CORE_COUNTEDBREFS	0x0002
156 
157 /*
158  * H2 is a copy-on-write filesystem.  In order to allow chains to allocate
159  * smaller blocks (down to 64-bytes), but improve performance and make
160  * clustered I/O possible using larger block sizes, the kernel buffer cache
161  * is abstracted via the hammer2_io structure.
162  */
/* Red-black tree of hammer2_io; hammer2_mount.iotree is of this type. */
163 RB_HEAD(hammer2_io_tree, hammer2_io);
164 
/*
 * Abstraction of a kernel buffer cache buffer.  Instances are nodes of a
 * hammer2_io_tree and point back at their owning hammer2_mount.
 */
165 struct hammer2_io {
166 	RB_ENTRY(hammer2_io) rbnode;	/* hammer2_io_tree linkage */
167 	struct spinlock spin;
168 	struct hammer2_mount *hmp;	/* owning device mount */
169 	struct buf	*bp;		/* underlying buffer cache buffer */
170 	struct bio	*bio;
171 	off_t		pbase;		/* presumably the device offset of the buffer -- confirm */
172 	int		psize;		/* presumably the buffer size in bytes -- confirm */
	/*
	 * Callback pointer; the arg_c/arg_p/arg_o members below have the
	 * same types as its chain/arg1/arg2 parameters and are presumably
	 * the saved arguments for an in-progress I/O -- confirm.
	 */
173 	void		(*callback)(struct hammer2_io *dio,
174 				    struct hammer2_chain *chain,
175 				    void *arg1, off_t arg2);
176 	struct hammer2_chain *arg_c;		/* INPROG I/O only */
177 	void		*arg_p;			/* INPROG I/O only */
178 	off_t		arg_o;			/* INPROG I/O only */
179 	int		refs;		/* reference count */
180 };
181 
182 typedef struct hammer2_io hammer2_io_t;
183 
184 /*
185  * Primary chain structure keeps track of the topology in-memory.
186  */
187 struct hammer2_chain {
188 	RB_ENTRY(hammer2_chain) rbnode;		/* node */
189 	TAILQ_ENTRY(hammer2_chain) core_entry;	/* contemporary chains */
190 	hammer2_chain_layer_t	*inlayer;	/* presumably the layer whose rbtree holds this chain -- confirm */
191 	hammer2_blockref_t	bref;		/* block reference (type from hammer2_disk.h) */
192 	hammer2_chain_core_t	*core;		/* topological recursion; may be shared amongst chains */
193 	hammer2_chain_core_t	*above;		/* presumably the core this chain is indexed under -- confirm */
194 	struct hammer2_state	*state;		/* if active cache msg */
195 	struct hammer2_mount	*hmp;		/* device mount */
196 	struct hammer2_pfsmount	*pmp;		/* can be NULL */
197 	struct hammer2_chain	*debug_previous;
198 
199 	hammer2_tid_t	modify_tid;		/* snapshot/flush filter */
200 	hammer2_tid_t	delete_tid;
201 	hammer2_key_t   data_count;		/* deltas to apply */
202 	hammer2_key_t   inode_count;		/* deltas to apply */
203 	hammer2_io_t	*dio;			/* physical data buffer */
204 	u_int		bytes;			/* physical data size */
205 	u_int		flags;			/* HAMMER2_CHAIN_* */
206 	u_int		refs;			/* per-chain reference count */
207 	u_int		lockcnt;
208 	int		debug_reason;
209 	int		src_reason;
210 	int		dst_reason;
211 	hammer2_media_data_t *data;		/* data pointer shortcut */
212 	TAILQ_ENTRY(hammer2_chain) flush_node;	/* flush deferral list */
213 };
214 
215 typedef struct hammer2_chain hammer2_chain_t;
216 
/*
 * Ordering function and generated prototypes for hammer2_chain_tree, which
 * links chains through their rbnode member.
 */
217 int hammer2_chain_cmp(hammer2_chain_t *chain1, hammer2_chain_t *chain2);
218 RB_PROTOTYPE(hammer2_chain_tree, hammer2_chain, rbnode, hammer2_chain_cmp);
219 
220 /*
221  * Special notes on flags:
222  *
223  * INITIAL - This flag allows a chain to be created and for storage to
224  *	     be allocated without having to immediately instantiate the
225  *	     related buffer.  The data is assumed to be all-zeros.  It
226  *	     is primarily used for indirect blocks.
227  *
228  * MOVED   - A modified chain becomes MOVED after it flushes.  A chain
229  *	     can also become MOVED if it is moved within the topology
230  *	     (even if not modified).
231  *
232  * MODIFIED- The chain's media data has been modified.
233  */
/*
 * Bit values for chain->flags.  Each value is a distinct single bit; the
 * UNUSED* names hold positions that currently carry no description.
 */
234 #define HAMMER2_CHAIN_MODIFIED		0x00000001	/* dirty chain data */
235 #define HAMMER2_CHAIN_ALLOCATED		0x00000002	/* kmalloc'd chain */
236 #define HAMMER2_CHAIN_UNUSED0004	0x00000004
237 #define HAMMER2_CHAIN_FORCECOW		0x00000008	/* force copy-on-wr */
238 #define HAMMER2_CHAIN_DELETED		0x00000010	/* deleted chain */
239 #define HAMMER2_CHAIN_INITIAL		0x00000020	/* initial create */
240 #define HAMMER2_CHAIN_FLUSHED		0x00000040	/* blktable updated */
241 #define HAMMER2_CHAIN_MOVED		0x00000080	/* bref changed */
242 #define HAMMER2_CHAIN_IOFLUSH		0x00000100	/* bawrite on put */
243 #define HAMMER2_CHAIN_DEFERRED		0x00000200	/* on a deferral list */
244 #define HAMMER2_CHAIN_DESTROYED		0x00000400	/* destroying inode */
245 #define HAMMER2_CHAIN_VOLUMESYNC	0x00000800	/* needs volume sync */
246 #define HAMMER2_CHAIN_UNUSED01000	0x00001000
247 #define HAMMER2_CHAIN_MOUNTED		0x00002000	/* PFS is mounted */
248 #define HAMMER2_CHAIN_ONRBTREE		0x00004000	/* on parent RB tree */
249 #define HAMMER2_CHAIN_SNAPSHOT		0x00008000	/* snapshot special */
250 #define HAMMER2_CHAIN_EMBEDDED		0x00010000	/* embedded data */
251 #define HAMMER2_CHAIN_RELEASE		0x00020000	/* don't keep around */
252 #define HAMMER2_CHAIN_UNUSED40000	0x00040000
253 #define HAMMER2_CHAIN_UNUSED80000	0x00080000
254 #define HAMMER2_CHAIN_DUPLICATED	0x00100000	/* fwd delete-dup */
255 #define HAMMER2_CHAIN_PFSROOT		0x00200000	/* in pfs->cluster */
256 
257 /*
258  * Flags passed to hammer2_chain_lookup() and hammer2_chain_next()
259  *
260  * NOTE: MATCHIND allows an indirect block / freemap node to be returned
261  *	 when the passed key range matches the radix.  Remember that key_end
262  *	 is inclusive (e.g. {0x000,0xFFF}, not {0x000,0x1000}).
263  */
/* Single-bit values; bits 0x004 through 0x080 are not assigned here. */
264 #define HAMMER2_LOOKUP_NOLOCK		0x00000001	/* ref only */
265 #define HAMMER2_LOOKUP_NODATA		0x00000002	/* data left NULL */
266 #define HAMMER2_LOOKUP_SHARED		0x00000100	/* presumably requests a shared lock -- confirm */
267 #define HAMMER2_LOOKUP_MATCHIND		0x00000200	/* return all chains */
268 #define HAMMER2_LOOKUP_FREEMAP		0x00000400	/* freemap base */
269 #define HAMMER2_LOOKUP_ALWAYS		0x00000800	/* resolve data */
270 
271 /*
272  * Flags passed to hammer2_chain_modify() and hammer2_chain_resize()
273  *
274  * NOTE: OPTDATA allows us to avoid instantiating buffers for INDIRECT
275  *	 blocks in the INITIAL-create state.
276  */
/* Single-bit values; bit 0x01 is not assigned here. */
277 #define HAMMER2_MODIFY_OPTDATA		0x00000002	/* data can be NULL */
278 #define HAMMER2_MODIFY_NO_MODIFY_TID	0x00000004	/* NOTE(review): by its name, leaves modify_tid alone -- confirm */
279 #define HAMMER2_MODIFY_ASSERTNOCOPY	0x00000008	/* assert no del-dup */
280 #define HAMMER2_MODIFY_NOREALLOC	0x00000010	/* NOTE(review): by its name, suppresses reallocation -- confirm */
281 #define HAMMER2_MODIFY_INPLACE		0x00000020	/* don't del-dup */
282 
283 /*
284  * Flags passed to hammer2_chain_lock()
285  */
/*
 * The low four bits (HAMMER2_RESOLVE_MASK) hold one of the NEVER, MAYBE or
 * ALWAYS values, which are enumerated rather than individual bits.  The
 * SHARED and NOREF values lie outside that mask and can be OR'd in.
 */
286 #define HAMMER2_RESOLVE_NEVER		1
287 #define HAMMER2_RESOLVE_MAYBE		2
288 #define HAMMER2_RESOLVE_ALWAYS		3
289 #define HAMMER2_RESOLVE_MASK		0x0F
290 
291 #define HAMMER2_RESOLVE_SHARED		0x10	/* request shared lock */
292 #define HAMMER2_RESOLVE_NOREF		0x20	/* already ref'd on lock */
293 
294 /*
295  * Flags passed to hammer2_chain_delete()
296  */
/*
 * HAMMER2_DELETE_WILLDUP and HAMMER2_DELDUP_RECORE share the value 0x0001;
 * they belong to different functions' flag arguments (see the comment above
 * each one).
 */
297 #define HAMMER2_DELETE_WILLDUP		0x0001	/* no blk free, will be dup */
298 
299 /*
300  * Flags passed to hammer2_chain_delete_duplicate()
301  */
302 #define HAMMER2_DELDUP_RECORE		0x0001	/* NOTE(review): semantics not visible in this header -- confirm */
303 
304 /*
305  * Cluster different types of storage together for allocations
306  */
/*
 * Storage classes numbered consecutively from 0; HAMMER2_FREECACHE_TYPES is
 * the number of classes (one past the last index).
 */
307 #define HAMMER2_FREECACHE_INODE		0
308 #define HAMMER2_FREECACHE_INDIR		1
309 #define HAMMER2_FREECACHE_DATA		2
310 #define HAMMER2_FREECACHE_UNUSED3	3
311 #define HAMMER2_FREECACHE_TYPES		4
312 
313 /*
314  * hammer2_freemap_alloc() block preference
315  */
/* -1 converted to hammer2_off_t; by its name, "no preferred offset" -- confirm. */
316 #define HAMMER2_OFF_NOPREF		((hammer2_off_t)-1)
317 
318 /*
319  * BMAP read-ahead maximum parameters
320  */
/* HAMMER2_BMAP_BYTES is HAMMER2_BMAP_COUNT buffers of HAMMER2_PBUFSIZE bytes each. */
321 #define HAMMER2_BMAP_COUNT		16	/* max bmap read-ahead */
322 #define HAMMER2_BMAP_BYTES		(HAMMER2_PBUFSIZE * HAMMER2_BMAP_COUNT)
323 
324 /*
325  * Misc
326  */
327 #define HAMMER2_FLUSH_DEPTH_LIMIT	10	/* stack recursion limit */
328 
329 /*
330  * hammer2_freemap_adjust()
331  */
/*
 * Enumerated (not bit-flag) selectors.  NOTE(review): the meaning of each
 * selector is defined by hammer2_freemap_adjust(), which is not visible in
 * this header -- confirm there.
 */
332 #define HAMMER2_FREEMAP_DORECOVER	1
333 #define HAMMER2_FREEMAP_DOMAYFREE	2
334 #define HAMMER2_FREEMAP_DOREALFREE	3
335 
336 /*
337  * HAMMER2 IN-MEMORY CACHE OF MEDIA STRUCTURES
338  *
339  * There is an in-memory representation of all on-media data structures.
340  * Basically everything is represented by a hammer2_chain structure
341  * in-memory and other higher-level structures map to chains.
342  *
343  * A great deal of data is accessed simply via its buffer cache buffer,
344  * which is mapped for the duration of the chain's lock.  However, because
345  * chains may represent blocks smaller than the 16KB minimum we impose
346  * on buffer cache buffers, we cannot hold related buffer cache buffers
347  * locked for smaller blocks.  In these situations we kmalloc() a copy
348  * of the block.
349  *
350  * When modifications are made to a chain a new filesystem block must be
351  * allocated.  Multiple modifications do not necessarily allocate new
352  * blocks.  However, when a flush occurs a flush synchronization point
353  * is created and any new modifications made after this point will allocate
354  * a new block even if the chain is already in a modified state.
355  *
356  * The in-memory representation may remain cached (for example in order to
357  * placemark clustering locks) even after the related data has been
358  * detached.
359  *
360  *				CORE SHARING
361  *
362  * In order to support concurrent flushes a flush synchronization point
363  * is created represented by a transaction id.  Among other things,
364  * operations may move filesystem objects from one part of the topology
365  * to another (for example, if you rename a file or when indirect blocks
366  * are created or destroyed, and a few other things).  When this occurs
367  * across a flush synchronization point the flusher needs to be able to
368  * recurse down BOTH the 'before' version of the topology and the 'after'
369  * version.
370  *
371  * To facilitate this modifications to chains do what is called a
372  * DELETE-DUPLICATE operation.  Chains are not actually moved in-memory.
373  * Instead the chain we wish to move is deleted and a new chain is created
374  * at the target location in the topology.  ANY SUBCHAINS PLACED UNDER THE
375  * CHAIN BEING MOVED HAVE TO EXIST IN BOTH PLACES.  To make this work
376  * all sub-chains are managed by the hammer2_chain_core structure.  This
377  * structure can be multi-homed, meaning that it can have more than one
378  * chain as its parent.  When a chain is delete-duplicated the chain's core
379  * becomes shared under both the old and new chain.
380  *
381  *				STALE CHAINS
382  *
383  * When a chain is delete-duplicated the old chain typically becomes stale.
384  * This is detected via the HAMMER2_CHAIN_DUPLICATED flag in chain->flags.
385  * To avoid executing live filesystem operations on stale chains, the inode
386  * locking code will follow stale chains via core->ownerq until it finds
387  * the live chain.  The lock prevents ripups by other threads.  Lookups
388  * must properly order locking operations to prevent other threads from
389  * racing the lookup operation and will also follow stale chains when
390  * required.
391  */
392 
393 RB_HEAD(hammer2_inode_tree, hammer2_inode);
394 
395 /*
396  * A hammer2 inode.
397  *
398  * NOTE: The inode's attribute CST which is also used to lock the inode
399  *	 is embedded in the chain (chain.cst) and aliased w/ attr_cst.
400  */
/*
 * NOTE(review): the note above mentions chain.cst and attr_cst, but this
 * header declares neither: struct hammer2_chain has no cst member (the only
 * cst declared here is hammer2_chain_core.cst) and struct hammer2_inode has
 * only topo_cst.  Confirm whether that note is still accurate.
 */
401 struct hammer2_inode {
402 	RB_ENTRY(hammer2_inode) rbnode;		/* inumber lookup (HL) */
403 	ccms_cst_t		topo_cst;	/* directory topology cst */
404 	struct hammer2_pfsmount	*pmp;		/* PFS mount */
405 	struct hammer2_inode	*pip;		/* parent inode */
406 	struct vnode		*vp;		/* associated vnode (what ITOV() returns) */
407 	hammer2_chain_t		*chain;		/* NOTE: rehomed on rename */
408 	struct lockf		advlock;
409 	hammer2_tid_t		inum;		/* inode number */
410 	u_int			flags;		/* presumably HAMMER2_INODE_* bits -- confirm */
411 	u_int			refs;		/* +vpref, +flushref */
412 	uint8_t			comp_heuristic;
413 	hammer2_off_t		size;
414 	uint64_t		mtime;
415 };
416 
417 typedef struct hammer2_inode hammer2_inode_t;
418 
/* Single-bit HAMMER2_INODE_* values. */
419 #define HAMMER2_INODE_MODIFIED		0x0001
420 #define HAMMER2_INODE_SROOT		0x0002	/* kmalloc special case */
421 #define HAMMER2_INODE_RENAME_INPROG	0x0004
422 #define HAMMER2_INODE_ONRBTREE		0x0008
423 #define HAMMER2_INODE_RESIZED		0x0010
424 #define HAMMER2_INODE_MTIME		0x0020
425 
/*
 * Ordering function and generated prototypes for hammer2_inode_tree, which
 * links inodes through rbnode.  The extra RB_PROTOTYPE2 argument names
 * hammer2_tid_t as the lookup key type.
 */
426 int hammer2_inode_cmp(hammer2_inode_t *ip1, hammer2_inode_t *ip2);
427 RB_PROTOTYPE2(hammer2_inode_tree, hammer2_inode, rbnode, hammer2_inode_cmp,
428 		hammer2_tid_t);
429 
430 /*
431  * A hammer2 transaction and flush sequencing structure.
432  *
433  * This global structure is tied into hammer2_mount and is used
434  * to sequence modifying operations and flushes.
435  *
436  * (a) Any modifying operations with sync_tid >= flush_tid will stall until
437  *     all modifying operations with sync_tid < flush_tid complete.
438  *
439  *     The flush related to flush_tid stalls until all modifying operations
440  *     with sync_tid < flush_tid complete.
441  *
442  * (b) Once unstalled, modifying operations with sync_tid > flush_tid are
443  *     allowed to run.  All modifications cause modify/duplicate operations
444  *     to occur on the related chains.  Note that most INDIRECT blocks will
445  *     be unaffected because the modifications just overload the RBTREE
446  *     structurally instead of actually modifying the indirect blocks.
447  *
448  * (c) The actual flush unstalls and RUNS CONCURRENTLY with (b), but only
449  *     utilizes the chain structures with sync_tid <= flush_tid.  The
450  *     flush will modify related indirect blocks and inodes in-place
451  *     (rather than duplicate) since the adjustments are compatible with
452  *     (b)'s RBTREE overloading.
453  *
454  *     SPECIAL NOTE:  Inode modifications have to also propagate along any
455  *		      modify/duplicate chains.  File writes detect the flush
456  *		      and force out the conflicting buffer cache buffer(s)
457  *		      before reusing them.
458  *
459  * (d) Snapshots can be made instantly but must be flushed and disconnected
460  *     from their duplicative source before they can be mounted.  This is
461  *     because while H2's on-media structure supports forks, its in-memory
462  *     structure only supports very simple forking for background flushing
463  *     purposes.
464  *
465  * TODO: Flush merging.  When fsync() is called on multiple discrete files
466  *	 concurrently there is no reason to stall the second fsync.
467  *	 The final flush that reaches to root can cover both fsync()s.
468  *
469  *     The chains typically terminate as they fly onto the disk.  The flush
470  *     ultimately reaches the volume header.
471  */
472 struct hammer2_trans {
473 	TAILQ_ENTRY(hammer2_trans) entry;	/* hammer2_trans_queue linkage */
474 	struct hammer2_pfsmount *pmp;		/* might be NULL */
475 	struct hammer2_mount	*hmp_single;	/* if single-targeted */
476 	hammer2_tid_t		sync_tid;	/* compared against flush_tid, see above */
477 	hammer2_tid_t		real_tid;
478 	hammer2_tid_t		inode_tid;
479 	thread_t		td;		/* pointer */
480 	int			flags;		/* presumably HAMMER2_TRANS_* bits -- confirm */
481 	int			blocked;
482 	uint8_t			inodes_created;
483 	uint8_t			dummy[7];	/* presumably padding -- confirm */
484 };
485 
486 typedef struct hammer2_trans hammer2_trans_t;
487 
/* Single-bit HAMMER2_TRANS_* values. */
488 #define HAMMER2_TRANS_ISFLUSH		0x0001	/* formal flush */
489 #define HAMMER2_TRANS_UNUSED0002	0x0002
490 #define HAMMER2_TRANS_BUFCACHE		0x0004	/* from bioq strategy write */
491 #define HAMMER2_TRANS_NEWINODE		0x0008	/* caller allocating inode */
492 #define HAMMER2_TRANS_ISALLOCATING	0x0010	/* in allocator */
493 
/*
 * HAMMER2_FREEMAP_HEUR is the product NRADIX * TYPES (4 * 8 = 32) and is the
 * element count of hammer2_mount.heur_freemap[].
 */
494 #define HAMMER2_FREEMAP_HEUR_NRADIX	4	/* pwr 2 PBUFRADIX-MINIORADIX */
495 #define HAMMER2_FREEMAP_HEUR_TYPES	8
496 #define HAMMER2_FREEMAP_HEUR		(HAMMER2_FREEMAP_HEUR_NRADIX * \
497 					 HAMMER2_FREEMAP_HEUR_TYPES)
498 
499 /*
500  * Global (per device) mount structure for device (aka vp->v_mount->hmp)
501  */
/* Tail queue of hammer2_trans; type of hammer2_mount.transq. */
502 TAILQ_HEAD(hammer2_trans_queue, hammer2_trans);
503 
504 struct hammer2_mount {
505 	struct vnode	*devvp;		/* device vnode */
506 	int		ronly;		/* read-only mount */
507 	int		pmp_count;	/* PFS mounts backed by us */
508 	TAILQ_ENTRY(hammer2_mount) mntentry; /* hammer2_mntlist */
509 
510 	struct malloc_type *mchain;	/* presumably the malloc type for chains -- confirm */
511 	int		nipstacks;
512 	int		maxipstacks;
513 	struct spinlock	io_spin;	/* iotree access */
514 	struct hammer2_io_tree iotree;	/* tree of hammer2_io */
515 	int		iofree_count;
516 	hammer2_chain_t vchain;		/* anchor chain (topology) */
517 	hammer2_chain_t fchain;		/* anchor chain (freemap) */
518 	hammer2_inode_t	*sroot;		/* super-root localized to media */
519 	struct lock	alloclk;	/* lockmgr lock */
520 	struct lock	voldatalk;	/* lockmgr lock */
521 	struct hammer2_trans_queue transq; /* all in-progress transactions */
522 	hammer2_off_t	heur_freemap[HAMMER2_FREEMAP_HEUR];	/* one slot per NRADIX * TYPES combination */
523 	int		flushcnt;	/* #of flush trans on the list */
524 
525 	int		volhdrno;	/* last volhdrno written */
526 	hammer2_volume_data_t voldata;
527 	hammer2_volume_data_t volsync;	/* synchronized voldata */
528 };
529 
530 typedef struct hammer2_mount hammer2_mount_t;
531 
532 /*
533  * HAMMER2 cluster - a device/root associated with a PFS.
534  *
535  * A PFS may have several hammer2_cluster's associated with it.
536  */
/* Capacity of hammer2_cluster.chains[]. */
537 #define HAMMER2_MAXCLUSTER	8
538 
539 struct hammer2_cluster {
540 	int			nchains;	/* presumably the number of chains[] entries in use -- confirm */
541 	int			status;
542 	hammer2_chain_t		*chains[HAMMER2_MAXCLUSTER];	/* up to HAMMER2_MAXCLUSTER chain pointers */
543 };
544 
545 typedef struct hammer2_cluster hammer2_cluster_t;
546 
547 /*
548  * HAMMER2 PFS mount point structure (aka vp->v_mount->mnt_data).
549  * This has a 1:1 correspondence to struct mount (note that the
550  * hammer2_mount structure has a N:1 correspondence).
551  *
552  * This structure represents a cluster mount and not necessarily a
553  * PFS under a specific device mount (HMP).  The distinction is important
554  * because the elements backing a cluster mount can change on the fly.
555  *
556  * Usually the first element under the cluster represents the original
557  * user-requested mount that bootstraps the whole mess.  In significant
558  * setups the original is usually just a read-only media image (or
559  * representative file) that simply contains a bootstrap volume header
560  * listing the configuration.
561  */
562 struct hammer2_pfsmount {
563 	struct mount		*mp;		/* the struct mount this PFS mount corresponds to */
564 	hammer2_cluster_t	cluster;	/* embedded cluster */
565 	hammer2_inode_t		*iroot;		/* PFS root inode */
566 	hammer2_off_t		inode_count;	/* copy of inode_count */
567 	ccms_domain_t		ccms_dom;
568 	struct netexport	export;		/* nfs export */
569 	int			ronly;		/* read-only mount */
570 	struct malloc_type	*minode;	/* presumably the malloc type for inodes -- confirm */
571 	struct malloc_type	*mmsg;		/* presumably the malloc type for messages -- confirm */
572 	kdmsg_iocom_t		iocom;
573 	struct spinlock		inum_spin;	/* inumber lookup */
574 	struct hammer2_inode_tree inum_tree;	/* tree of hammer2_inode */
575 	long			inmem_inodes;
576 	long			inmem_chains;
577 	int			inmem_waiting;
578 	int			count_lwinprog;	/* logical write in prog */
579 	thread_t		wthread_td;	/* write thread td */
580 	struct bio_queue_head	wthread_bioq;	/* logical buffer bioq */
581 	struct mtx		wthread_mtx;	/* interlock */
582 	int			wthread_destroy;/* termination sequencing */
583 };
584 
585 typedef struct hammer2_pfsmount hammer2_pfsmount_t;
586 
/*
 * WAITING is the top bit of a 32-bit word and MASK is the remaining low 31
 * bits.  NOTE(review): presumably applied to count_lwinprog (flag bit plus
 * counter) -- confirm against the users.
 */
587 #define HAMMER2_LWINPROG_WAITING	0x80000000
588 #define HAMMER2_LWINPROG_MASK		0x7FFFFFFF
589 
590 #if defined(_KERNEL)
591 
592 MALLOC_DECLARE(M_HAMMER2);
593 
/*
 * VTOI: cast a vnode's v_data pointer to hammer2_inode_t *.
 * ITOV: yield the vp member of a hammer2_inode_t.
 */
594 #define VTOI(vp)	((hammer2_inode_t *)(vp)->v_data)
595 #define ITOV(ip)	((ip)->vp)
596 
597 /*
598  * Currently locked chains retain the locked buffer cache buffer for
599  * indirect blocks, and indirect blocks can be one of two sizes.  The
600  * device buffer has to match the case to avoid deadlocking recursive
601  * chains that might otherwise try to access different offsets within
602  * the same device buffer.
603  */
604 static __inline
605 int
606 hammer2_devblkradix(int radix)
607 {
608 	if (radix <= HAMMER2_LBUFRADIX) {
609 		return (HAMMER2_LBUFRADIX);
610 	} else {
611 		return (HAMMER2_PBUFRADIX);
612 	}
613 }
614 
615 static __inline
616 size_t
617 hammer2_devblksize(size_t bytes)
618 {
619 	if (bytes <= HAMMER2_LBUFSIZE) {
620 		return(HAMMER2_LBUFSIZE);
621 	} else {
622 		KKASSERT(bytes <= HAMMER2_PBUFSIZE &&
623 			 (bytes ^ (bytes - 1)) == ((bytes << 1) - 1));
624 		return (HAMMER2_PBUFSIZE);
625 	}
626 }
627 
628 
629 static __inline
630 hammer2_pfsmount_t *
631 MPTOPMP(struct mount *mp)
632 {
633 	return ((hammer2_pfsmount_t *)mp->mnt_data);
634 }
635 
636 extern struct vop_ops hammer2_vnode_vops;
637 extern struct vop_ops hammer2_spec_vops;
638 extern struct vop_ops hammer2_fifo_vops;
639 
640 extern int hammer2_debug;
641 extern int hammer2_cluster_enable;
642 extern int hammer2_hardlink_enable;
643 extern int hammer2_flush_pipe;
644 extern long hammer2_iod_file_read;
645 extern long hammer2_iod_meta_read;
646 extern long hammer2_iod_indr_read;
647 extern long hammer2_iod_fmap_read;
648 extern long hammer2_iod_volu_read;
649 extern long hammer2_iod_file_write;
650 extern long hammer2_iod_meta_write;
651 extern long hammer2_iod_indr_write;
652 extern long hammer2_iod_fmap_write;
653 extern long hammer2_iod_volu_write;
654 extern long hammer2_ioa_file_read;
655 extern long hammer2_ioa_meta_read;
656 extern long hammer2_ioa_indr_read;
657 extern long hammer2_ioa_fmap_read;
658 extern long hammer2_ioa_volu_read;
659 extern long hammer2_ioa_file_write;
660 extern long hammer2_ioa_meta_write;
661 extern long hammer2_ioa_indr_write;
662 extern long hammer2_ioa_fmap_write;
663 extern long hammer2_ioa_volu_write;
664 
665 extern struct objcache *cache_buffer_read;
666 extern struct objcache *cache_buffer_write;
667 
668 extern int destroy;
669 extern int write_thread_wakeup;
670 
671 extern mtx_t thread_protect;
672 
673 /*
674  * hammer2_subr.c
675  */
/*
 * CRC wrappers: hammer2_icrc32() expands to iscsi_crc32(buf, size) and
 * hammer2_icrc32c() expands to iscsi_crc32_ext(buf, size, crc).
 */
676 #define hammer2_icrc32(buf, size)	iscsi_crc32((buf), (size))
677 #define hammer2_icrc32c(buf, size, crc)	iscsi_crc32_ext((buf), (size), (crc))
678 
679 hammer2_chain_t *hammer2_inode_lock_ex(hammer2_inode_t *ip);
680 hammer2_chain_t *hammer2_inode_lock_sh(hammer2_inode_t *ip);
681 void hammer2_inode_unlock_ex(hammer2_inode_t *ip, hammer2_chain_t *chain);
682 void hammer2_inode_unlock_sh(hammer2_inode_t *ip, hammer2_chain_t *chain);
683 void hammer2_chain_refactor(hammer2_chain_t **chainp);
684 void hammer2_voldata_lock(hammer2_mount_t *hmp);
685 void hammer2_voldata_unlock(hammer2_mount_t *hmp, int modify);
686 ccms_state_t hammer2_inode_lock_temp_release(hammer2_inode_t *ip);
687 void hammer2_inode_lock_temp_restore(hammer2_inode_t *ip, ccms_state_t ostate);
688 ccms_state_t hammer2_inode_lock_upgrade(hammer2_inode_t *ip);
689 void hammer2_inode_lock_downgrade(hammer2_inode_t *ip, ccms_state_t ostate);
690 
691 void hammer2_mount_exlock(hammer2_mount_t *hmp);
692 void hammer2_mount_shlock(hammer2_mount_t *hmp);
693 void hammer2_mount_unlock(hammer2_mount_t *hmp);
694 
695 int hammer2_get_dtype(hammer2_chain_t *chain);
696 int hammer2_get_vtype(hammer2_chain_t *chain);
697 u_int8_t hammer2_get_obj_type(enum vtype vtype);
698 void hammer2_time_to_timespec(u_int64_t xtime, struct timespec *ts);
699 u_int64_t hammer2_timespec_to_time(struct timespec *ts);
700 u_int32_t hammer2_to_unix_xid(uuid_t *uuid);
701 void hammer2_guid_to_uuid(uuid_t *uuid, u_int32_t guid);
702 
703 hammer2_key_t hammer2_dirhash(const unsigned char *name, size_t len);
704 int hammer2_getradix(size_t bytes);
705 
706 int hammer2_calc_logical(hammer2_inode_t *ip, hammer2_off_t uoff,
707 			hammer2_key_t *lbasep, hammer2_key_t *leofp);
708 int hammer2_calc_physical(hammer2_inode_t *ip, hammer2_key_t lbase);
709 void hammer2_update_time(uint64_t *timep);
710 
711 /*
712  * hammer2_inode.c
713  */
714 struct vnode *hammer2_igetv(hammer2_inode_t *ip, int *errorp);
715 
716 void hammer2_inode_lock_nlinks(hammer2_inode_t *ip);
717 void hammer2_inode_unlock_nlinks(hammer2_inode_t *ip);
718 hammer2_inode_t *hammer2_inode_lookup(hammer2_pfsmount_t *pmp,
719 			hammer2_tid_t inum);
720 hammer2_inode_t *hammer2_inode_get(hammer2_pfsmount_t *pmp,
721 			hammer2_inode_t *dip, hammer2_chain_t *chain);
722 void hammer2_inode_free(hammer2_inode_t *ip);
723 void hammer2_inode_ref(hammer2_inode_t *ip);
724 void hammer2_inode_drop(hammer2_inode_t *ip);
725 void hammer2_inode_repoint(hammer2_inode_t *ip, hammer2_inode_t *pip,
726 			hammer2_chain_t *chain);
727 
728 hammer2_inode_t *hammer2_inode_create(hammer2_trans_t *trans,
729 			hammer2_inode_t *dip,
730 			struct vattr *vap, struct ucred *cred,
731 			const uint8_t *name, size_t name_len,
732 			hammer2_chain_t **chainp, int *errorp);
733 int hammer2_inode_connect(hammer2_trans_t *trans, int hlink,
734 			hammer2_inode_t *dip, hammer2_chain_t **chainp,
735 			const uint8_t *name, size_t name_len);
736 hammer2_inode_t *hammer2_inode_common_parent(hammer2_inode_t *fdip,
737 			hammer2_inode_t *tdip);
738 void hammer2_inode_fsync(hammer2_trans_t *trans, hammer2_inode_t *ip,
739 			hammer2_chain_t **parentp);
740 int hammer2_unlink_file(hammer2_trans_t *trans, hammer2_inode_t *dip,
741 			const uint8_t *name, size_t name_len, int isdir,
742 			int *hlinkp);
743 int hammer2_hardlink_consolidate(hammer2_trans_t *trans, hammer2_inode_t *ip,
744 			hammer2_chain_t **chainp,
745 			hammer2_inode_t *tdip, int linkcnt);
746 int hammer2_hardlink_deconsolidate(hammer2_trans_t *trans, hammer2_inode_t *dip,
747 			hammer2_chain_t **chainp, hammer2_chain_t **ochainp);
748 int hammer2_hardlink_find(hammer2_inode_t *dip,
749 			hammer2_chain_t **chainp, hammer2_chain_t **ochainp);
750 
751 /*
752  * hammer2_chain.c
753  */
754 void hammer2_modify_volume(hammer2_mount_t *hmp);
755 hammer2_chain_t *hammer2_chain_alloc(hammer2_mount_t *hmp,
756 				hammer2_pfsmount_t *pmp,
757 				hammer2_trans_t *trans,
758 				hammer2_blockref_t *bref);
759 void hammer2_chain_core_alloc(hammer2_trans_t *trans, hammer2_chain_t *nchain,
760 				hammer2_chain_t *ochain);
761 void hammer2_chain_ref(hammer2_chain_t *chain);
762 void hammer2_chain_drop(hammer2_chain_t *chain);
763 int hammer2_chain_lock(hammer2_chain_t *chain, int how);
764 void hammer2_chain_load_async(hammer2_chain_t *chain,
765 				void (*func)(hammer2_io_t *dio,
766 					     hammer2_chain_t *chain,
767 					     void *arg_p, off_t arg_o),
768 				void *arg_p, off_t arg_o);
769 void hammer2_chain_moved(hammer2_chain_t *chain);
770 void hammer2_chain_modify(hammer2_trans_t *trans,
771 				hammer2_chain_t **chainp, int flags);
772 hammer2_inode_data_t *hammer2_chain_modify_ip(hammer2_trans_t *trans,
773 				hammer2_inode_t *ip, hammer2_chain_t **chainp,
774 				int flags);
775 void hammer2_chain_resize(hammer2_trans_t *trans, hammer2_inode_t *ip,
776 				hammer2_chain_t *parent,
777 				hammer2_chain_t **chainp,
778 				int nradix, int flags);
779 void hammer2_chain_unlock(hammer2_chain_t *chain);
780 void hammer2_chain_wait(hammer2_chain_t *chain);
/*
 * Chain get / lookup / iteration prototypes.  Every function here
 * except hammer2_chain_lookup_done() returns a hammer2_chain_t *.
 *
 * hammer2_chain_lookup() and hammer2_chain_next() share the same
 * trailing parameters: a key_nextp output pointer, a
 * [key_beg, key_end] pair, an int *cache_indexp and int flags; both
 * take the parent as hammer2_chain_t **.  hammer2_chain_next()
 * additionally takes a chain argument.  hammer2_chain_scan() takes the
 * parent by plain pointer and has no key arguments.
 *
 * NOTE(review): whether key_end is inclusive, and what a NULL return
 * means, cannot be determined from this header -- confirm in
 * hammer2_chain.c.
 */
781 hammer2_chain_t *hammer2_chain_get(hammer2_chain_t *parent,
782 				hammer2_blockref_t *bref, int generation);
783 hammer2_chain_t *hammer2_chain_lookup_init(hammer2_chain_t *parent, int flags);
784 void hammer2_chain_lookup_done(hammer2_chain_t *parent);
785 hammer2_chain_t *hammer2_chain_lookup(hammer2_chain_t **parentp,
786 				hammer2_key_t *key_nextp,
787 				hammer2_key_t key_beg, hammer2_key_t key_end,
788 				int *cache_indexp, int flags);
789 hammer2_chain_t *hammer2_chain_next(hammer2_chain_t **parentp,
790 				hammer2_chain_t *chain,
791 				hammer2_key_t *key_nextp,
792 				hammer2_key_t key_beg, hammer2_key_t key_end,
793 				int *cache_indexp, int flags);
794 hammer2_chain_t *hammer2_chain_scan(hammer2_chain_t *parent,
795 				hammer2_chain_t *chain,
796 				int *cache_indexp, int flags);
797 
/*
 * Chain create / duplicate / snapshot / delete / flush prototypes.
 * All take a transaction as their first argument.
 *
 * Only hammer2_chain_create() and hammer2_chain_snapshot() return a
 * value (int); the rest return void.  hammer2_chain_create() takes a
 * key, keybits, a type and a byte count; hammer2_chain_snapshot()
 * takes a hammer2_ioc_pfs_t * in addition to the chain.
 *
 * hammer2_chain_delete(), hammer2_chain_commit() and
 * hammer2_chain_setsubmod() take the chain by plain pointer; the other
 * functions take it as hammer2_chain_t **.
 *
 * NOTE(review): the int returns are presumably error codes, and the
 * accepted values of 'snapshot' / 'duplicate_reason' / 'flags' are
 * defined elsewhere -- confirm against hammer2_chain.c.
 */
798 int hammer2_chain_create(hammer2_trans_t *trans,
799 				hammer2_chain_t **parentp,
800 				hammer2_chain_t **chainp,
801 				hammer2_key_t key, int keybits,
802 				int type, size_t bytes);
803 void hammer2_chain_duplicate(hammer2_trans_t *trans, hammer2_chain_t **parentp,
804 				hammer2_chain_t **chainp,
805 				hammer2_blockref_t *bref, int snapshot,
806 				int duplicate_reason);
807 int hammer2_chain_snapshot(hammer2_trans_t *trans, hammer2_chain_t **chainp,
808 				hammer2_ioc_pfs_t *pfs);
809 void hammer2_chain_delete(hammer2_trans_t *trans, hammer2_chain_t *chain,
810 				int flags);
811 void hammer2_chain_delete_duplicate(hammer2_trans_t *trans,
812 				hammer2_chain_t **chainp, int flags);
813 void hammer2_chain_flush(hammer2_trans_t *trans, hammer2_chain_t **chainp);
814 void hammer2_chain_commit(hammer2_trans_t *trans, hammer2_chain_t *chain);
815 void hammer2_chain_setsubmod(hammer2_trans_t *trans, hammer2_chain_t *chain);
816 
/*
 * Chain memory wait/wakeup and consistency-helper prototypes.  All
 * return void.
 *
 * The memory_wait/memory_wakeup pair operate on a PFS mount.
 * hammer2_chain_countbrefs() takes a chain plus a blockref pointer
 * 'base' and an int 'count'; hammer2_chain_layer_check_locked() takes
 * the mount and a chain core.
 *
 * NOTE(review): 'base'/'count' presumably describe an array of
 * blockrefs and its element count -- confirm in hammer2_chain.c.
 */
817 void hammer2_chain_memory_wait(hammer2_pfsmount_t *pmp);
818 void hammer2_chain_memory_wakeup(hammer2_pfsmount_t *pmp);
819 void hammer2_chain_countbrefs(hammer2_chain_t *chain,
820 				hammer2_blockref_t *base, int count);
821 void hammer2_chain_layer_check_locked(hammer2_mount_t *hmp,
822 				hammer2_chain_core_t *core);
823 
/*
 * Blockref base-array prototypes.  All three share the parameters
 * (chain, base, count, cache_indexp).
 *
 * hammer2_base_find() returns int and additionally takes a key_nextp
 * output pointer and a key_beg/key_end pair.  hammer2_base_delete()
 * and hammer2_base_insert() return void, have identical parameter
 * lists, and additionally take a transaction and a child chain.
 *
 * NOTE(review): the int returned by hammer2_base_find() is presumably
 * an index into 'base' -- confirm in hammer2_chain.c.
 */
824 int hammer2_base_find(hammer2_chain_t *chain,
825 				hammer2_blockref_t *base, int count,
826 				int *cache_indexp, hammer2_key_t *key_nextp,
827 				hammer2_key_t key_beg, hammer2_key_t key_end);
828 void hammer2_base_delete(hammer2_trans_t *trans, hammer2_chain_t *chain,
829 				hammer2_blockref_t *base, int count,
830 				int *cache_indexp, hammer2_chain_t *child);
831 void hammer2_base_insert(hammer2_trans_t *trans, hammer2_chain_t *chain,
832 				hammer2_blockref_t *base, int count,
833 				int *cache_indexp, hammer2_chain_t *child);
834 
835 /*
836  * hammer2_trans.c
837  */
/*
 * Transaction prototypes.  All return void and operate on a
 * caller-supplied hammer2_trans_t.  hammer2_trans_init() takes both a
 * PFS mount and a device mount plus an int flags word.
 *
 * NOTE(review): presumably every hammer2_trans_init() must be matched
 * by a hammer2_trans_done() -- confirm in hammer2_trans.c.
 */
838 void hammer2_trans_init(hammer2_trans_t *trans, hammer2_pfsmount_t *pmp,
839 				hammer2_mount_t *hmp, int flags);
840 void hammer2_trans_clear_invfsync(hammer2_trans_t *trans);
841 void hammer2_trans_done(hammer2_trans_t *trans);
842 
843 /*
844  * hammer2_ioctl.c
845  */
/*
 * ioctl entry point prototype.  Takes the inode, a u_long command
 * word, an untyped data pointer, an int fflag and a credential
 * pointer; returns int.
 *
 * 'struct ucred' is referenced by tag only, so a declaration of it
 * must already be in scope at this point.
 * NOTE(review): the int is presumably an errno-style code -- confirm
 * in hammer2_ioctl.c.
 */
846 int hammer2_ioctl(hammer2_inode_t *ip, u_long com, void *data,
847 				int fflag, struct ucred *cred);
848 
849 /*
850  * hammer2_io.c
851  */
/*
 * DIO get/put/cleanup/data-access prototypes.
 *
 * hammer2_io_getblk() returns a hammer2_io_t * for an (lbase, lsize)
 * pair and also takes an int *ownerp.  hammer2_io_putblk() takes the
 * DIO as hammer2_io_t **.  hammer2_io_data() returns a char * for a
 * DIO and an off_t lbase.
 *
 * NOTE(review): putblk taking a pointer-to-pointer suggests it clears
 * the caller's pointer; ownerp is presumably an output -- confirm in
 * hammer2_io.c.
 */
852 hammer2_io_t *hammer2_io_getblk(hammer2_mount_t *hmp, off_t lbase,
853 				int lsize, int *ownerp);
854 void hammer2_io_putblk(hammer2_io_t **diop);
855 void hammer2_io_cleanup(hammer2_mount_t *hmp, struct hammer2_io_tree *tree);
856 char *hammer2_io_data(hammer2_io_t *dio, off_t lbase);
/*
 * DIO buffer acquisition prototypes.
 *
 * hammer2_io_new(), hammer2_io_newnz(), hammer2_io_newq() and
 * hammer2_io_bread() have identical signatures: (hmp, lbase, lsize,
 * diop) returning int, with the DIO delivered through the
 * hammer2_io_t **diop argument.
 *
 * hammer2_io_breadcb() returns void and instead takes a callback with
 * parameters (hammer2_io_t *, hammer2_chain_t *arg_c, void *arg_p,
 * off_t arg_o), followed by arg_c/arg_p/arg_o arguments of the same
 * types.
 *
 * NOTE(review): the differences between the new/newnz/newq variants
 * and the meaning of the int return are not visible here; arg_c/arg_p/
 * arg_o are presumably passed through to the callback -- confirm in
 * hammer2_io.c.
 */
857 int hammer2_io_new(hammer2_mount_t *hmp, off_t lbase, int lsize,
858 				hammer2_io_t **diop);
859 int hammer2_io_newnz(hammer2_mount_t *hmp, off_t lbase, int lsize,
860 				hammer2_io_t **diop);
861 int hammer2_io_newq(hammer2_mount_t *hmp, off_t lbase, int lsize,
862 				hammer2_io_t **diop);
863 int hammer2_io_bread(hammer2_mount_t *hmp, off_t lbase, int lsize,
864 				hammer2_io_t **diop);
865 void hammer2_io_breadcb(hammer2_mount_t *hmp, off_t lbase, int lsize,
866 				void (*callback)(hammer2_io_t *dio,
867 						 hammer2_chain_t *arg_c,
868 						 void *arg_p, off_t arg_o),
869 				hammer2_chain_t *arg_c,
870 				void *arg_p, off_t arg_o);
/*
 * DIO write / release / state prototypes.
 *
 * The write and release functions (bawrite, bdwrite, bwrite, brelse,
 * bqrelse) take the DIO as hammer2_io_t **; of these only
 * hammer2_io_bwrite() returns a value (int).  The state functions
 * (setdirty, setinval, isdirty) take a plain hammer2_io_t *;
 * hammer2_io_setinval() also takes a u_int byte count and
 * hammer2_io_isdirty() returns int.
 *
 * NOTE(review): the pointer-to-pointer form suggests the caller's DIO
 * pointer is consumed/cleared by the write and release calls --
 * confirm in hammer2_io.c.
 */
871 void hammer2_io_bawrite(hammer2_io_t **diop);
872 void hammer2_io_bdwrite(hammer2_io_t **diop);
873 int hammer2_io_bwrite(hammer2_io_t **diop);
874 void hammer2_io_setdirty(hammer2_io_t *dio);
875 void hammer2_io_setinval(hammer2_io_t *dio, u_int bytes);
876 void hammer2_io_brelse(hammer2_io_t **diop);
877 void hammer2_io_bqrelse(hammer2_io_t **diop);
878 int hammer2_io_isdirty(hammer2_io_t *dio);
879 
880 /*
881  * hammer2_msgops.c
882  */
/*
 * Message handler prototypes.  Both take a single kdmsg_msg_t * and
 * return int.
 *
 * NOTE(review): the int is presumably an error code consumed by the
 * kdmsg layer -- confirm in hammer2_msgops.c.
 */
883 int hammer2_msg_dbg_rcvmsg(kdmsg_msg_t *msg);
884 int hammer2_msg_adhoc_input(kdmsg_msg_t *msg);
885 
886 /*
887  * hammer2_vfsops.c
888  */
/*
 * VFS-level prototypes.
 *
 * hammer2_vfs_sync() is the only function here returning a value
 * (int); it takes a struct mount * and an int waitflags.
 * hammer2_dump_chain() takes a chain, an int 'tab' and an int *countp.
 * The hammer2_lwinprog_ref/drop/wait trio each take only the PFS
 * mount.
 *
 * 'struct file' and 'struct mount' are referenced by tag only, so
 * declarations of them must already be in scope at this point.
 * NOTE(review): 'tab' is presumably an indentation level and
 * 'countp' a running counter for the dump -- confirm in
 * hammer2_vfsops.c.
 */
889 void hammer2_clusterctl_wakeup(kdmsg_iocom_t *iocom);
890 void hammer2_volconf_update(hammer2_pfsmount_t *pmp, int index);
891 void hammer2_cluster_reconnect(hammer2_pfsmount_t *pmp, struct file *fp);
892 void hammer2_dump_chain(hammer2_chain_t *chain, int tab, int *countp);
893 void hammer2_bioq_sync(hammer2_pfsmount_t *pmp);
894 int hammer2_vfs_sync(struct mount *mp, int waitflags);
895 void hammer2_lwinprog_ref(hammer2_pfsmount_t *pmp);
896 void hammer2_lwinprog_drop(hammer2_pfsmount_t *pmp);
897 void hammer2_lwinprog_wait(hammer2_pfsmount_t *pmp);
898 
899 /*
900  * hammer2_freemap.c
901  */
/*
 * Freemap prototypes.  Both take a transaction as their first
 * argument.
 *
 * hammer2_freemap_alloc() returns int and takes a chain plus a size_t
 * byte count.  hammer2_freemap_adjust() returns void and takes the
 * mount, a blockref and an int 'how'.
 *
 * NOTE(review): the int return is presumably an error code and the
 * accepted values of 'how' are defined elsewhere -- confirm in
 * hammer2_freemap.c.
 */
902 int hammer2_freemap_alloc(hammer2_trans_t *trans, hammer2_chain_t *chain,
903 				size_t bytes);
904 void hammer2_freemap_adjust(hammer2_trans_t *trans, hammer2_mount_t *hmp,
905 				hammer2_blockref_t *bref, int how);
906 
907 
908 #endif /* !_KERNEL */
909 #endif /* !_VFS_HAMMER2_HAMMER2_H_ */
910