xref: /dragonfly/sys/vfs/hammer2/hammer2_vfsops.c (revision cfd1aba3)
1 /*-
2  * Copyright (c) 2011-2014 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  * by Daniel Flores (GSOC 2013 - mentored by Matthew Dillon, compression)
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in
16  *    the documentation and/or other materials provided with the
17  *    distribution.
18  * 3. Neither the name of The DragonFly Project nor the names of its
19  *    contributors may be used to endorse or promote products derived
20  *    from this software without specific, prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
26  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 #include <sys/param.h>
36 #include <sys/systm.h>
37 #include <sys/kernel.h>
38 #include <sys/nlookup.h>
39 #include <sys/vnode.h>
40 #include <sys/mount.h>
41 #include <sys/fcntl.h>
42 #include <sys/buf.h>
43 #include <sys/uuid.h>
44 #include <sys/vfsops.h>
45 #include <sys/sysctl.h>
46 #include <sys/socket.h>
47 #include <sys/objcache.h>
48 
49 #include <sys/proc.h>
50 #include <sys/namei.h>
51 #include <sys/mountctl.h>
52 #include <sys/dirent.h>
53 #include <sys/uio.h>
54 
55 #include <sys/mutex.h>
56 #include <sys/mutex2.h>
57 
58 #include "hammer2.h"
59 #include "hammer2_disk.h"
60 #include "hammer2_mount.h"
61 
62 
63 #include "hammer2_lz4.h"
64 
65 #include "zlib/hammer2_zlib.h"
66 
67 #define REPORT_REFS_ERRORS 1	/* XXX remove me */
68 
69 MALLOC_DEFINE(M_OBJCACHE, "objcache", "Object Cache");
70 
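/*
 * Per-sync scratch state passed by hammer2_vfs_sync() to the per-vnode
 * scan callback (hammer2_sync_scan2()).
 */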
71 struct hammer2_sync_info {
72 	hammer2_trans_t trans;
73 	int error;
74 	int waitfor;
75 };
76 
77 TAILQ_HEAD(hammer2_mntlist, hammer2_mount);
78 static struct hammer2_mntlist hammer2_mntlist;
79 static struct lock hammer2_mntlk;
80 
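/*
 * Debugging/tuning knobs and per-class I/O statistics, exported below
 * via the vfs.hammer2 sysctl tree.  The iod_* and ioa_* counters are
 * broken down by data class (file, meta, indirect, freemap, volume).
 */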
81 int hammer2_debug;
82 int hammer2_cluster_enable = 1;
83 int hammer2_hardlink_enable = 1;
84 int hammer2_flush_pipe = 100;
85 int hammer2_synchronous_flush = 1;
86 long hammer2_limit_dirty_chains;
87 long hammer2_iod_file_read;
88 long hammer2_iod_meta_read;
89 long hammer2_iod_indr_read;
90 long hammer2_iod_fmap_read;
91 long hammer2_iod_volu_read;
92 long hammer2_iod_file_write;
93 long hammer2_iod_meta_write;
94 long hammer2_iod_indr_write;
95 long hammer2_iod_fmap_write;
96 long hammer2_iod_volu_write;
97 long hammer2_ioa_file_read;
98 long hammer2_ioa_meta_read;
99 long hammer2_ioa_indr_read;
100 long hammer2_ioa_fmap_read;
101 long hammer2_ioa_volu_read;
102 long hammer2_ioa_fmap_write;
103 long hammer2_ioa_file_write;
104 long hammer2_ioa_meta_write;
105 long hammer2_ioa_indr_write;
106 long hammer2_ioa_volu_write;
107 
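/*
 * kmalloc types backing the objcache buffers used for compression
 * (cache_buffer_write) and decompression (cache_buffer_read).  The
 * caches themselves are created in hammer2_vfs_init().
 */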
108 MALLOC_DECLARE(C_BUFFER);
109 MALLOC_DEFINE(C_BUFFER, "compbuffer", "Buffer used for compression.");
110 
111 MALLOC_DECLARE(D_BUFFER);
112 MALLOC_DEFINE(D_BUFFER, "decompbuffer", "Buffer used for decompression.");
113 
114 SYSCTL_NODE(_vfs, OID_AUTO, hammer2, CTLFLAG_RW, 0, "HAMMER2 filesystem");
115 
116 SYSCTL_INT(_vfs_hammer2, OID_AUTO, debug, CTLFLAG_RW,
117 	   &hammer2_debug, 0, "");
118 SYSCTL_INT(_vfs_hammer2, OID_AUTO, cluster_enable, CTLFLAG_RW,
119 	   &hammer2_cluster_enable, 0, "");
120 SYSCTL_INT(_vfs_hammer2, OID_AUTO, hardlink_enable, CTLFLAG_RW,
121 	   &hammer2_hardlink_enable, 0, "");
122 SYSCTL_INT(_vfs_hammer2, OID_AUTO, flush_pipe, CTLFLAG_RW,
123 	   &hammer2_flush_pipe, 0, "");
124 SYSCTL_INT(_vfs_hammer2, OID_AUTO, synchronous_flush, CTLFLAG_RW,
125 	   &hammer2_synchronous_flush, 0, "");
126 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, limit_dirty_chains, CTLFLAG_RW,
127 	   &hammer2_limit_dirty_chains, 0, "");
128 
129 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_read, CTLFLAG_RW,
130 	   &hammer2_iod_file_read, 0, "");
131 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_meta_read, CTLFLAG_RW,
132 	   &hammer2_iod_meta_read, 0, "");
133 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_indr_read, CTLFLAG_RW,
134 	   &hammer2_iod_indr_read, 0, "");
135 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_fmap_read, CTLFLAG_RW,
136 	   &hammer2_iod_fmap_read, 0, "");
137 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_volu_read, CTLFLAG_RW,
138 	   &hammer2_iod_volu_read, 0, "");
139 
140 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_write, CTLFLAG_RW,
141 	   &hammer2_iod_file_write, 0, "");
142 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_meta_write, CTLFLAG_RW,
143 	   &hammer2_iod_meta_write, 0, "");
144 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_indr_write, CTLFLAG_RW,
145 	   &hammer2_iod_indr_write, 0, "");
146 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_fmap_write, CTLFLAG_RW,
147 	   &hammer2_iod_fmap_write, 0, "");
148 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_volu_write, CTLFLAG_RW,
149 	   &hammer2_iod_volu_write, 0, "");
150 
151 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_file_read, CTLFLAG_RW,
152 	   &hammer2_ioa_file_read, 0, "");
153 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_meta_read, CTLFLAG_RW,
154 	   &hammer2_ioa_meta_read, 0, "");
155 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_indr_read, CTLFLAG_RW,
156 	   &hammer2_ioa_indr_read, 0, "");
157 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_fmap_read, CTLFLAG_RW,
158 	   &hammer2_ioa_fmap_read, 0, "");
159 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_volu_read, CTLFLAG_RW,
160 	   &hammer2_ioa_volu_read, 0, "");
161 
162 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_file_write, CTLFLAG_RW,
163 	   &hammer2_ioa_file_write, 0, "");
164 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_meta_write, CTLFLAG_RW,
165 	   &hammer2_ioa_meta_write, 0, "");
166 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_indr_write, CTLFLAG_RW,
167 	   &hammer2_ioa_indr_write, 0, "");
168 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_fmap_write, CTLFLAG_RW,
169 	   &hammer2_ioa_fmap_write, 0, "");
170 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_volu_write, CTLFLAG_RW,
171 	   &hammer2_ioa_volu_write, 0, "");
172 
173 static int hammer2_vfs_init(struct vfsconf *conf);
174 static int hammer2_vfs_uninit(struct vfsconf *vfsp);
175 static int hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
176 				struct ucred *cred);
177 static int hammer2_remount(hammer2_mount_t *, struct mount *, char *,
178 				struct vnode *, struct ucred *);
179 static int hammer2_recovery(hammer2_mount_t *hmp);
180 static int hammer2_vfs_unmount(struct mount *mp, int mntflags);
181 static int hammer2_vfs_root(struct mount *mp, struct vnode **vpp);
182 static int hammer2_vfs_statfs(struct mount *mp, struct statfs *sbp,
183 				struct ucred *cred);
184 static int hammer2_vfs_statvfs(struct mount *mp, struct statvfs *sbp,
185 				struct ucred *cred);
186 static int hammer2_vfs_vget(struct mount *mp, struct vnode *dvp,
187 				ino_t ino, struct vnode **vpp);
188 static int hammer2_vfs_fhtovp(struct mount *mp, struct vnode *rootvp,
189 				struct fid *fhp, struct vnode **vpp);
190 static int hammer2_vfs_vptofh(struct vnode *vp, struct fid *fhp);
191 static int hammer2_vfs_checkexp(struct mount *mp, struct sockaddr *nam,
192 				int *exflagsp, struct ucred **credanonp);
193 
194 static int hammer2_install_volume_header(hammer2_mount_t *hmp);
195 static int hammer2_sync_scan2(struct mount *mp, struct vnode *vp, void *data);
196 
197 static void hammer2_write_thread(void *arg);
198 
199 static void hammer2_vfs_unmount_hmp1(struct mount *mp, hammer2_mount_t *hmp);
200 static void hammer2_vfs_unmount_hmp2(struct mount *mp, hammer2_mount_t *hmp);
201 
202 /*
203  * Compression-related write support functions used by the write
204  * thread (from hammer2_vnops.c).
205  */
206 static void hammer2_write_file_core(struct buf *bp, hammer2_trans_t *trans,
207 				hammer2_inode_t *ip,
208 				hammer2_inode_data_t *ipdata,
209 				hammer2_cluster_t *cparent,
210 				hammer2_key_t lbase, int ioflag, int pblksize,
211 				int *errorp);
212 static void hammer2_compress_and_write(struct buf *bp, hammer2_trans_t *trans,
213 				hammer2_inode_t *ip,
214 				hammer2_inode_data_t *ipdata,
215 				hammer2_cluster_t *cparent,
216 				hammer2_key_t lbase, int ioflag,
217 				int pblksize, int *errorp, int comp_algo);
218 static void hammer2_zero_check_and_write(struct buf *bp,
219 				hammer2_trans_t *trans, hammer2_inode_t *ip,
220 				hammer2_inode_data_t *ipdata,
221 				hammer2_cluster_t *cparent,
222 				hammer2_key_t lbase,
223 				int ioflag, int pblksize, int *errorp);
224 static int test_block_zeros(const char *buf, size_t bytes);
225 static void zero_write(struct buf *bp, hammer2_trans_t *trans,
226 				hammer2_inode_t *ip,
227 				hammer2_inode_data_t *ipdata,
228 				hammer2_cluster_t *cparent,
229 				hammer2_key_t lbase,
230 				int *errorp);
231 static void hammer2_write_bp(hammer2_cluster_t *cluster, struct buf *bp,
232 				int ioflag, int pblksize, int *errorp);
233 
234 static int hammer2_rcvdmsg(kdmsg_msg_t *msg);
235 static void hammer2_autodmsg(kdmsg_msg_t *msg);
236 
237 
238 /*
239  * HAMMER2 vfs operations.
240  */
241 static struct vfsops hammer2_vfsops = {
242 	.vfs_init	= hammer2_vfs_init,
243 	.vfs_uninit	= hammer2_vfs_uninit,
244 	.vfs_sync	= hammer2_vfs_sync,
245 	.vfs_mount	= hammer2_vfs_mount,
246 	.vfs_unmount	= hammer2_vfs_unmount,
247 	.vfs_root	= hammer2_vfs_root,
248 	.vfs_statfs	= hammer2_vfs_statfs,
249 	.vfs_statvfs	= hammer2_vfs_statvfs,
250 	.vfs_vget	= hammer2_vfs_vget,
251 	.vfs_vptofh	= hammer2_vfs_vptofh,
252 	.vfs_fhtovp	= hammer2_vfs_fhtovp,
253 	.vfs_checkexp	= hammer2_vfs_checkexp
254 };
255 
256 MALLOC_DEFINE(M_HAMMER2, "HAMMER2-mount", "");
257 
258 VFS_SET(hammer2_vfsops, hammer2, 0);
259 MODULE_VERSION(hammer2, 1);
260 
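/*
 * VFS initialization: sanity-check the on-media structure sizes, create
 * the shared compression/decompression objcaches and set the default
 * dirty-chain limit.
 */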
261 static
262 int
263 hammer2_vfs_init(struct vfsconf *conf)
264 {
265 	static struct objcache_malloc_args margs_read;
266 	static struct objcache_malloc_args margs_write;
267 
268 	int error;
269 
270 	error = 0;
271 
272 	if (HAMMER2_BLOCKREF_BYTES != sizeof(struct hammer2_blockref))
273 		error = EINVAL;
274 	if (HAMMER2_INODE_BYTES != sizeof(struct hammer2_inode_data))
275 		error = EINVAL;
276 	if (HAMMER2_VOLUME_BYTES != sizeof(struct hammer2_volume_data))
277 		error = EINVAL;
278 
279 	if (error)
280 		kprintf("HAMMER2 structure size mismatch; cannot continue.\n");
281 
282 	margs_read.objsize = 65536;
283 	margs_read.mtype = D_BUFFER;
284 
285 	margs_write.objsize = 32768;
286 	margs_write.mtype = C_BUFFER;
287 
288 	cache_buffer_read = objcache_create(margs_read.mtype->ks_shortdesc,
289 				0, 1, NULL, NULL, NULL, objcache_malloc_alloc,
290 				objcache_malloc_free, &margs_read);
291 	cache_buffer_write = objcache_create(margs_write.mtype->ks_shortdesc,
292 				0, 1, NULL, NULL, NULL, objcache_malloc_alloc,
293 				objcache_malloc_free, &margs_write);
294 
295 	lockinit(&hammer2_mntlk, "mntlk", 0, 0);
296 	TAILQ_INIT(&hammer2_mntlist);
297 
298 	hammer2_limit_dirty_chains = desiredvnodes / 10;
299 
300 	return (error);
301 }
302 
303 static
304 int
305 hammer2_vfs_uninit(struct vfsconf *vfsp __unused)
306 {
307 	objcache_destroy(cache_buffer_read);
308 	objcache_destroy(cache_buffer_write);
309 	return 0;
310 }
311 
312 /*
313  * Mount or remount HAMMER2 filesystem from physical media
314  *
315  *	mountroot
316  *		mp		mount point structure
317  *		path		NULL
318  *		data		<unused>
319  *		cred		<unused>
320  *
321  *	mount
322  *		mp		mount point structure
323  *		path		path to mount point
324  *		data		pointer to argument structure in user space
325  *			volume	volume path (device@LABEL form, e.g. /dev/ad0s1a@LOCAL)
326  *			hflags	user mount flags
327  *		cred		user credentials
328  *
329  * RETURNS:	0	Success
330  *		!0	error number
331  */
332 static
333 int
334 hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
335 		  struct ucred *cred)
336 {
337 	struct hammer2_mount_info info;
338 	hammer2_pfsmount_t *pmp;
339 	hammer2_mount_t *hmp;
340 	hammer2_key_t key_next;
341 	hammer2_key_t key_dummy;
342 	hammer2_key_t lhc;
343 	struct vnode *devvp;
344 	struct nlookupdata nd;
345 	hammer2_chain_t *parent;
346 	hammer2_chain_t *rchain;
347 	hammer2_chain_t *schain;
348 	hammer2_cluster_t *cluster;
349 	hammer2_cluster_t *cparent;
350 	struct file *fp;
351 	char devstr[MNAMELEN];
352 	size_t size;
353 	size_t done;
354 	char *dev;
355 	char *label;
356 	int ronly = 1;
357 	int error;
358 	int cache_index;
359 	int ddflag;
360 	int i;
361 
362 	hmp = NULL;
363 	pmp = NULL;
364 	dev = NULL;
365 	label = NULL;
366 	devvp = NULL;
367 	cache_index = -1;
368 
369 	kprintf("hammer2_mount\n");
370 
371 	if (path == NULL) {
372 		/*
373 		 * Root mount
374 		 */
375 		bzero(&info, sizeof(info));
376 		info.cluster_fd = -1;
377 		return (EOPNOTSUPP);
378 	} else {
379 		/*
380 		 * Non-root mount or updating a mount
381 		 */
382 		error = copyin(data, &info, sizeof(info));
383 		if (error)
384 			return (error);
385 
386 		error = copyinstr(info.volume, devstr, MNAMELEN - 1, &done);
387 		if (error)
388 			return (error);
389 
390 		/* Extract device and label */
391 		dev = devstr;
392 		label = strchr(devstr, '@');
393 		if (label == NULL ||
394 		    ((label + 1) - dev) > done) {
395 			return (EINVAL);
396 		}
397 		*label = '\0';
398 		label++;
399 		if (*label == '\0')
400 			return (EINVAL);
401 
402 		if (mp->mnt_flag & MNT_UPDATE) {
403 			/* Update mount */
404 			/* HAMMER2 implements NFS export via mountctl */
405 			pmp = MPTOPMP(mp);
406 			for (i = 0; i < pmp->cluster.nchains; ++i) {
407 				hmp = pmp->cluster.array[i]->hmp;
408 				devvp = hmp->devvp;
409 				error = hammer2_remount(hmp, mp, path,
410 							devvp, cred);
411 				if (error)
412 					break;
413 			}
414 			hammer2_inode_install_hidden(pmp);
415 
416 			return error;
417 		}
418 	}
419 
420 	/*
421 	 * PFS mount
422 	 *
423 	 * Lookup name and verify it refers to a block device.
424 	 */
425 	error = nlookup_init(&nd, dev, UIO_SYSSPACE, NLC_FOLLOW);
426 	if (error == 0)
427 		error = nlookup(&nd);
428 	if (error == 0)
429 		error = cache_vref(&nd.nl_nch, nd.nl_cred, &devvp);
430 	nlookup_done(&nd);
431 
432 	if (error == 0) {
433 		if (vn_isdisk(devvp, &error))
434 			error = vfs_mountedon(devvp);
435 	}
436 
437 	/*
438 	 * Determine if the device has already been mounted.  After this
439 	 * check hmp will be non-NULL if this is a second or subsequent
440 	 * hammer2 mount from the same device.
441 	 */
442 	lockmgr(&hammer2_mntlk, LK_EXCLUSIVE);
443 	TAILQ_FOREACH(hmp, &hammer2_mntlist, mntentry) {
444 		if (hmp->devvp == devvp)
445 			break;
446 	}
447 
448 	/*
449 	 * Open the device if this isn't a secondary mount and construct
450 	 * the H2 device mount (hmp).
451 	 */
452 	if (hmp == NULL) {
453 		if (error == 0 && vcount(devvp) > 0)
454 			error = EBUSY;
455 
456 		/*
457 		 * Now open the device
458 		 */
459 		if (error == 0) {
460 			ronly = ((mp->mnt_flag & MNT_RDONLY) != 0);
461 			vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
462 			error = vinvalbuf(devvp, V_SAVE, 0, 0);
463 			if (error == 0) {
464 				error = VOP_OPEN(devvp,
465 						 ronly ? FREAD : FREAD | FWRITE,
466 						 FSCRED, NULL);
467 			}
468 			vn_unlock(devvp);
469 		}
470 		if (error && devvp) {
471 			vrele(devvp);
472 			devvp = NULL;
473 		}
474 		if (error) {
475 			lockmgr(&hammer2_mntlk, LK_RELEASE);
476 			return error;
477 		}
478 		hmp = kmalloc(sizeof(*hmp), M_HAMMER2, M_WAITOK | M_ZERO);
479 		hmp->ronly = ronly;
480 		hmp->devvp = devvp;
481 		kmalloc_create(&hmp->mchain, "HAMMER2-chains");
482 		TAILQ_INSERT_TAIL(&hammer2_mntlist, hmp, mntentry);
483 		RB_INIT(&hmp->iotree);
484 
485 		lockinit(&hmp->alloclk, "h2alloc", 0, 0);
486 		lockinit(&hmp->voldatalk, "voldata", 0, LK_CANRECURSE);
487 		TAILQ_INIT(&hmp->transq);
488 
489 		/*
490 		 * vchain setup. vchain.data is embedded.
491 		 * vchain.refs is initialized and will never drop to 0.
492 		 *
493 		 * NOTE! voldata is not yet loaded.
494 		 */
495 		hmp->vchain.hmp = hmp;
496 		hmp->vchain.refs = 1;
497 		hmp->vchain.data = (void *)&hmp->voldata;
498 		hmp->vchain.bref.type = HAMMER2_BREF_TYPE_VOLUME;
499 		hmp->vchain.bref.data_off = 0 | HAMMER2_PBUFRADIX;
500 		hmp->vchain.delete_tid = HAMMER2_MAX_TID;
501 
502 		hammer2_chain_core_alloc(NULL, &hmp->vchain, NULL);
503 		/* hmp->vchain.u.xxx is left NULL */
504 
505 		/*
506 		 * fchain setup.  fchain.data is embedded.
507 		 * fchain.refs is initialized and will never drop to 0.
508 		 *
509 		 * The data is not used but needs to be initialized to
510 		 * pass assertion muster.  We use this chain primarily
511 		 * as a placeholder for the freemap's top-level RBTREE
512 		 * so it does not interfere with the volume's topology
513 		 * RBTREE.
514 		 */
515 		hmp->fchain.hmp = hmp;
516 		hmp->fchain.refs = 1;
517 		hmp->fchain.data = (void *)&hmp->voldata.freemap_blockset;
518 		hmp->fchain.bref.type = HAMMER2_BREF_TYPE_FREEMAP;
519 		hmp->fchain.bref.data_off = 0 | HAMMER2_PBUFRADIX;
520 		hmp->fchain.bref.methods =
521 			HAMMER2_ENC_CHECK(HAMMER2_CHECK_FREEMAP) |
522 			HAMMER2_ENC_COMP(HAMMER2_COMP_NONE);
523 		hmp->fchain.delete_tid = HAMMER2_MAX_TID;
524 
525 		hammer2_chain_core_alloc(NULL, &hmp->fchain, NULL);
526 		/* hmp->fchain.u.xxx is left NULL */
527 
528 		/*
529 		 * Install the volume header and initialize fields from
530 		 * voldata.
531 		 */
532 		error = hammer2_install_volume_header(hmp);
533 		if (error) {
534 			++hmp->pmp_count;
535 			hammer2_vfs_unmount_hmp1(mp, hmp);
536 			hammer2_vfs_unmount_hmp2(mp, hmp);
537 			hammer2_vfs_unmount(mp, MNT_FORCE);
538 			return error;
539 		}
540 
541 		/*
542 		 * Really important to get these right or flush will get
543 		 * confused.
544 		 */
545 		hmp->vchain.bref.mirror_tid = hmp->voldata.mirror_tid;
546 		hmp->vchain.modify_tid = hmp->voldata.mirror_tid;
547 		hmp->vchain.update_lo = hmp->voldata.mirror_tid;
548 		hmp->fchain.bref.mirror_tid = hmp->voldata.freemap_tid;
549 		hmp->fchain.modify_tid = hmp->voldata.freemap_tid;
550 		hmp->fchain.update_lo = hmp->voldata.freemap_tid;
551 
552 		/*
553 		 * First locate the super-root inode, which is key 0
554 		 * relative to the volume header's blockset.
555 		 *
556 		 * Then locate the root inode by scanning the directory keyspace
557 		 * represented by the label.
558 		 */
559 		parent = hammer2_chain_lookup_init(&hmp->vchain, 0);
560 		schain = hammer2_chain_lookup(&parent, &key_dummy,
561 				      HAMMER2_SROOT_KEY, HAMMER2_SROOT_KEY,
562 				      &cache_index, 0, &ddflag);
563 		hammer2_chain_lookup_done(parent);
564 		if (schain == NULL) {
565 			kprintf("hammer2_mount: invalid super-root\n");
566 			++hmp->pmp_count;
567 			hammer2_vfs_unmount_hmp1(mp, hmp);
568 			hammer2_vfs_unmount_hmp2(mp, hmp);
569 			hammer2_vfs_unmount(mp, MNT_FORCE);
570 			return EINVAL;
571 		}
572 
573 		/*
574 		 * NOTE: inode_get sucks up schain's lock.
575 		 */
576 		atomic_set_int(&schain->flags, HAMMER2_CHAIN_PFSROOT);
577 		cluster = hammer2_cluster_from_chain(schain);
578 		hmp->sroot = hammer2_inode_get(NULL, NULL, cluster);
579 		hammer2_inode_ref(hmp->sroot);
580 		hammer2_inode_unlock_ex(hmp->sroot, cluster);
581 		schain = NULL;
582 		/* leave hmp->sroot with one ref */
583 
584 		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
585 			error = hammer2_recovery(hmp);
586 			/* XXX do something with error */
587 		}
588 	}
589 
590 	/*
591 	 * Block device opened successfully, finish initializing the
592 	 * mount structure.
593 	 *
594 	 * From this point on we have to call hammer2_unmount() on failure.
595 	 */
596 	pmp = kmalloc(sizeof(*pmp), M_HAMMER2, M_WAITOK | M_ZERO);
597 
598 	kmalloc_create(&pmp->minode, "HAMMER2-inodes");
599 	kmalloc_create(&pmp->mmsg, "HAMMER2-pfsmsg");
600 	lockinit(&pmp->lock, "pfslk", 0, 0);
601 	spin_init(&pmp->inum_spin);
602 	RB_INIT(&pmp->inum_tree);
603 	TAILQ_INIT(&pmp->unlinkq);
604 	spin_init(&pmp->unlinkq_spin);
605 	pmp->cluster.flags = HAMMER2_CLUSTER_PFS;
606 
607 	kdmsg_iocom_init(&pmp->iocom, pmp,
608 			 KDMSG_IOCOMF_AUTOCONN |
609 			 KDMSG_IOCOMF_AUTOSPAN |
610 			 KDMSG_IOCOMF_AUTOCIRC,
611 			 pmp->mmsg, hammer2_rcvdmsg);
612 
613 	ccms_domain_init(&pmp->ccms_dom);
614 	++hmp->pmp_count;
615 	lockmgr(&hammer2_mntlk, LK_RELEASE);
616 	kprintf("hammer2_mount hmp=%p pmp=%p pmpcnt=%d\n",
617 		hmp, pmp, hmp->pmp_count);
618 
619 	mp->mnt_flag = MNT_LOCAL;
620 	mp->mnt_kern_flag |= MNTK_ALL_MPSAFE;	/* all entry pts are SMP */
621 	mp->mnt_kern_flag |= MNTK_THR_SYNC;	/* new vsyncscan semantics */
622 
623 	/*
624 	 * required mount structure initializations
625 	 */
626 	mp->mnt_stat.f_iosize = HAMMER2_PBUFSIZE;
627 	mp->mnt_stat.f_bsize = HAMMER2_PBUFSIZE;
628 
629 	mp->mnt_vstat.f_frsize = HAMMER2_PBUFSIZE;
630 	mp->mnt_vstat.f_bsize = HAMMER2_PBUFSIZE;
631 
632 	/*
633 	 * Optional fields
634 	 */
635 	mp->mnt_iosize_max = MAXPHYS;
636 	mp->mnt_data = (qaddr_t)pmp;
637 	pmp->mp = mp;
638 
639 	/*
640 	 * Lookup mount point under the media-localized super-root.
641 	 */
642 	cparent = hammer2_inode_lock_ex(hmp->sroot);
643 	lhc = hammer2_dirhash(label, strlen(label));
644 	cluster = hammer2_cluster_lookup(cparent, &key_next,
645 				      lhc, lhc + HAMMER2_DIRHASH_LOMASK,
646 				      0, &ddflag);
647 	while (cluster) {
648 		if (hammer2_cluster_type(cluster) == HAMMER2_BREF_TYPE_INODE &&
649 		    strcmp(label,
650 		       hammer2_cluster_data(cluster)->ipdata.filename) == 0) {
651 			break;
652 		}
653 		cluster = hammer2_cluster_next(cparent, cluster, &key_next,
654 					    key_next,
655 					    lhc + HAMMER2_DIRHASH_LOMASK, 0);
656 	}
657 	hammer2_inode_unlock_ex(hmp->sroot, cparent);
658 
659 	if (cluster == NULL) {
660 		kprintf("hammer2_mount: PFS label not found\n");
661 		hammer2_vfs_unmount_hmp1(mp, hmp);
662 		hammer2_vfs_unmount_hmp2(mp, hmp);
663 		hammer2_vfs_unmount(mp, MNT_FORCE);
664 		return EINVAL;
665 	}
666 
667 	for (i = 0; i < cluster->nchains; ++i) {
668 		rchain = cluster->array[i];
669 		if (rchain->flags & HAMMER2_CHAIN_MOUNTED) {
670 			kprintf("hammer2_mount: PFS label already mounted!\n");
671 			hammer2_cluster_unlock(cluster);
672 			hammer2_vfs_unmount_hmp1(mp, hmp);
673 			hammer2_vfs_unmount_hmp2(mp, hmp);
674 			hammer2_vfs_unmount(mp, MNT_FORCE);
675 			return EBUSY;
676 		}
677 #if 0
678 		if (rchain->flags & HAMMER2_CHAIN_RECYCLE) {
679 			kprintf("hammer2_mount: PFS label is recycling\n");
680 			hammer2_cluster_unlock(cluster);
681 			hammer2_vfs_unmount_hmp1(mp, hmp);
682 			hammer2_vfs_unmount_hmp2(mp, hmp);
683 			hammer2_vfs_unmount(mp, MNT_FORCE);
684 			return EBUSY;
685 		}
686 #endif
687 	}
688 
689 	/*
690 	 * After this point hammer2_vfs_unmount() has visibility on hmp
691 	 * and manual hmp1/hmp2 calls are not needed on fatal errors.
692 	 */
693 	pmp->cluster = *cluster;
694 	KKASSERT(pmp->cluster.refs == 1);
695 	for (i = 0; i < cluster->nchains; ++i) {
696 		rchain = cluster->array[i];
697 		KKASSERT(rchain->pmp == NULL);	/* tracking pmp for rchain */
698 		rchain->pmp = pmp;
699 		atomic_set_int(&rchain->flags, HAMMER2_CHAIN_MOUNTED);
700 		hammer2_chain_ref(rchain);	/* ref for pmp->cluster */
701 	}
702 	pmp->iroot = hammer2_inode_get(pmp, NULL, cluster);
703 	hammer2_inode_ref(pmp->iroot);		/* ref for pmp->iroot */
704 	hammer2_inode_unlock_ex(pmp->iroot, cluster);
705 
706 	kprintf("iroot %p\n", pmp->iroot);
707 
708 	/*
709 	 * The logical file buffer bio write thread handles things
710 	 * like physical block assignment and compression.
711 	 */
712 	mtx_init(&pmp->wthread_mtx);
713 	bioq_init(&pmp->wthread_bioq);
714 	pmp->wthread_destroy = 0;
715 	lwkt_create(hammer2_write_thread, pmp,
716 		    &pmp->wthread_td, NULL, 0, -1, "hwrite-%s", label);
717 
718 	/*
719 	 * Ref the cluster management messaging descriptor.  The mount
720 	 * program deals with the other end of the communications pipe.
721 	 */
722 	fp = holdfp(curproc->p_fd, info.cluster_fd, -1);
723 	if (fp == NULL) {
724 		kprintf("hammer2_mount: bad cluster_fd!\n");
725 		hammer2_vfs_unmount(mp, MNT_FORCE);
726 		return EBADF;
727 	}
728 	hammer2_cluster_reconnect(pmp, fp);
729 
730 	/*
731 	 * With the cluster operational install ihidden.
732 	 */
733 	hammer2_inode_install_hidden(pmp);
734 
735 	/*
736 	 * Finish setup
737 	 */
738 	vfs_getnewfsid(mp);
739 	vfs_add_vnodeops(mp, &hammer2_vnode_vops, &mp->mnt_vn_norm_ops);
740 	vfs_add_vnodeops(mp, &hammer2_spec_vops, &mp->mnt_vn_spec_ops);
741 	vfs_add_vnodeops(mp, &hammer2_fifo_vops, &mp->mnt_vn_fifo_ops);
742 
743 	copyinstr(info.volume, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, &size);
744 	bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size);
745 	bzero(mp->mnt_stat.f_mntonname, sizeof(mp->mnt_stat.f_mntonname));
746 	copyinstr(path, mp->mnt_stat.f_mntonname,
747 		  sizeof(mp->mnt_stat.f_mntonname) - 1,
748 		  &size);
749 
750 	/*
751 	 * Initial statfs to prime mnt_stat.
752 	 */
753 	hammer2_vfs_statfs(mp, &mp->mnt_stat, cred);
754 
755 	return 0;
756 }
757 
758 /*
759  * Handle bioq for strategy write
760  */
761 static
762 void
763 hammer2_write_thread(void *arg)
764 {
765 	hammer2_pfsmount_t *pmp;
766 	struct bio *bio;
767 	struct buf *bp;
768 	hammer2_trans_t trans;
769 	struct vnode *vp;
770 	hammer2_inode_t *ip;
771 	hammer2_cluster_t *cparent;
772 	hammer2_inode_data_t *ipdata;
773 	hammer2_key_t lbase;
774 	int lblksize;
775 	int pblksize;
776 	int error;
777 
778 	pmp = arg;
779 
780 	mtx_lock(&pmp->wthread_mtx);
781 	while (pmp->wthread_destroy == 0) {
782 		if (bioq_first(&pmp->wthread_bioq) == NULL) {
783 			mtxsleep(&pmp->wthread_bioq, &pmp->wthread_mtx,
784 				 0, "h2bioqw", 0);
785 		}
786 		cparent = NULL;
787 
788 		hammer2_trans_init(&trans, pmp, NULL, HAMMER2_TRANS_BUFCACHE);
789 
790 		while ((bio = bioq_takefirst(&pmp->wthread_bioq)) != NULL) {
791 			/*
792 			 * dummy bio for synchronization.  The transaction
793 			 * must be reinitialized.
794 			 */
795 			if (bio->bio_buf == NULL) {
796 				bio->bio_flags |= BIO_DONE;
797 				wakeup(bio);
798 				hammer2_trans_done(&trans);
799 				hammer2_trans_init(&trans, pmp, NULL,
800 						   HAMMER2_TRANS_BUFCACHE);
801 				continue;
802 			}
803 
804 			/*
805 			 * else normal bio processing
806 			 */
807 			mtx_unlock(&pmp->wthread_mtx);
808 
809 			hammer2_lwinprog_drop(pmp);
810 
811 			error = 0;
812 			bp = bio->bio_buf;
813 			vp = bp->b_vp;
814 			ip = VTOI(vp);
815 
816 			/*
817 			 * Inode is modified, flush size and mtime changes
818 			 * to ensure that the file size remains consistent
819 			 * with the buffers being flushed.
820 			 *
821 			 * NOTE: The inode_fsync() call only flushes the
822 			 *	 inode's meta-data state, it doesn't try
823 			 *	 to flush underlying buffers or chains.
824 			 */
825 			cparent = hammer2_inode_lock_ex(ip);
826 			if (ip->flags & (HAMMER2_INODE_RESIZED |
827 					 HAMMER2_INODE_MTIME)) {
828 				hammer2_inode_fsync(&trans, ip, cparent);
829 			}
830 			ipdata = hammer2_cluster_modify_ip(&trans, ip,
831 							 cparent, 0);
832 			lblksize = hammer2_calc_logical(ip, bio->bio_offset,
833 							&lbase, NULL);
834 			pblksize = hammer2_calc_physical(ip, ipdata, lbase);
835 			hammer2_write_file_core(bp, &trans, ip, ipdata,
836 						cparent,
837 						lbase, IO_ASYNC,
838 						pblksize, &error);
839 			hammer2_inode_unlock_ex(ip, cparent);
840 			if (error) {
841 				kprintf("hammer2: error in buffer write\n");
842 				bp->b_flags |= B_ERROR;
843 				bp->b_error = EIO;
844 			}
845 			biodone(bio);
846 			mtx_lock(&pmp->wthread_mtx);
847 		}
848 		hammer2_trans_done(&trans);
849 	}
850 	pmp->wthread_destroy = -1;
851 	wakeup(&pmp->wthread_destroy);
852 
853 	mtx_unlock(&pmp->wthread_mtx);
854 }
855 
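/*
 * Wait for all bios queued to the write thread to complete.  A dummy bio
 * with no bio_buf is inserted as a barrier and we sleep until the thread
 * marks it BIO_DONE.
 */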
856 void
857 hammer2_bioq_sync(hammer2_pfsmount_t *pmp)
858 {
859 	struct bio sync_bio;
860 
861 	bzero(&sync_bio, sizeof(sync_bio));	/* dummy with no bio_buf */
862 	mtx_lock(&pmp->wthread_mtx);
863 	if (pmp->wthread_destroy == 0 &&
864 	    TAILQ_FIRST(&pmp->wthread_bioq.queue)) {
865 		bioq_insert_tail(&pmp->wthread_bioq, &sync_bio);
866 		while ((sync_bio.bio_flags & BIO_DONE) == 0)
867 			mtxsleep(&sync_bio, &pmp->wthread_mtx, 0, "h2bioq", 0);
868 	}
869 	mtx_unlock(&pmp->wthread_mtx);
870 }
871 
872 /*
873  * Return a chain suitable for I/O, creating the chain if necessary
874  * and assigning its physical block.
875  */
876 static
877 hammer2_cluster_t *
878 hammer2_assign_physical(hammer2_trans_t *trans,
879 			hammer2_inode_t *ip, hammer2_cluster_t *cparent,
880 			hammer2_key_t lbase, int pblksize, int *errorp)
881 {
882 	hammer2_cluster_t *cluster;
883 	hammer2_cluster_t *dparent;
884 	hammer2_key_t key_dummy;
885 	int pradix = hammer2_getradix(pblksize);
886 	int ddflag;
887 
888 	/*
889 	 * Locate the chain associated with lbase, return a locked chain.
890 	 * However, do not instantiate any data reference (which utilizes a
891 	 * device buffer) because we will be using direct IO via the
892 	 * logical buffer cache buffer.
893 	 */
894 	*errorp = 0;
895 	KKASSERT(pblksize >= HAMMER2_MIN_ALLOC);
896 retry:
897 	dparent = hammer2_cluster_lookup_init(cparent, 0);
898 	cluster = hammer2_cluster_lookup(dparent, &key_dummy,
899 				     lbase, lbase,
900 				     HAMMER2_LOOKUP_NODATA, &ddflag);
901 
902 	if (cluster == NULL) {
903 		/*
904 		 * We found a hole, create a new chain entry.
905 		 *
906 		 * NOTE: DATA chains are created without device backing
907 		 *	 store (nor do we want any).
908 		 */
909 		*errorp = hammer2_cluster_create(trans, dparent, &cluster,
910 					       lbase, HAMMER2_PBUFRADIX,
911 					       HAMMER2_BREF_TYPE_DATA,
912 					       pblksize);
913 		if (cluster == NULL) {
914 			hammer2_cluster_lookup_done(dparent);
915 			panic("hammer2_cluster_create: par=%p error=%d\n",
916 				dparent->focus, *errorp);
917 			goto retry;
918 		}
919 		/*ip->delta_dcount += pblksize;*/
920 	} else {
921 		switch (hammer2_cluster_type(cluster)) {
922 		case HAMMER2_BREF_TYPE_INODE:
923 			/*
924 			 * The data is embedded in the inode.  The
925 			 * caller is responsible for marking the inode
926 			 * modified and copying the data to the embedded
927 			 * area.
928 			 */
929 			break;
930 		case HAMMER2_BREF_TYPE_DATA:
931 			if (hammer2_cluster_bytes(cluster) != pblksize) {
932 				hammer2_cluster_resize(trans, ip,
933 						     dparent, cluster,
934 						     pradix,
935 						     HAMMER2_MODIFY_OPTDATA);
936 			}
937 			hammer2_cluster_modify(trans, cluster,
938 					     HAMMER2_MODIFY_OPTDATA);
939 			break;
940 		default:
941 			panic("hammer2_assign_physical: bad type");
942 			/* NOT REACHED */
943 			break;
944 		}
945 	}
946 
947 	/*
948 	 * Cleanup.  If cluster wound up being the inode itself, i.e.
949 	 * the DIRECTDATA case for offset 0, then we need to update cparent.
950 	 * The caller expects cparent to not become stale.
951 	 */
952 	hammer2_cluster_lookup_done(dparent);
953 	/* dparent = NULL; safety */
954 	if (cluster && ddflag)
955 		hammer2_cluster_replace_locked(cparent, cluster);
956 	return (cluster);
957 }
958 
959 /*
960  * From hammer2_vnops.c.
961  * The core write function which determines which path to take
962  * depending on compression settings.
963  */
964 static
965 void
966 hammer2_write_file_core(struct buf *bp, hammer2_trans_t *trans,
967 			hammer2_inode_t *ip, hammer2_inode_data_t *ipdata,
968 			hammer2_cluster_t *cparent,
969 			hammer2_key_t lbase, int ioflag, int pblksize,
970 			int *errorp)
971 {
972 	hammer2_cluster_t *cluster;
973 
974 	switch(HAMMER2_DEC_COMP(ipdata->comp_algo)) {
975 	case HAMMER2_COMP_NONE:
976 		/*
977 		 * We have to assign physical storage to the buffer
978 		 * we intend to dirty or write now to avoid deadlocks
979 		 * in the strategy code later.
980 		 *
981 		 * This can return NOOFFSET for inode-embedded data.
982 		 * The strategy code will take care of it in that case.
983 		 */
984 		cluster = hammer2_assign_physical(trans, ip, cparent,
985 						lbase, pblksize,
986 						errorp);
987 		hammer2_write_bp(cluster, bp, ioflag, pblksize, errorp);
988 		if (cluster)
989 			hammer2_cluster_unlock(cluster);
990 		break;
991 	case HAMMER2_COMP_AUTOZERO:
992 		/*
993 		 * Check for zero-fill only
994 		 */
995 		hammer2_zero_check_and_write(bp, trans, ip,
996 				    ipdata, cparent, lbase,
997 				    ioflag, pblksize, errorp);
998 		break;
999 	case HAMMER2_COMP_LZ4:
1000 	case HAMMER2_COMP_ZLIB:
1001 	default:
1002 		/*
1003 		 * Check for zero-fill and attempt compression.
1004 		 */
1005 		hammer2_compress_and_write(bp, trans, ip,
1006 					   ipdata, cparent,
1007 					   lbase, ioflag,
1008 					   pblksize, errorp,
1009 					   ipdata->comp_algo);
1010 		break;
1011 	}
1012 }
1013 
1014 /*
1015  * Generic function that performs the compression in the compression
1016  * write path.  The compression algorithm is determined by the settings
1017  * obtained from the inode.
1018  */
1019 static
1020 void
1021 hammer2_compress_and_write(struct buf *bp, hammer2_trans_t *trans,
1022 	hammer2_inode_t *ip, hammer2_inode_data_t *ipdata,
1023 	hammer2_cluster_t *cparent,
1024 	hammer2_key_t lbase, int ioflag, int pblksize,
1025 	int *errorp, int comp_algo)
1026 {
1027 	hammer2_cluster_t *cluster;
1028 	hammer2_chain_t *chain;
1029 	int comp_size;
1030 	int comp_block_size;
1031 	int i;
1032 	char *comp_buffer;
1033 
1034 	if (test_block_zeros(bp->b_data, pblksize)) {
1035 		zero_write(bp, trans, ip, ipdata, cparent, lbase, errorp);
1036 		return;
1037 	}
1038 
1039 	comp_size = 0;
1040 	comp_buffer = NULL;
1041 
1042 	KKASSERT(pblksize / 2 <= 32768);
1043 
1044 	if (ip->comp_heuristic < 8 || (ip->comp_heuristic & 7) == 0) {
1045 		z_stream strm_compress;
1046 		int comp_level;
1047 		int ret;
1048 
1049 		switch(HAMMER2_DEC_COMP(comp_algo)) {
1050 		case HAMMER2_COMP_LZ4:
1051 			comp_buffer = objcache_get(cache_buffer_write,
1052 						   M_INTWAIT);
1053 			comp_size = LZ4_compress_limitedOutput(
1054 					bp->b_data,
1055 					&comp_buffer[sizeof(int)],
1056 					pblksize,
1057 					pblksize / 2 - sizeof(int));
1058 			/*
1059 			 * We need to prefix the data with its size;
1060 			 * LZ4 doesn't do it for us.  Account for the
1061 			 * related overhead.
1062 			 */
1063 			*(int *)comp_buffer = comp_size;
1064 			if (comp_size)
1065 				comp_size += sizeof(int);
1066 			break;
1067 		case HAMMER2_COMP_ZLIB:
1068 			comp_level = HAMMER2_DEC_LEVEL(comp_algo);
1069 			if (comp_level == 0)
1070 				comp_level = 6;	/* default zlib compression */
1071 			else if (comp_level < 6)
1072 				comp_level = 6;
1073 			else if (comp_level > 9)
1074 				comp_level = 9;
1075 			ret = deflateInit(&strm_compress, comp_level);
1076 			if (ret != Z_OK) {
1077 				kprintf("HAMMER2 ZLIB: fatal error "
1078 					"on deflateInit.\n");
1079 			}
1080 
1081 			comp_buffer = objcache_get(cache_buffer_write,
1082 						   M_INTWAIT);
1083 			strm_compress.next_in = bp->b_data;
1084 			strm_compress.avail_in = pblksize;
1085 			strm_compress.next_out = comp_buffer;
1086 			strm_compress.avail_out = pblksize / 2;
1087 			ret = deflate(&strm_compress, Z_FINISH);
1088 			if (ret == Z_STREAM_END) {
1089 				comp_size = pblksize / 2 -
1090 					    strm_compress.avail_out;
1091 			} else {
1092 				comp_size = 0;
1093 			}
1094 			ret = deflateEnd(&strm_compress);
1095 			break;
1096 		default:
1097 			kprintf("Error: Unknown compression method.\n");
1098 			kprintf("Comp_method = %d.\n", comp_algo);
1099 			break;
1100 		}
1101 	}
1102 
1103 	if (comp_size == 0) {
1104 		/*
1105 		 * compression failed or turned off
1106 		 */
1107 		comp_block_size = pblksize;	/* safety */
1108 		if (++ip->comp_heuristic > 128)
1109 			ip->comp_heuristic = 8;
1110 	} else {
1111 		/*
1112 		 * compression succeeded
1113 		 */
1114 		ip->comp_heuristic = 0;
1115 		if (comp_size <= 1024) {
1116 			comp_block_size = 1024;
1117 		} else if (comp_size <= 2048) {
1118 			comp_block_size = 2048;
1119 		} else if (comp_size <= 4096) {
1120 			comp_block_size = 4096;
1121 		} else if (comp_size <= 8192) {
1122 			comp_block_size = 8192;
1123 		} else if (comp_size <= 16384) {
1124 			comp_block_size = 16384;
1125 		} else if (comp_size <= 32768) {
1126 			comp_block_size = 32768;
1127 		} else {
1128 			panic("hammer2: WRITE PATH: "
1129 			      "Weird comp_size value.");
1130 			/* NOT REACHED */
1131 			comp_block_size = pblksize;
1132 		}
1133 	}
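	/*
	 * Example: a 65536-byte logical block that compresses to 3000 bytes
	 * is stored in a 4096-byte physical block; comp_block_size is always
	 * rounded up to the next power-of-2 bucket, with a 1024-byte minimum.
	 */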
1134 
1135 	cluster = hammer2_assign_physical(trans, ip, cparent,
1136 					  lbase, comp_block_size,
1137 					  errorp);
1138 	ipdata = &hammer2_cluster_data(cparent)->ipdata;
1139 
1140 	if (*errorp) {
1141 		kprintf("WRITE PATH: An error occurred while "
1142 			"assigning physical space.\n");
1143 		KKASSERT(cluster == NULL);
1144 		goto done;
1145 	}
1146 
1147 	for (i = 0; i < cluster->nchains; ++i) {
1148 		hammer2_io_t *dio;
1149 		char *bdata;
1150 		int temp_check;
1151 
1152 		chain = cluster->array[i];
1153 		KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED);
1154 
1155 		switch(chain->bref.type) {
1156 		case HAMMER2_BREF_TYPE_INODE:
1157 			KKASSERT(chain->data->ipdata.op_flags &
1158 				 HAMMER2_OPFLAG_DIRECTDATA);
1159 			KKASSERT(bp->b_loffset == 0);
1160 			bcopy(bp->b_data, chain->data->ipdata.u.data,
1161 			      HAMMER2_EMBEDDED_BYTES);
1162 			break;
1163 		case HAMMER2_BREF_TYPE_DATA:
1164 			temp_check = HAMMER2_DEC_CHECK(chain->bref.methods);
1165 
1166 			/*
1167 			 * Optimize out the read-before-write
1168 			 * if possible.
1169 			 */
1170 			*errorp = hammer2_io_newnz(chain->hmp,
1171 						   chain->bref.data_off,
1172 						   chain->bytes,
1173 						   &dio);
1174 			if (*errorp) {
1175 				hammer2_io_brelse(&dio);
1176 				kprintf("hammer2: WRITE PATH: "
1177 					"dbp bread error\n");
1178 				break;
1179 			}
1180 			bdata = hammer2_io_data(dio, chain->bref.data_off);
1181 
1182 			/*
1183 			 * When loading the block make sure we don't
1184 			 * leave garbage after the compressed data.
1185 			 */
1186 			if (comp_size) {
1187 				chain->bref.methods =
1188 					HAMMER2_ENC_COMP(comp_algo) +
1189 					HAMMER2_ENC_CHECK(temp_check);
1190 				bcopy(comp_buffer, bdata, comp_size);
1191 				if (comp_size != comp_block_size) {
1192 					bzero(bdata + comp_size,
1193 					      comp_block_size - comp_size);
1194 				}
1195 			} else {
1196 				chain->bref.methods =
1197 					HAMMER2_ENC_COMP(
1198 						HAMMER2_COMP_NONE) +
1199 					HAMMER2_ENC_CHECK(temp_check);
1200 				bcopy(bp->b_data, bdata, pblksize);
1201 			}
1202 
1203 			/*
1204 			 * Device buffer is now valid, chain is no
1205 			 * longer in the initial state.
1206 			 */
1207 			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
1208 
1209 			/* Now write the related bdp. */
1210 			if (ioflag & IO_SYNC) {
1211 				/*
1212 				 * Synchronous I/O requested.
1213 				 */
1214 				hammer2_io_bwrite(&dio);
1215 			/*
1216 			} else if ((ioflag & IO_DIRECT) &&
1217 				   loff + n == pblksize) {
1218 				hammer2_io_bdwrite(&dio);
1219 			*/
1220 			} else if (ioflag & IO_ASYNC) {
1221 				hammer2_io_bawrite(&dio);
1222 			} else {
1223 				hammer2_io_bdwrite(&dio);
1224 			}
1225 			break;
1226 		default:
1227 			panic("hammer2_write_bp: bad chain type %d\n",
1228 				chain->bref.type);
1229 			/* NOT REACHED */
1230 			break;
1231 		}
1232 
1233 		hammer2_chain_unlock(chain);
1234 	}
1235 done:
1236 	if (comp_buffer)
1237 		objcache_put(cache_buffer_write, comp_buffer);
1238 }
1239 
1240 /*
1241  * Function that performs zero-checking and writing without compression;
1242  * it corresponds to the default zero-checking path.
1243  */
1244 static
1245 void
1246 hammer2_zero_check_and_write(struct buf *bp, hammer2_trans_t *trans,
1247 	hammer2_inode_t *ip, hammer2_inode_data_t *ipdata,
1248 	hammer2_cluster_t *cparent,
1249 	hammer2_key_t lbase, int ioflag, int pblksize, int *errorp)
1250 {
1251 	hammer2_cluster_t *cluster;
1252 
1253 	if (test_block_zeros(bp->b_data, pblksize)) {
1254 		zero_write(bp, trans, ip, ipdata, cparent, lbase, errorp);
1255 	} else {
1256 		cluster = hammer2_assign_physical(trans, ip, cparent,
1257 						  lbase, pblksize, errorp);
1258 		hammer2_write_bp(cluster, bp, ioflag, pblksize, errorp);
1259 		if (cluster)
1260 			hammer2_cluster_unlock(cluster);
1261 	}
1262 }
1263 
1264 /*
1265  * Test whether a block of data contains only zeros.  Returns TRUE
1266  * (non-zero) if the block is all zeros.
1267  */
1268 static
1269 int
1270 test_block_zeros(const char *buf, size_t bytes)
1271 {
1272 	size_t i;
1273 
1274 	for (i = 0; i < bytes; i += sizeof(long)) {
1275 		if (*(const long *)(buf + i) != 0)
1276 			return (0);
1277 	}
1278 	return (1);
1279 }
1280 
1281 /*
1282  * Function to "write" a block that contains only zeros.
1283  */
1284 static
1285 void
1286 zero_write(struct buf *bp, hammer2_trans_t *trans, hammer2_inode_t *ip,
1287 	hammer2_inode_data_t *ipdata, hammer2_cluster_t *cparent,
1288 	hammer2_key_t lbase, int *errorp __unused)
1289 {
1290 	hammer2_cluster_t *cluster;
1291 	hammer2_media_data_t *data;
1292 	hammer2_key_t key_dummy;
1293 	int ddflag;
1294 
1295 	cparent = hammer2_cluster_lookup_init(cparent, 0);
1296 	cluster = hammer2_cluster_lookup(cparent, &key_dummy, lbase, lbase,
1297 				     HAMMER2_LOOKUP_NODATA, &ddflag);
1298 	if (cluster) {
1299 		data = hammer2_cluster_data(cluster);
1300 
1301 		if (ddflag) {
1302 			bzero(data->ipdata.u.data, HAMMER2_EMBEDDED_BYTES);
1303 		} else {
1304 			hammer2_cluster_delete(trans, cluster, 0);
1305 		}
1306 		hammer2_cluster_unlock(cluster);
1307 	}
1308 	hammer2_cluster_lookup_done(cparent);
1309 }
1310 
1311 /*
1312  * Function to write the data as-is, without performing any sort of
1313  * compression.  This function is used by the no-compression path and
1314  * by the default zero-checking path.
1315  */
1316 static
1317 void
1318 hammer2_write_bp(hammer2_cluster_t *cluster, struct buf *bp, int ioflag,
1319 				int pblksize, int *errorp)
1320 {
1321 	hammer2_chain_t *chain;
1322 	hammer2_io_t *dio;
1323 	char *bdata;
1324 	int error;
1325 	int i;
1326 	int temp_check;
1327 
1328 	error = 0;	/* XXX TODO below */
1329 
1330 	for (i = 0; i < cluster->nchains; ++i) {
1331 		chain = cluster->array[i];
1332 
1333 		temp_check = HAMMER2_DEC_CHECK(chain->bref.methods);
1334 
1335 		KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED);
1336 
1337 		switch(chain->bref.type) {
1338 		case HAMMER2_BREF_TYPE_INODE:
1339 			KKASSERT(chain->data->ipdata.op_flags &
1340 				 HAMMER2_OPFLAG_DIRECTDATA);
1341 			KKASSERT(bp->b_loffset == 0);
1342 			bcopy(bp->b_data, chain->data->ipdata.u.data,
1343 			      HAMMER2_EMBEDDED_BYTES);
1344 			error = 0;
1345 			break;
1346 		case HAMMER2_BREF_TYPE_DATA:
1347 			error = hammer2_io_newnz(chain->hmp,
1348 						 chain->bref.data_off,
1349 						 chain->bytes, &dio);
1350 			if (error) {
1351 				hammer2_io_bqrelse(&dio);
1352 				kprintf("hammer2: WRITE PATH: "
1353 					"dbp bread error\n");
1354 				break;
1355 			}
1356 			bdata = hammer2_io_data(dio, chain->bref.data_off);
1357 
1358 			chain->bref.methods = HAMMER2_ENC_COMP(
1359 							HAMMER2_COMP_NONE) +
1360 					      HAMMER2_ENC_CHECK(temp_check);
1361 			bcopy(bp->b_data, bdata, chain->bytes);
1362 
1363 			/*
1364 			 * Device buffer is now valid, chain is no
1365 			 * longer in the initial state.
1366 			 */
1367 			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
1368 
1369 			if (ioflag & IO_SYNC) {
1370 				/*
1371 				 * Synchronous I/O requested.
1372 				 */
1373 				hammer2_io_bwrite(&dio);
1374 			/*
1375 			} else if ((ioflag & IO_DIRECT) &&
1376 				   loff + n == pblksize) {
1377 				hammer2_io_bdwrite(&dio);
1378 			*/
1379 			} else if (ioflag & IO_ASYNC) {
1380 				hammer2_io_bawrite(&dio);
1381 			} else {
1382 				hammer2_io_bdwrite(&dio);
1383 			}
1384 			break;
1385 		default:
1386 			panic("hammer2_write_bp: bad chain type %d\n",
1387 			      chain->bref.type);
1388 			/* NOT REACHED */
1389 			error = 0;
1390 			break;
1391 		}
1392 		KKASSERT(error == 0);	/* XXX TODO */
1393 	}
1394 	*errorp = error;
1395 }
1396 
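/*
 * Remount support.  When upgrading a read-only mount to read-write
 * (MNTK_WANTRDWR) run the mount-time recovery pass; other remount
 * requests are currently no-ops.
 */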
1397 static
1398 int
1399 hammer2_remount(hammer2_mount_t *hmp, struct mount *mp, char *path,
1400 		struct vnode *devvp, struct ucred *cred)
1401 {
1402 	int error;
1403 
1404 	if (hmp->ronly && (mp->mnt_kern_flag & MNTK_WANTRDWR)) {
1405 		error = hammer2_recovery(hmp);
1406 	} else {
1407 		error = 0;
1408 	}
1409 	return error;
1410 }
1411 
1412 static
1413 int
1414 hammer2_vfs_unmount(struct mount *mp, int mntflags)
1415 {
1416 	hammer2_pfsmount_t *pmp;
1417 	hammer2_mount_t *hmp;
1418 	hammer2_chain_t *rchain;
1419 	int flags;
1420 	int error = 0;
1421 	int i;
1422 
1423 	pmp = MPTOPMP(mp);
1424 
1425 	if (pmp == NULL)
1426 		return(0);
1427 
1428 	lockmgr(&hammer2_mntlk, LK_EXCLUSIVE);
1429 
1430 	/*
1431 	 * If mount initialization proceeded far enough we must flush
1432 	 * its vnodes.
1433 	 */
1434 	if (mntflags & MNT_FORCE)
1435 		flags = FORCECLOSE;
1436 	else
1437 		flags = 0;
1438 	if (pmp->iroot) {
1439 		error = vflush(mp, 0, flags);
1440 		if (error)
1441 			goto failed;
1442 	}
1443 
1444 	ccms_domain_uninit(&pmp->ccms_dom);
1445 	kdmsg_iocom_uninit(&pmp->iocom);	/* XXX chain dependency */
1446 
1447 	if (pmp->wthread_td) {
1448 		mtx_lock(&pmp->wthread_mtx);
1449 		pmp->wthread_destroy = 1;
1450 		wakeup(&pmp->wthread_bioq);
1451 		while (pmp->wthread_destroy != -1) {
1452 			mtxsleep(&pmp->wthread_destroy,
1453 				&pmp->wthread_mtx, 0,
1454 				"umount-sleep",	0);
1455 		}
1456 		mtx_unlock(&pmp->wthread_mtx);
1457 		pmp->wthread_td = NULL;
1458 	}
1459 
1460 	/*
1461 	 * Clean up our reference on ihidden.
1462 	 */
1463 	if (pmp->ihidden) {
1464 		hammer2_inode_drop(pmp->ihidden);
1465 		pmp->ihidden = NULL;
1466 	}
1467 
1468 	/*
1469 	 * Clean up our reference on iroot.  iroot is not (and should not be)
1470 	 * needed by the flush code.
1471 	 */
1472 	if (pmp->iroot) {
1473 #if REPORT_REFS_ERRORS
1474 		if (pmp->iroot->refs != 1)
1475 			kprintf("PMP->IROOT %p REFS WRONG %d\n",
1476 				pmp->iroot, pmp->iroot->refs);
1477 #else
1478 		KKASSERT(pmp->iroot->refs == 1);
1479 #endif
1480 		/* ref for pmp->iroot */
1481 		hammer2_inode_drop(pmp->iroot);
1482 		pmp->iroot = NULL;
1483 	}
1484 
1485 	for (i = 0; i < pmp->cluster.nchains; ++i) {
1486 		hmp = pmp->cluster.array[i]->hmp;
1487 
1488 		hammer2_vfs_unmount_hmp1(mp, hmp);
1489 
1490 		rchain = pmp->cluster.array[i];
1491 		if (rchain) {
1492 			atomic_clear_int(&rchain->flags, HAMMER2_CHAIN_MOUNTED);
1493 #if REPORT_REFS_ERRORS
1494 			if (rchain->refs != 1)
1495 				kprintf("PMP->RCHAIN %p REFS WRONG %d\n",
1496 					rchain, rchain->refs);
1497 #else
1498 			KKASSERT(rchain->refs == 1);
1499 #endif
1500 			hammer2_chain_drop(rchain);
1501 			pmp->cluster.array[i] = NULL;
1502 		}
1503 
1504 		hammer2_vfs_unmount_hmp2(mp, hmp);
1505 	}
1506 
1507 	pmp->mp = NULL;
1508 	mp->mnt_data = NULL;
1509 
1510 	kmalloc_destroy(&pmp->mmsg);
1511 	kmalloc_destroy(&pmp->minode);
1512 
1513 	kfree(pmp, M_HAMMER2);
1514 	error = 0;
1515 
1516 failed:
1517 	lockmgr(&hammer2_mntlk, LK_RELEASE);
1518 
1519 	return (error);
1520 }
1521 
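/*
 * Per-hmp unmount, stage 1: acquire the hmp exclusive lock (released by
 * stage 2), drop the PFS count and flush any remaining modified chains
 * so the freemap winds up fully synchronized on-media.
 */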
1522 static
1523 void
1524 hammer2_vfs_unmount_hmp1(struct mount *mp, hammer2_mount_t *hmp)
1525 {
1526 	hammer2_mount_exlock(hmp);
1527 	--hmp->pmp_count;
1528 
1529 	kprintf("hammer2_unmount hmp=%p pmpcnt=%d\n", hmp, hmp->pmp_count);
1530 
1531 	/*
1532 	 * Flush any left over chains.  The voldata lock is only used
1533 	 * to synchronize against HAMMER2_CHAIN_MODIFIED_AUX.
1534 	 *
1535 	 * Flush twice to ensure that the freemap is completely
1536 	 * synchronized.  If we only do it once the next mount's
1537 	 * recovery scan will have to do some fixups (which isn't
1538 	 * bad, but we don't want it to have to do it except when
1539 	 * recovering from a crash).
1540 	 */
1541 	hammer2_voldata_lock(hmp);
1542 	if (((hmp->vchain.flags | hmp->fchain.flags) &
1543 	     HAMMER2_CHAIN_MODIFIED) ||
1544 	    hmp->vchain.update_hi > hmp->voldata.mirror_tid ||
1545 	    hmp->fchain.update_hi > hmp->voldata.freemap_tid) {
1546 		hammer2_voldata_unlock(hmp, 0);
1547 		hammer2_vfs_sync(mp, MNT_WAIT);
1548 		/*hammer2_vfs_sync(mp, MNT_WAIT);*/
1549 	} else {
1550 		hammer2_voldata_unlock(hmp, 0);
1551 	}
1552 	if (hmp->pmp_count == 0) {
1553 		if (((hmp->vchain.flags | hmp->fchain.flags) &
1554 		     HAMMER2_CHAIN_MODIFIED) ||
1555 		    (hmp->vchain.update_hi >
1556 		     hmp->voldata.mirror_tid) ||
1557 		    (hmp->fchain.update_hi >
1558 		     hmp->voldata.freemap_tid)) {
1559 			kprintf("hammer2_unmount: chains left over "
1560 				"after final sync\n");
1561 			kprintf("    vchain %08x update_hi %jx/%jx\n",
1562 				hmp->vchain.flags,
1563 				hmp->voldata.mirror_tid,
1564 				hmp->vchain.update_hi);
1565 			kprintf("    fchain %08x update_hi %jx/%jx\n",
1566 				hmp->fchain.flags,
1567 				hmp->voldata.freemap_tid,
1568 				hmp->fchain.update_hi);
1569 
1570 			if (hammer2_debug & 0x0010)
1571 				Debugger("entered debugger");
1572 		}
1573 	}
1574 }
1575 
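/*
 * Per-hmp unmount, stage 2: if this was the last PFS using the device,
 * close the device vnode and tear down the hammer2_mount; in all cases
 * release the mount lock acquired by stage 1.
 */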
1576 static
1577 void
1578 hammer2_vfs_unmount_hmp2(struct mount *mp, hammer2_mount_t *hmp)
1579 {
1580 	struct vnode *devvp;
1581 	int dumpcnt;
1582 	int ronly = ((mp->mnt_flag & MNT_RDONLY) != 0);
1583 
1584 	/*
1585 	 * If no PFSs are left, drop the master hammer2_mount for the
1586 	 * device.
1587 	 */
1588 	if (hmp->pmp_count == 0) {
1589 		if (hmp->sroot) {
1590 			hammer2_inode_drop(hmp->sroot);
1591 			hmp->sroot = NULL;
1592 		}
1593 
1594 		/*
1595 		 * Finish up with the device vnode
1596 		 */
1597 		if ((devvp = hmp->devvp) != NULL) {
1598 			vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
1599 			vinvalbuf(devvp, (ronly ? 0 : V_SAVE), 0, 0);
1600 			hmp->devvp = NULL;
1601 			VOP_CLOSE(devvp, (ronly ? FREAD : FREAD|FWRITE), NULL);
1602 			vn_unlock(devvp);
1603 			vrele(devvp);
1604 			devvp = NULL;
1605 		}
1606 
1607 		/*
1608 		 * Final drop of embedded freemap root chain to
1609 		 * clean up fchain.core (fchain structure is not
1610 		 * flagged ALLOCATED so it is cleaned out and then
1611 		 * left to rot).
1612 		 */
1613 		hammer2_chain_drop(&hmp->fchain);
1614 
1615 		/*
1616 		 * Final drop of embedded volume root chain to clean
1617 		 * up vchain.core (vchain structure is not flagged
1618 		 * ALLOCATED so it is cleaned out and then left to
1619 		 * rot).
1620 		 */
1621 		dumpcnt = 50;
1622 		hammer2_dump_chain(&hmp->vchain, 0, &dumpcnt, 'v');
1623 		dumpcnt = 50;
1624 		hammer2_dump_chain(&hmp->fchain, 0, &dumpcnt, 'f');
1625 		hammer2_mount_unlock(hmp);
1626 		hammer2_chain_drop(&hmp->vchain);
1627 
1628 		hammer2_io_cleanup(hmp, &hmp->iotree);
1629 		if (hmp->iofree_count) {
1630 			kprintf("io_cleanup: %d I/O's left hanging\n",
1631 				hmp->iofree_count);
1632 		}
1633 
1634 		TAILQ_REMOVE(&hammer2_mntlist, hmp, mntentry);
1635 		kmalloc_destroy(&hmp->mchain);
1636 		kfree(hmp, M_HAMMER2);
1637 	} else {
1638 		hammer2_mount_unlock(hmp);
1639 	}
1640 }
1641 
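/*
 * Direct inode-number lookups are not currently supported.
 */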
1642 static
1643 int
1644 hammer2_vfs_vget(struct mount *mp, struct vnode *dvp,
1645 	     ino_t ino, struct vnode **vpp)
1646 {
1647 	kprintf("hammer2_vget\n");
1648 	return (EOPNOTSUPP);
1649 }
1650 
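/*
 * Return the vnode for the PFS root inode (pmp->iroot) via
 * hammer2_igetv().
 */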
1651 static
1652 int
1653 hammer2_vfs_root(struct mount *mp, struct vnode **vpp)
1654 {
1655 	hammer2_pfsmount_t *pmp;
1656 	hammer2_cluster_t *cparent;
1657 	int error;
1658 	struct vnode *vp;
1659 
1660 	pmp = MPTOPMP(mp);
1661 	if (pmp->iroot == NULL) {
1662 		*vpp = NULL;
1663 		error = EINVAL;
1664 	} else {
1665 		cparent = hammer2_inode_lock_sh(pmp->iroot);
1666 		vp = hammer2_igetv(pmp->iroot, cparent, &error);
1667 		hammer2_inode_unlock_sh(pmp->iroot, cparent);
1668 		*vpp = vp;
1669 		if (vp == NULL)
1670 			kprintf("vnodefail\n");
1671 	}
1672 
1673 	return (error);
1674 }
1675 
1676 /*
1677  * Filesystem status
1678  *
1679  * XXX incorporate ipdata->inode_quota and data_quota
1680  */
1681 static
1682 int
1683 hammer2_vfs_statfs(struct mount *mp, struct statfs *sbp, struct ucred *cred)
1684 {
1685 	hammer2_pfsmount_t *pmp;
1686 	hammer2_mount_t *hmp;
1687 
1688 	pmp = MPTOPMP(mp);
1689 	KKASSERT(pmp->cluster.nchains >= 1);
1690 	hmp = pmp->cluster.focus->hmp;	/* XXX */
1691 
1692 	mp->mnt_stat.f_files = pmp->inode_count;
1693 	mp->mnt_stat.f_ffree = 0;
1694 	mp->mnt_stat.f_blocks = hmp->voldata.allocator_size / HAMMER2_PBUFSIZE;
1695 	mp->mnt_stat.f_bfree =  hmp->voldata.allocator_free / HAMMER2_PBUFSIZE;
1696 	mp->mnt_stat.f_bavail = mp->mnt_stat.f_bfree;
1697 
1698 	*sbp = mp->mnt_stat;
1699 	return (0);
1700 }
1701 
1702 static
1703 int
1704 hammer2_vfs_statvfs(struct mount *mp, struct statvfs *sbp, struct ucred *cred)
1705 {
1706 	hammer2_pfsmount_t *pmp;
1707 	hammer2_mount_t *hmp;
1708 
1709 	pmp = MPTOPMP(mp);
1710 	KKASSERT(pmp->cluster.nchains >= 1);
1711 	hmp = pmp->cluster.focus->hmp;	/* XXX */
1712 
1713 	mp->mnt_vstat.f_bsize = HAMMER2_PBUFSIZE;
1714 	mp->mnt_vstat.f_files = pmp->inode_count;
1715 	mp->mnt_vstat.f_ffree = 0;
1716 	mp->mnt_vstat.f_blocks = hmp->voldata.allocator_size / HAMMER2_PBUFSIZE;
1717 	mp->mnt_vstat.f_bfree =  hmp->voldata.allocator_free / HAMMER2_PBUFSIZE;
1718 	mp->mnt_vstat.f_bavail = mp->mnt_vstat.f_bfree;
1719 
1720 	*sbp = mp->mnt_vstat;
1721 	return (0);
1722 }
1723 
1724 /*
1725  * Mount-time recovery (RW mounts)
1726  *
1727  * Updates to the free block table are allowed to lag flushes by one
1728  * transaction.  After a crash, on a fresh mount we must do an
1729  * incremental scan of transaction id voldata.mirror_tid and make sure the
1730  * related blocks have been marked allocated.
1731  *
1732  */
1733 struct hammer2_recovery_elm {
1734 	TAILQ_ENTRY(hammer2_recovery_elm) entry;
1735 	hammer2_chain_t *chain;
1736 };
1737 
1738 TAILQ_HEAD(hammer2_recovery_list, hammer2_recovery_elm);
1739 
1740 static int hammer2_recovery_scan(hammer2_trans_t *trans, hammer2_mount_t *hmp,
1741 			hammer2_chain_t *parent,
1742 			struct hammer2_recovery_list *list, int depth);
1743 
1744 #define HAMMER2_RECOVERY_MAXDEPTH	10
1745 
1746 static
1747 int
1748 hammer2_recovery(hammer2_mount_t *hmp)
1749 {
1750 	hammer2_trans_t trans;
1751 	struct hammer2_recovery_list list;
1752 	struct hammer2_recovery_elm *elm;
1753 	hammer2_chain_t *parent;
1754 	int error;
1755 	int cumulative_error = 0;
1756 
1757 	hammer2_trans_init(&trans, NULL, hmp, 0);
1758 
1759 	TAILQ_INIT(&list);
1760 	parent = hammer2_chain_lookup_init(&hmp->vchain, 0);
1761 	cumulative_error = hammer2_recovery_scan(&trans, hmp, parent, &list, 0);
1762 	hammer2_chain_lookup_done(parent);
1763 
1764 	while ((elm = TAILQ_FIRST(&list)) != NULL) {
1765 		TAILQ_REMOVE(&list, elm, entry);
1766 		parent = elm->chain;
1767 		kfree(elm, M_HAMMER2);
1768 
1769 		hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS |
1770 					   HAMMER2_RESOLVE_NOREF);
1771 		error = hammer2_recovery_scan(&trans, hmp, parent, &list, 0);
1772 		hammer2_chain_unlock(parent);
1773 		if (error)
1774 			cumulative_error = error;
1775 	}
1776 	hammer2_trans_done(&trans);
1777 
1778 	return cumulative_error;
1779 }
1780 
1781 static
1782 int
1783 hammer2_recovery_scan(hammer2_trans_t *trans, hammer2_mount_t *hmp,
1784 		      hammer2_chain_t *parent,
1785 		      struct hammer2_recovery_list *list, int depth)
1786 {
1787 	hammer2_chain_t *chain;
1788 	int cache_index;
1789 	int cumulative_error = 0;
1790 	int error;
1791 
1792 	/*
1793 	 * Defer operation if depth limit reached.
1794 	 */
1795 	if (depth >= HAMMER2_RECOVERY_MAXDEPTH) {
1796 		struct hammer2_recovery_elm *elm;
1797 
1798 		elm = kmalloc(sizeof(*elm), M_HAMMER2, M_ZERO | M_WAITOK);
1799 		elm->chain = parent;
1800 		hammer2_chain_ref(parent);
1801 		TAILQ_INSERT_TAIL(list, elm, entry);
1802 		/* unlocked by caller */
1803 
1804 		return(0);
1805 	}
1806 
1807 	/*
1808 	 * Adjust freemap to ensure that the block(s) are marked allocated.
1809 	 */
1810 	if (parent->bref.type != HAMMER2_BREF_TYPE_VOLUME) {
1811 		hammer2_freemap_adjust(trans, hmp, &parent->bref,
1812 				       HAMMER2_FREEMAP_DORECOVER);
1813 	}
1814 
1815 	/*
1816 	 * Check type for recursive scan
1817 	 */
1818 	switch(parent->bref.type) {
1819 	case HAMMER2_BREF_TYPE_VOLUME:
1820 		/* data already instantiated */
1821 		break;
1822 	case HAMMER2_BREF_TYPE_INODE:
1823 		/*
1824 		 * Must instantiate data for DIRECTDATA test and also
1825 		 * for recursion.
1826 		 */
1827 		hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
1828 		hammer2_chain_unlock(parent);
1829 		if (parent->data->ipdata.op_flags & HAMMER2_OPFLAG_DIRECTDATA) {
1830 			/* not applicable to recovery scan */
1831 			return 0;
1832 		}
1833 		break;
1834 	case HAMMER2_BREF_TYPE_INDIRECT:
1835 		/*
1836 		 * Must instantiate data for recursion
1837 		 */
1838 		hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
1839 		hammer2_chain_unlock(parent);
1840 		break;
1841 	case HAMMER2_BREF_TYPE_DATA:
1842 	case HAMMER2_BREF_TYPE_FREEMAP:
1843 	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1844 	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
1845 		/* not applicable to recovery scan */
1846 		return 0;
1847 		break;
1848 	default:
1849 		return EDOM;
1850 	}
1851 
1852 	/*
1853 	 * Recursive scan of the last flushed transaction only.  We are
1854 	 * doing this without pmp assignments so don't leave the chains
1855 	 * hanging around after we are done with them.
1856 	 */
1857 	cache_index = 0;
1858 	chain = hammer2_chain_scan(parent, NULL, &cache_index,
1859 				   HAMMER2_LOOKUP_NODATA);
1860 	while (chain) {
1861 		atomic_set_int(&chain->flags, HAMMER2_CHAIN_RELEASE);
1862 		if (chain->bref.mirror_tid >= hmp->voldata.alloc_tid - 1) {
1863 			error = hammer2_recovery_scan(trans, hmp, chain,
1864 						      list, depth + 1);
1865 			if (error)
1866 				cumulative_error = error;
1867 		}
1868 		chain = hammer2_chain_scan(parent, chain, &cache_index,
1869 					   HAMMER2_LOOKUP_NODATA);
1870 	}
1871 
1872 	return cumulative_error;
1873 }
1874 
1875 /*
1876  * Sync the entire filesystem; this is called from the filesystem syncer
1877  * process periodically and whenever a user calls sync(1) on the hammer2
1878  * mountpoint.
1879  *
1880  * Currently this is actually called from the syncer! \o/
1881  *
1882  * This task will have to snapshot the state of the dirty inode chain.
1883  * From that, it will have to make sure all of the inodes on the dirty
1884  * chain have I/O initiated.  We make sure that I/O is initiated for the
1885  * root block.
1886  *
1887  * If waitfor is set, we wait for media to acknowledge the new rootblock.
1888  *
1889  * THINKS: side A vs side B, to have sync not stall all I/O?
1890  */
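/*
 * Rough ordering of the sequence below:
 *
 *	1) Start a flush transaction (ISFLUSH | PREFLUSH) and run the
 *	   unlink queue.
 *	2) vsyncscan() the mount twice, first with MNT_NOWAIT and then
 *	   with MNT_WAIT, to push file data into the buffer/device layer.
 *	3) Wait for the logical buffer queue to drain, then clear PREFLUSH.
 *	4) For each cluster node, flush fchain (freemap) and vchain
 *	   (topology), fsync the backing device, and, if VOLUMESYNC is
 *	   set, write out the next volume header in the rotation.
 */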
1891 int
1892 hammer2_vfs_sync(struct mount *mp, int waitfor)
1893 {
1894 	struct hammer2_sync_info info;
1895 	hammer2_chain_t *chain;
1896 	hammer2_pfsmount_t *pmp;
1897 	hammer2_mount_t *hmp;
1898 	int flags;
1899 	int error;
1900 	int total_error;
1901 	int force_fchain;
1902 	int i;
1903 
1904 	pmp = MPTOPMP(mp);
1905 
1906 	/*
1907 	 * We can't acquire locks on existing vnodes while in a transaction
1908 	 * without risking a deadlock.  This assumes that vfsync() can be
1909 	 * called without the vnode locked (which it can in DragonFly).
1910 	 * Otherwise we'd have to implement a multi-pass or flag the lock
1911 	 * failures and retry.
1912 	 *
1913 	 * The reclamation code interlocks with the sync list's token
1914 	 * (by removing the vnode from the scan list) before unlocking
1915 	 * the inode, giving us time to ref the inode.
1916 	 */
1917 	/*flags = VMSC_GETVP;*/
1918 	flags = 0;
1919 	if (waitfor & MNT_LAZY)
1920 		flags |= VMSC_ONEPASS;
1921 
1922 	/*
1923 	 * Start our flush transaction.  This does not return until all
1924 	 * concurrent transactions have completed and will prevent any
1925 	 * new transactions from running concurrently, except for the
1926 	 * buffer cache transactions.
1927 	 *
1928 	 * For efficiency do an async pass before making sure with a
1929 	 * synchronous pass on all related buffer cache buffers.  It
1930 	 * should theoretically not be possible for any new file buffers
1931 	 * to be instantiated during this sequence.
1932 	 */
1933 	hammer2_trans_init(&info.trans, pmp, NULL, HAMMER2_TRANS_ISFLUSH |
1934 						   HAMMER2_TRANS_PREFLUSH);
1935 	hammer2_run_unlinkq(&info.trans, pmp);
1936 	info.error = 0;
1937 	info.waitfor = MNT_NOWAIT;
1938 	vsyncscan(mp, flags | VMSC_NOWAIT, hammer2_sync_scan2, &info);
1939 	info.waitfor = MNT_WAIT;
1940 	vsyncscan(mp, flags, hammer2_sync_scan2, &info);
1941 
1942 	/*
1943 	 * Clear PREFLUSH.  This prevents (or asserts on) any new logical
1944 	 * buffer cache flushes which occur during the flush.  Device buffers
1945 	 * are not affected.
1946 	 */
1947 
1948 #if 0
1949 	if (info.error == 0 && (waitfor & MNT_WAIT)) {
1950 		info.waitfor = waitfor;
1951 		vsyncscan(mp, flags, hammer2_sync_scan2, &info);
1952 
1953 	}
1954 #endif
1955 	hammer2_bioq_sync(info.trans.pmp);
1956 	atomic_clear_int(&info.trans.flags, HAMMER2_TRANS_PREFLUSH);
1957 
1958 #if 0
1959 	/*
1960 	 * Start the flush transaction and flush all meta-data.
1961 	 */
1962 	hammer2_trans_init(&info.trans, pmp, NULL, HAMMER2_TRANS_ISFLUSH);
1963 #endif
1964 
1965 	total_error = 0;
1966 	for (i = 0; i < pmp->cluster.nchains; ++i) {
1967 		hmp = pmp->cluster.array[i]->hmp;
1968 
1969 		/*
1970 		 * Media mounts have two 'roots', vchain for the topology
1971 		 * and fchain for the free block table.  Flush both.
1972 		 *
1973 		 * Note that the topology and free block table are handled
1974 		 * independently, so the free block table can wind up being
1975 		 * ahead of the topology.  We depend on the bulk free scan
1976 		 * code to deal with any loose ends.
1977 		 */
1978 #if 1
1979 		hammer2_chain_lock(&hmp->fchain, HAMMER2_RESOLVE_ALWAYS);
1980 		kprintf("sync tid test fmap %016jx %016jx\n",
1981 			hmp->fchain.update_hi, hmp->voldata.freemap_tid);
1982 		if ((hmp->fchain.flags & HAMMER2_CHAIN_MODIFIED) ||
1983 		    hmp->fchain.update_hi > hmp->voldata.freemap_tid) {
1984 			/* this will also modify vchain as a side effect */
1985 			chain = &hmp->fchain;
1986 			hammer2_flush(&info.trans, &chain);
1987 			KKASSERT(chain == &hmp->fchain);
1988 		}
1989 		hammer2_chain_unlock(&hmp->fchain);
1990 #endif
1991 
1992 		hammer2_chain_lock(&hmp->vchain, HAMMER2_RESOLVE_ALWAYS);
1993 		kprintf("sync tid test vmap %016jx %016jx\n",
1994 			hmp->vchain.update_hi, hmp->voldata.mirror_tid);
1995 		if ((hmp->vchain.flags & HAMMER2_CHAIN_MODIFIED) ||
1996 		    hmp->vchain.update_hi > hmp->voldata.mirror_tid) {
1997 			chain = &hmp->vchain;
1998 			hammer2_flush(&info.trans, &chain);
1999 			KKASSERT(chain == &hmp->vchain);
2000 			force_fchain = 1;
2001 		} else {
2002 			force_fchain = 0;
2003 		}
2004 		hammer2_chain_unlock(&hmp->vchain);
2005 
2006 #if 0
2007 		hammer2_chain_lock(&hmp->fchain, HAMMER2_RESOLVE_ALWAYS);
2008 		if ((hmp->fchain.flags & HAMMER2_CHAIN_MODIFIED) ||
2009 		    hmp->fchain.update_hi > hmp->voldata.freemap_tid ||
2010 		    force_fchain) {
2011 			/* this will also modify vchain as a side effect */
2012 			chain = &hmp->fchain;
2013 			hammer2_flush(&info.trans, &chain);
2014 			KKASSERT(chain == &hmp->fchain);
2015 		}
2016 		hammer2_chain_unlock(&hmp->fchain);
2017 #endif
2018 
2019 		error = 0;
2020 
2021 		/*
2022 		 * We can't safely flush the volume header until we have
2023 		 * flushed any device buffers which have built up.
2024 		 *
2025 		 * XXX this isn't being incremental
2026 		 */
2027 		vn_lock(hmp->devvp, LK_EXCLUSIVE | LK_RETRY);
2028 		error = VOP_FSYNC(hmp->devvp, MNT_WAIT, 0);
2029 		vn_unlock(hmp->devvp);
2030 
2031 		/*
2032 		 * The flush code sets CHAIN_VOLUMESYNC to indicate that the
2033 		 * volume header needs synchronization via hmp->volsync.
2034 		 *
2035 		 * XXX synchronize the flag & data with only this flush XXX
2036 		 */
2037 		if (error == 0 &&
2038 		    (hmp->vchain.flags & HAMMER2_CHAIN_VOLUMESYNC)) {
2039 			struct buf *bp;
2040 
2041 			/*
2042 			 * Synchronize the disk before flushing the volume
2043 			 * header.
2044 			 */
2045 			bp = getpbuf(NULL);
2046 			bp->b_bio1.bio_offset = 0;
2047 			bp->b_bufsize = 0;
2048 			bp->b_bcount = 0;
2049 			bp->b_cmd = BUF_CMD_FLUSH;
2050 			bp->b_bio1.bio_done = biodone_sync;
2051 			bp->b_bio1.bio_flags |= BIO_SYNC;
2052 			vn_strategy(hmp->devvp, &bp->b_bio1);
2053 			biowait(&bp->b_bio1, "h2vol");
2054 			relpbuf(bp, NULL);
2055 
2056 			/*
2057 			 * Then we can safely flush the version of the
2058 			 * volume header synchronized by the flush code.
2059 			 */
2060 			i = hmp->volhdrno + 1;
2061 			if (i >= HAMMER2_NUM_VOLHDRS)
2062 				i = 0;
2063 			if (i * HAMMER2_ZONE_BYTES64 + HAMMER2_SEGSIZE >
2064 			    hmp->volsync.volu_size) {
2065 				i = 0;
2066 			}
2067 			kprintf("sync volhdr %d %jd\n",
2068 				i, (intmax_t)hmp->volsync.volu_size);
2069 			bp = getblk(hmp->devvp, i * HAMMER2_ZONE_BYTES64,
2070 				    HAMMER2_PBUFSIZE, 0, 0);
2071 			atomic_clear_int(&hmp->vchain.flags,
2072 					 HAMMER2_CHAIN_VOLUMESYNC);
2073 			bcopy(&hmp->volsync, bp->b_data, HAMMER2_PBUFSIZE);
2074 			bawrite(bp);
2075 			hmp->volhdrno = i;
2076 		}
2077 		if (error)
2078 			total_error = error;
2079 	}
2080 	hammer2_trans_done(&info.trans);
2081 
2082 	return (total_error);
2083 }
2084 
2085 /*
2086  * Sync passes: per-vnode scan callback invoked via vsyncscan().
2087  */
2088 static int
2089 hammer2_sync_scan2(struct mount *mp, struct vnode *vp, void *data)
2090 {
2091 	struct hammer2_sync_info *info = data;
2092 	hammer2_inode_t *ip;
2093 	int error;
2094 
2095 	/*
2096 	 * Ignore vnodes with no inode, dying vnodes, and clean vnodes.
2097 	 */
2098 	ip = VTOI(vp);
2099 	if (ip == NULL)
2100 		return(0);
2101 	if (vp->v_type == VNON || vp->v_type == VBAD) {
2102 		vclrisdirty(vp);
2103 		return(0);
2104 	}
2105 	if ((ip->flags & HAMMER2_INODE_MODIFIED) == 0 &&
2106 	    RB_EMPTY(&vp->v_rbdirty_tree)) {
2107 		vclrisdirty(vp);
2108 		return(0);
2109 	}
2110 
2111 	/*
2112 	 * VOP_FSYNC will start a new transaction so replicate some code
2113 	 * here to do it inline (see hammer2_vop_fsync()).
2114 	 *
2115 	 * WARNING: The vfsync interacts with the buffer cache and might
2116 	 *          block, we can't hold the inode lock at that time.
2117 	 *	    However, we MUST ref ip before blocking to ensure that
2118 	 *	    it isn't ripped out from under us (since we do not
2119 	 *	    hold a lock on the vnode).
2120 	 */
2121 	hammer2_inode_ref(ip);
2122 	atomic_clear_int(&ip->flags, HAMMER2_INODE_MODIFIED);
2123 	if (vp)
2124 		vfsync(vp, MNT_NOWAIT, 1, NULL, NULL);
2125 
2126 	hammer2_inode_drop(ip);
2127 #if 1
2128 	error = 0;
2129 	if (error)
2130 		info->error = error;
2131 #endif
2132 	return(0);
2133 }
2134 
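/*
 * File handle and export support (used for things like NFS exporting).
 * These are currently placeholder stubs which simply return 0; real
 * file handle generation and export checking are not implemented yet.
 */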
2135 static
2136 int
2137 hammer2_vfs_vptofh(struct vnode *vp, struct fid *fhp)
2138 {
2139 	return (0);
2140 }
2141 
2142 static
2143 int
2144 hammer2_vfs_fhtovp(struct mount *mp, struct vnode *rootvp,
2145 	       struct fid *fhp, struct vnode **vpp)
2146 {
2147 	return (0);
2148 }
2149 
2150 static
2151 int
2152 hammer2_vfs_checkexp(struct mount *mp, struct sockaddr *nam,
2153 		 int *exflagsp, struct ucred **credanonp)
2154 {
2155 	return (0);
2156 }
2157 
2158 /*
2159  * Support code for hammer2_mount().  Read, verify, and install the volume
2160  * header into the HMP.
2161  *
2162  * XXX read four volhdrs and use the one with the highest TID whose CRC
2163  *     matches.
2164  *
2165  * XXX check iCRCs.
2166  *
2167  * XXX For filesystems with fewer than 4 volhdrs, make sure not to write
2168  *     to nonexistent locations.
2169  *
2170  * XXX Record selected volhdr and ring updates to each of the 4 volhdrs.
2171  */
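/*
 * Layout note, derived from the loop below: volume header copy #i lives
 * at byte offset i * HAMMER2_ZONE_BYTES64 and is HAMMER2_VOLUME_BYTES
 * long.  A copy is accepted only if its magic is HAMMER2_VOLUME_ID_HBO
 * and both ICRC_SECT0 and ICRC_SECT1 check out; among the accepted
 * copies the one with the highest mirror_tid wins.
 */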
2172 static
2173 int
2174 hammer2_install_volume_header(hammer2_mount_t *hmp)
2175 {
2176 	hammer2_volume_data_t *vd;
2177 	struct buf *bp;
2178 	hammer2_crc32_t crc0, crc, bcrc0, bcrc;
2179 	int error_reported;
2180 	int error;
2181 	int valid;
2182 	int i;
2183 
2184 	error_reported = 0;
2185 	error = 0;
2186 	valid = 0;
2187 	bp = NULL;
2188 
2189 	/*
2190 	 * There are up to 4 copies of the volume header (syncs iterate
2191 	 * between them so there is no single master).  We don't trust the
2192 	 * volu_size field, so we don't know precisely how large the filesystem
2193 	 * is; we depend on the OS to return an error if we go beyond the
2194 	 * block device's EOF.
2195 	 */
2196 	for (i = 0; i < HAMMER2_NUM_VOLHDRS; i++) {
2197 		error = bread(hmp->devvp, i * HAMMER2_ZONE_BYTES64,
2198 			      HAMMER2_VOLUME_BYTES, &bp);
2199 		if (error) {
2200 			brelse(bp);
2201 			bp = NULL;
2202 			continue;
2203 		}
2204 
2205 		vd = (struct hammer2_volume_data *) bp->b_data;
2206 		if ((vd->magic != HAMMER2_VOLUME_ID_HBO) &&
2207 		    (vd->magic != HAMMER2_VOLUME_ID_ABO)) {
2208 			brelse(bp);
2209 			bp = NULL;
2210 			continue;
2211 		}
2212 
2213 		if (vd->magic == HAMMER2_VOLUME_ID_ABO) {
2214 			/* XXX: Reversed-endianness filesystem */
2215 			kprintf("hammer2: reverse-endian filesystem detected\n");
2216 			brelse(bp);
2217 			bp = NULL;
2218 			continue;
2219 		}
2220 
2221 		crc = vd->icrc_sects[HAMMER2_VOL_ICRC_SECT0];
2222 		crc0 = hammer2_icrc32(bp->b_data + HAMMER2_VOLUME_ICRC0_OFF,
2223 				      HAMMER2_VOLUME_ICRC0_SIZE);
2224 		bcrc = vd->icrc_sects[HAMMER2_VOL_ICRC_SECT1];
2225 		bcrc0 = hammer2_icrc32(bp->b_data + HAMMER2_VOLUME_ICRC1_OFF,
2226 				       HAMMER2_VOLUME_ICRC1_SIZE);
2227 		if ((crc0 != crc) || (bcrc0 != bcrc)) {
2228 			kprintf("hammer2 volume header crc "
2229 				"mismatch copy #%d %08x/%08x\n",
2230 				i, crc0, crc);
2231 			error_reported = 1;
2232 			brelse(bp);
2233 			bp = NULL;
2234 			continue;
2235 		}
2236 		if (valid == 0 || hmp->voldata.mirror_tid < vd->mirror_tid) {
2237 			valid = 1;
2238 			hmp->voldata = *vd;
2239 			hmp->volhdrno = i;
2240 		}
2241 		brelse(bp);
2242 		bp = NULL;
2243 	}
2244 	if (valid) {
2245 		hmp->volsync = hmp->voldata;
2246 		error = 0;
2247 		if (error_reported || bootverbose || 1) { /* 1/DEBUG */
2248 			kprintf("hammer2: using volume header #%d\n",
2249 				hmp->volhdrno);
2250 		}
2251 	} else {
2252 		error = EINVAL;
2253 		kprintf("hammer2: no valid volume headers found!\n");
2254 	}
2255 	return (error);
2256 }
2257 
2258 /*
2259  * Reconnect using the passed file pointer.  The caller must ref the
2260  * fp for us.
2261  */
2262 void
2263 hammer2_cluster_reconnect(hammer2_pfsmount_t *pmp, struct file *fp)
2264 {
2265 	hammer2_inode_data_t *ipdata;
2266 	hammer2_cluster_t *cparent;
2267 	hammer2_mount_t *hmp;
2268 	size_t name_len;
2269 
2270 	hmp = pmp->cluster.focus->hmp;	/* XXX */
2271 
2272 	/*
2273 	 * Closes old comm descriptor, kills threads, cleans up
2274 	 * states, then installs the new descriptor and creates
2275 	 * new threads.
2276 	 */
2277 	kdmsg_iocom_reconnect(&pmp->iocom, fp, "hammer2");
2278 
2279 	/*
2280 	 * Setup LNK_CONN fields for autoinitiated state machine
2281 	 */
2282 	cparent = hammer2_inode_lock_ex(pmp->iroot);
2283 	ipdata = &hammer2_cluster_data(cparent)->ipdata;
2284 	pmp->iocom.auto_lnk_conn.pfs_clid = ipdata->pfs_clid;
2285 	pmp->iocom.auto_lnk_conn.pfs_fsid = ipdata->pfs_fsid;
2286 	pmp->iocom.auto_lnk_conn.pfs_type = ipdata->pfs_type;
2287 	pmp->iocom.auto_lnk_conn.proto_version = DMSG_SPAN_PROTO_1;
2288 	pmp->iocom.auto_lnk_conn.peer_type = hmp->voldata.peer_type;
2289 
2290 	/*
2291 	 * Filter adjustment.  Clients do not need visibility into other
2292 	 * clients (otherwise millions of clients would present a serious
2293 	 * problem).  The fs_label also serves to restrict the namespace.
2294 	 */
2295 	pmp->iocom.auto_lnk_conn.peer_mask = 1LLU << HAMMER2_PEER_HAMMER2;
2296 	pmp->iocom.auto_lnk_conn.pfs_mask = (uint64_t)-1;
2297 	switch (ipdata->pfs_type) {
2298 	case DMSG_PFSTYPE_CLIENT:
2299 		pmp->iocom.auto_lnk_conn.peer_mask &=
2300 				~(1LLU << DMSG_PFSTYPE_CLIENT);
2301 		break;
2302 	default:
2303 		break;
2304 	}
2305 
2306 	name_len = ipdata->name_len;
2307 	if (name_len >= sizeof(pmp->iocom.auto_lnk_conn.fs_label))
2308 		name_len = sizeof(pmp->iocom.auto_lnk_conn.fs_label) - 1;
2309 	bcopy(ipdata->filename,
2310 	      pmp->iocom.auto_lnk_conn.fs_label,
2311 	      name_len);
2312 	pmp->iocom.auto_lnk_conn.fs_label[name_len] = 0;
2313 
2314 	/*
2315 	 * Setup LNK_SPAN fields for autoinitiated state machine
2316 	 */
2317 	pmp->iocom.auto_lnk_span.pfs_clid = ipdata->pfs_clid;
2318 	pmp->iocom.auto_lnk_span.pfs_fsid = ipdata->pfs_fsid;
2319 	pmp->iocom.auto_lnk_span.pfs_type = ipdata->pfs_type;
2320 	pmp->iocom.auto_lnk_span.peer_type = hmp->voldata.peer_type;
2321 	pmp->iocom.auto_lnk_span.proto_version = DMSG_SPAN_PROTO_1;
2322 	name_len = ipdata->name_len;
2323 	if (name_len >= sizeof(pmp->iocom.auto_lnk_span.fs_label))
2324 		name_len = sizeof(pmp->iocom.auto_lnk_span.fs_label) - 1;
2325 	bcopy(ipdata->filename,
2326 	      pmp->iocom.auto_lnk_span.fs_label,
2327 	      name_len);
2328 	pmp->iocom.auto_lnk_span.fs_label[name_len] = 0;
2329 	hammer2_inode_unlock_ex(pmp->iroot, cparent);
2330 
2331 	kdmsg_iocom_autoinitiate(&pmp->iocom, hammer2_autodmsg);
2332 }
2333 
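/*
 * Incoming kdmsg handler (presumably installed by the iocom setup code
 * elsewhere).  Replies DMSG_ERR_NOSUPP to DBG_SHELL requests since shell
 * execution is not supported, prints the payload of DBG_SHELL replies,
 * and NAKs any other incoming transaction so the remote end can close.
 */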
2334 static int
2335 hammer2_rcvdmsg(kdmsg_msg_t *msg)
2336 {
2337 	switch(msg->any.head.cmd & DMSGF_TRANSMASK) {
2338 	case DMSG_DBG_SHELL:
2339 		/*
2340 		 * (non-transaction)
2341 		 * Execute shell command (not supported atm)
2342 		 */
2343 		kdmsg_msg_reply(msg, DMSG_ERR_NOSUPP);
2344 		break;
2345 	case DMSG_DBG_SHELL | DMSGF_REPLY:
2346 		/*
2347 		 * (non-transaction)
2348 		 */
2349 		if (msg->aux_data) {
2350 			msg->aux_data[msg->aux_size - 1] = 0;
2351 			kprintf("HAMMER2 DBG: %s\n", msg->aux_data);
2352 		}
2353 		break;
2354 	default:
2355 		/*
2356 		 * Unsupported message received.  We only need to
2357 		 * reply if it's a transaction in order to close our end.
2358 		 * Ignore any one-way messages are any further messages
2359 		 * Ignore any one-way messages or any further messages
2360 		 *
2361 		 * NOTE: This case also includes DMSG_LNK_ERROR messages
2362 		 *	 which might be one-way, replying to those would
2363 		 *	 cause an infinite ping-pong.
2364 		 */
2365 		if (msg->any.head.cmd & DMSGF_CREATE)
2366 			kdmsg_msg_reply(msg, DMSG_ERR_NOSUPP);
2367 		break;
2368 	}
2369 	return(0);
2370 }
2371 
2372 /*
2373  * This function is called after KDMSG has automatically handled processing
2374  * of a LNK layer message (typically CONN, SPAN, or CIRC).
2375  *
2376  * We tag off the LNK_CONN to trigger our LNK_VOLCONF messages which
2377  * advertises all available hammer2 super-root volumes.
2378  */
2379 static void
2380 hammer2_autodmsg(kdmsg_msg_t *msg)
2381 {
2382 	hammer2_pfsmount_t *pmp = msg->iocom->handle;
2383 	hammer2_mount_t *hmp = pmp->cluster.focus->hmp; /* XXX */
2384 	int copyid;
2385 
2386 	/*
2387 	 * We only care about replies to our LNK_CONN auto-request.  kdmsg
2388 	 * has already processed the reply; we use this callback as a shim
2389 	 * to know when we can advertise available super-root volumes.
2390 	 */
2391 	if ((msg->any.head.cmd & DMSGF_TRANSMASK) !=
2392 	    (DMSG_LNK_CONN | DMSGF_CREATE | DMSGF_REPLY) ||
2393 	    msg->state == NULL) {
2394 		return;
2395 	}
2396 
2397 	kprintf("LNK_CONN REPLY RECEIVED CMD %08x\n", msg->any.head.cmd);
2398 
2399 	if (msg->any.head.cmd & DMSGF_CREATE) {
2400 		kprintf("HAMMER2: VOLDATA DUMP\n");
2401 
2402 		/*
2403 		 * Dump the configuration stored in the volume header
2404 		 */
2405 		hammer2_voldata_lock(hmp);
2406 		for (copyid = 0; copyid < HAMMER2_COPYID_COUNT; ++copyid) {
2407 			if (hmp->voldata.copyinfo[copyid].copyid == 0)
2408 				continue;
2409 			hammer2_volconf_update(pmp, copyid);
2410 		}
2411 		hammer2_voldata_unlock(hmp, 0);
2412 	}
2413 	if ((msg->any.head.cmd & DMSGF_DELETE) &&
2414 	    msg->state && (msg->state->txcmd & DMSGF_DELETE) == 0) {
2415 		kprintf("HAMMER2: CONN WAS TERMINATED\n");
2416 	}
2417 }
2418 
2419 /*
2420  * Volume configuration updates are passed onto the userland service
2421  * daemon via the open LNK_CONN transaction.
2422  */
2423 void
2424 hammer2_volconf_update(hammer2_pfsmount_t *pmp, int index)
2425 {
2426 	hammer2_mount_t *hmp = pmp->cluster.focus->hmp;	/* XXX */
2427 	kdmsg_msg_t *msg;
2428 
2429 	/* XXX interlock against connection state termination */
2430 	kprintf("volconf update %p\n", pmp->iocom.conn_state);
2431 	if (pmp->iocom.conn_state) {
2432 		kprintf("TRANSMIT VOLCONF VIA OPEN CONN TRANSACTION\n");
2433 		msg = kdmsg_msg_alloc_state(pmp->iocom.conn_state,
2434 					    DMSG_LNK_VOLCONF, NULL, NULL);
2435 		msg->any.lnk_volconf.copy = hmp->voldata.copyinfo[index];
2436 		msg->any.lnk_volconf.mediaid = hmp->voldata.fsid;
2437 		msg->any.lnk_volconf.index = index;
2438 		kdmsg_msg_write(msg);
2439 	}
2440 }
2441 
2442 /*
2443  * This handles hysteresis on regular file flushes.  Because the BIOs are
2444  * routed to a thread it is possible for an excessive number to build up
2445  * and cause long front-end stalls long before the runningbuffspace limit
2446  * is hit, so we implement hammer2_flush_pipe to control the
2447  * hysteresis.
2448  *
2449  * This is a particular problem when compression is used.
2450  */
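/*
 * The pipe is a simple counted gate: hammer2_lwinprog_ref() bumps the
 * in-progress count, hammer2_lwinprog_drop() decrements it and wakes any
 * waiter once the count falls to 2/3 of hammer2_flush_pipe, and
 * hammer2_lwinprog_wait() blocks the caller while the count is at or
 * above hammer2_flush_pipe.  Callers generating logical write BIOs are
 * expected to pair ref/drop and throttle with wait (the actual call
 * sites live in the write path, not in this file).
 */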
2451 void
2452 hammer2_lwinprog_ref(hammer2_pfsmount_t *pmp)
2453 {
2454 	atomic_add_int(&pmp->count_lwinprog, 1);
2455 }
2456 
2457 void
2458 hammer2_lwinprog_drop(hammer2_pfsmount_t *pmp)
2459 {
2460 	int lwinprog;
2461 
2462 	lwinprog = atomic_fetchadd_int(&pmp->count_lwinprog, -1);
2463 	if ((lwinprog & HAMMER2_LWINPROG_WAITING) &&
2464 	    (lwinprog & HAMMER2_LWINPROG_MASK) <= hammer2_flush_pipe * 2 / 3) {
2465 		atomic_clear_int(&pmp->count_lwinprog,
2466 				 HAMMER2_LWINPROG_WAITING);
2467 		wakeup(&pmp->count_lwinprog);
2468 	}
2469 }
2470 
2471 void
2472 hammer2_lwinprog_wait(hammer2_pfsmount_t *pmp)
2473 {
2474 	int lwinprog;
2475 
2476 	for (;;) {
2477 		lwinprog = pmp->count_lwinprog;
2478 		cpu_ccfence();
2479 		if ((lwinprog & HAMMER2_LWINPROG_MASK) < hammer2_flush_pipe)
2480 			break;
2481 		tsleep_interlock(&pmp->count_lwinprog, 0);
2482 		atomic_set_int(&pmp->count_lwinprog, HAMMER2_LWINPROG_WAITING);
2483 		lwinprog = pmp->count_lwinprog;
2484 		if ((lwinprog & HAMMER2_LWINPROG_MASK) < hammer2_flush_pipe)
2485 			break;
2486 		tsleep(&pmp->count_lwinprog, PINTERLOCKED, "h2wpipe", hz);
2487 	}
2488 }
2489 
2490 /*
2491  * Manage excessive memory resource use for chain and related
2492  * structures.
2493  */
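/*
 * pmp->inmem_dirty_chains packs the dirty chain count in the low bits
 * (HAMMER2_DIRTYCHAIN_MASK) together with a WAITING flag.  _inc() bumps
 * the count, _wakeup() drops it and wakes any waiter, and _wait() blocks
 * while the count exceeds max(mnt_nvnodelistsize / 10,
 * hammer2_limit_dirty_chains, 1000), kicking the syncer early once the
 * count passes 70% of that limit.  A hypothetical caller dirtying a
 * chain would typically call _wait() followed by _inc(), with _wakeup()
 * issued when the chain is cleaned (illustrative only; the real call
 * sites are elsewhere).
 */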
2494 void
2495 hammer2_pfs_memory_wait(hammer2_pfsmount_t *pmp)
2496 {
2497 	long waiting;
2498 	long count;
2499 	long limit;
2500 #if 0
2501 	static int zzticks;
2502 #endif
2503 
2504 	/*
2505 	 * Atomic check condition and wait.  Also do an early speedup of
2506 	 * the syncer to try to avoid hitting the wait.
2507 	 */
2508 	for (;;) {
2509 		waiting = pmp->inmem_dirty_chains;
2510 		cpu_ccfence();
2511 		count = waiting & HAMMER2_DIRTYCHAIN_MASK;
2512 
2513 		limit = pmp->mp->mnt_nvnodelistsize / 10;
2514 		if (limit < hammer2_limit_dirty_chains)
2515 			limit = hammer2_limit_dirty_chains;
2516 		if (limit < 1000)
2517 			limit = 1000;
2518 
2519 #if 0
2520 		if ((int)(ticks - zzticks) > hz) {
2521 			zzticks = ticks;
2522 			kprintf("count %ld %ld\n", count, limit);
2523 		}
2524 #endif
2525 
2526 		/*
2527 		 * Block if there are too many dirty chains present, wait
2528 		 * for the flush to clean some out.
2529 		 */
2530 		if (count > limit) {
2531 			tsleep_interlock(&pmp->inmem_dirty_chains, 0);
2532 			if (atomic_cmpset_long(&pmp->inmem_dirty_chains,
2533 					       waiting,
2534 				       waiting | HAMMER2_DIRTYCHAIN_WAITING)) {
2535 				speedup_syncer(pmp->mp);
2536 				tsleep(&pmp->inmem_dirty_chains, PINTERLOCKED,
2537 				       "chnmem", hz);
2538 			}
2539 			continue;	/* loop on success or fail */
2540 		}
2541 
2542 		/*
2543 		 * Try to start an early flush before we are forced to block.
2544 		 */
2545 		if (count > limit * 7 / 10)
2546 			speedup_syncer(pmp->mp);
2547 		break;
2548 	}
2549 }
2550 
2551 void
2552 hammer2_pfs_memory_inc(hammer2_pfsmount_t *pmp)
2553 {
2554 	if (pmp)
2555 		atomic_add_long(&pmp->inmem_dirty_chains, 1);
2556 }
2557 
2558 void
2559 hammer2_pfs_memory_wakeup(hammer2_pfsmount_t *pmp)
2560 {
2561 	long waiting;
2562 
2563 	if (pmp == NULL)
2564 		return;
2565 
2566 	for (;;) {
2567 		waiting = pmp->inmem_dirty_chains;
2568 		cpu_ccfence();
2569 		if (atomic_cmpset_long(&pmp->inmem_dirty_chains,
2570 				       waiting,
2571 				       (waiting - 1) &
2572 					~HAMMER2_DIRTYCHAIN_WAITING)) {
2573 			break;
2574 		}
2575 	}
2576 
2577 	if (waiting & HAMMER2_DIRTYCHAIN_WAITING)
2578 		wakeup(&pmp->inmem_dirty_chains);
2579 }
2580 
2581 /*
2582  * Debugging
2583  */
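/*
 * hammer2_dump_chain() recursively dumps a chain and its sub-topology.
 * *countp limits the total number of chains printed (an ellipsis is
 * emitted when it reaches zero), tab controls indentation, and pfx
 * marks the origin of each entry: 'a' for the live rbtree, 'r' for the
 * dbtree, and 'd' for the dbq list.
 */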
2584 void
2585 hammer2_dump_chain(hammer2_chain_t *chain, int tab, int *countp, char pfx)
2586 {
2587 	hammer2_chain_t *scan;
2588 	hammer2_chain_t *first_parent;
2589 
2590 	--*countp;
2591 	if (*countp == 0) {
2592 		kprintf("%*.*s...\n", tab, tab, "");
2593 		return;
2594 	}
2595 	if (*countp < 0)
2596 		return;
2597 	first_parent = chain->core ? TAILQ_FIRST(&chain->core->ownerq) : NULL;
2598 	kprintf("%*.*s%c-chain %p.%d %016jx/%d mir=%016jx\n",
2599 		tab, tab, "", pfx,
2600 		chain, chain->bref.type,
2601 		chain->bref.key, chain->bref.keybits,
2602 		chain->bref.mirror_tid);
2603 
2604 	kprintf("%*.*s      [%08x] (%s) mod=%016jx del=%016jx "
2605 		"lo=%08jx hi=%08jx refs=%d\n",
2606 		tab, tab, "",
2607 		chain->flags,
2608 		((chain->bref.type == HAMMER2_BREF_TYPE_INODE &&
2609 		chain->data) ?  (char *)chain->data->ipdata.filename : "?"),
2610 		chain->modify_tid,
2611 		chain->delete_tid,
2612 		chain->update_lo,
2613 		chain->update_hi,
2614 		chain->refs);
2615 
2616 	kprintf("%*.*s      core %p [%08x]",
2617 		tab, tab, "",
2618 		chain->core, (chain->core ? chain->core->flags : 0));
2619 
2620 	if (first_parent)
2621 		kprintf("\n%*.*s      fp=%p np=%p [fpflags %08x fprefs %d",
2622 			tab, tab, "",
2623 			first_parent,
2624 			(first_parent ? TAILQ_NEXT(first_parent, core_entry) :
2625 					NULL),
2626 			first_parent->flags,
2627 			first_parent->refs);
2628 	if (chain->core == NULL || RB_EMPTY(&chain->core->rbtree))
2629 		kprintf("\n");
2630 	else
2631 		kprintf(" {\n");
2632 	if (chain->core) {
2633 		RB_FOREACH(scan, hammer2_chain_tree, &chain->core->rbtree)
2634 			hammer2_dump_chain(scan, tab + 4, countp, 'a');
2635 		RB_FOREACH(scan, hammer2_chain_tree, &chain->core->dbtree)
2636 			hammer2_dump_chain(scan, tab + 4, countp, 'r');
2637 		TAILQ_FOREACH(scan, &chain->core->dbq, db_entry)
2638 			hammer2_dump_chain(scan, tab + 4, countp, 'd');
2639 	}
2640 	if (chain->core && !RB_EMPTY(&chain->core->rbtree)) {
2641 		if (chain->bref.type == HAMMER2_BREF_TYPE_INODE && chain->data)
2642 			kprintf("%*.*s}(%s)\n", tab, tab, "",
2643 				chain->data->ipdata.filename);
2644 		else
2645 			kprintf("%*.*s}\n", tab, tab, "");
2646 	}
2647 }
2648