xref: /dragonfly/sys/vfs/hammer2/hammer2_inode.c (revision 896f2e3a)
1 /*
2  * Copyright (c) 2011-2014 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in
16  *    the documentation and/or other materials provided with the
17  *    distribution.
18  * 3. Neither the name of The DragonFly Project nor the names of its
19  *    contributors may be used to endorse or promote products derived
20  *    from this software without specific, prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
26  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 #include <sys/cdefs.h>
36 #include <sys/param.h>
37 #include <sys/systm.h>
38 #include <sys/types.h>
39 #include <sys/lock.h>
40 #include <sys/uuid.h>
41 
42 #include "hammer2.h"
43 
44 #define INODE_DEBUG	0
45 
46 static void hammer2_inode_move_to_hidden(hammer2_trans_t *trans,
47 					 hammer2_cluster_t **cparentp,
48 					 hammer2_cluster_t **clusterp,
49 					 hammer2_tid_t inum);
50 
51 RB_GENERATE2(hammer2_inode_tree, hammer2_inode, rbnode, hammer2_inode_cmp,
52 	     hammer2_tid_t, inum);
53 
54 int
55 hammer2_inode_cmp(hammer2_inode_t *ip1, hammer2_inode_t *ip2)
56 {
57 	if (ip1->inum < ip2->inum)
58 		return(-1);
59 	if (ip1->inum > ip2->inum)
60 		return(1);
61 	return(0);
62 }
63 
64 /*
65  * HAMMER2 inode locks
66  *
67  * HAMMER2 offers shared and exclusive locks on inodes.  Pass a mask of
68  * flags for options:
69  *
70  *	- pass HAMMER2_RESOLVE_SHARED if a shared lock is desired.  The
71  *	  inode locking function will automatically set the RDONLY flag.
72  *
73  *	- pass HAMMER2_RESOLVE_ALWAYS if you need the inode's meta-data.
74  *	  Most front-end inode locks do.
75  *
76  *	- pass HAMMER2_RESOLVE_NEVER if you do not want to require that
77  *	  the inode data be resolved.  This is used by the syncthr because
78  *	  it can run on an unresolved/out-of-sync cluster, and also by the
79  *	  vnode reclamation code to avoid unnecessary I/O (particularly when
80  *	  disposing of hundreds of thousands of cached vnodes).
81  *
82  * The inode locking function locks the inode itself, resolves any stale
83  * chains in the inode's cluster, and allocates a fresh copy of the
84  * cluster with 1 ref and all the underlying chains locked.
85  *
86  * ip->cluster will be stable while the inode is locked.
87  *
88  * NOTE: We don't combine the inode/chain lock because putting away an
89  *       inode would otherwise confuse multiple lock holders of the inode.
90  *
91  * NOTE: In-memory inodes always point to hardlink targets (the actual file),
92  *	 and never point to a hardlink pointer.
93  *
94  * NOTE: If caller passes HAMMER2_RESOLVE_RDONLY the exclusive locking code
95  *	 will feel free to reduce the chain set in the cluster as an
96  *	 optimization.  It will still be validated against the quorum if
97  *	 appropriate, but the optimization might be able to reduce data
98  *	 accesses to one node.  This flag is automatically set if the inode
99  *	 is locked with HAMMER2_RESOLVE_SHARED.
100  */
hammer2_cluster_t *
hammer2_inode_lock(hammer2_inode_t *ip, int how)
{
	hammer2_cluster_t *cluster;

	/*
	 * Extra ref held across the lock so the inode cannot be freed
	 * while locked; dropped by hammer2_inode_unlock().
	 */
	hammer2_inode_ref(ip);

	/*
	 * Inode structure mutex.  A shared request implies RDONLY so the
	 * cluster locking code below may reduce the chain set (see the
	 * NOTE in the function header).
	 */
	if (how & HAMMER2_RESOLVE_SHARED) {
		how |= HAMMER2_RESOLVE_RDONLY;
		hammer2_mtx_sh(&ip->lock);
	} else {
		hammer2_mtx_ex(&ip->lock);
	}

	/*
	 * Create a copy of ip->cluster and lock it.  Note that the copy
	 * will have a ref on the cluster AND its chains and we don't want
	 * a second ref to either when we lock it.
	 *
	 * The copy will not have a focus until it is locked.
	 *
	 * Exclusive inode locks set the template focus chain in (ip)
	 * as a hint.  Cluster locks can ALWAYS replace the focus in the
	 * working copy if the hint does not work out, so beware.
	 */
	cluster = hammer2_cluster_copy(&ip->cluster);
	hammer2_cluster_lock(cluster, how);

	/*
	 * cluster->focus will be set if resolving RESOLVE_ALWAYS, but
	 * only update the cached focus in the inode structure when taking
	 * out an exclusive lock.
	 */
	if ((how & HAMMER2_RESOLVE_SHARED) == 0)
		ip->cluster.focus = cluster->focus;

	/*
	 * Returned cluster must resolve hardlink pointers.
	 * XXX remove me.
	 */
	if ((how & HAMMER2_RESOLVE_MASK) == HAMMER2_RESOLVE_ALWAYS &&
	    cluster->error == 0 &&
	    cluster->focus) {
		const hammer2_inode_data_t *ripdata;

		/* sanity: in-memory inodes never point at hardlink pointers */
		ripdata = &hammer2_cluster_rdata(cluster)->ipdata;
		KKASSERT(ripdata->type != HAMMER2_OBJTYPE_HARDLINK);
	}
	return (cluster);
}
154 
155 void
156 hammer2_inode_unlock(hammer2_inode_t *ip, hammer2_cluster_t *cluster)
157 {
158 	if (cluster) {
159 		hammer2_cluster_unlock(cluster);
160 		hammer2_cluster_drop(cluster);
161 	}
162 	hammer2_mtx_unlock(&ip->lock);
163 	hammer2_inode_drop(ip);
164 }
165 
166 /*
167  * Temporarily release a lock held shared or exclusive.  Caller must
168  * hold the lock shared or exclusive on call and lock will be released
169  * on return.
170  *
171  * Restore a lock that was temporarily released.
172  */
173 hammer2_mtx_state_t
174 hammer2_inode_lock_temp_release(hammer2_inode_t *ip)
175 {
176 	return hammer2_mtx_temp_release(&ip->lock);
177 }
178 
179 void
180 hammer2_inode_lock_temp_restore(hammer2_inode_t *ip, hammer2_mtx_state_t ostate)
181 {
182 	hammer2_mtx_temp_restore(&ip->lock, ostate);
183 }
184 
185 /*
186  * Upgrade a shared inode lock to exclusive and return.  If the inode lock
187  * is already held exclusively this is a NOP.
188  *
189  * The caller MUST hold the inode lock either shared or exclusive on call
190  * and will own the lock exclusively on return.
191  *
192  * Returns non-zero if the lock was already exclusive prior to the upgrade.
193  */
194 int
195 hammer2_inode_lock_upgrade(hammer2_inode_t *ip)
196 {
197 	int wasexclusive;
198 
199 	if (mtx_islocked_ex(&ip->lock)) {
200 		wasexclusive = 1;
201 	} else {
202 		hammer2_mtx_unlock(&ip->lock);
203 		hammer2_mtx_ex(&ip->lock);
204 		wasexclusive = 0;
205 	}
206 	return wasexclusive;
207 }
208 
209 /*
210  * Downgrade an inode lock from exclusive to shared only if the inode
211  * lock was previously shared.  If the inode lock was previously exclusive,
212  * this is a NOP.
213  */
214 void
215 hammer2_inode_lock_downgrade(hammer2_inode_t *ip, int wasexclusive)
216 {
217 	if (wasexclusive == 0)
218 		mtx_downgrade(&ip->lock);
219 }
220 
221 /*
222  * Lookup an inode by inode number
223  */
224 hammer2_inode_t *
225 hammer2_inode_lookup(hammer2_pfs_t *pmp, hammer2_tid_t inum)
226 {
227 	hammer2_inode_t *ip;
228 
229 	KKASSERT(pmp);
230 	if (pmp->spmp_hmp) {
231 		ip = NULL;
232 	} else {
233 		hammer2_spin_ex(&pmp->inum_spin);
234 		ip = RB_LOOKUP(hammer2_inode_tree, &pmp->inum_tree, inum);
235 		if (ip)
236 			hammer2_inode_ref(ip);
237 		hammer2_spin_unex(&pmp->inum_spin);
238 	}
239 	return(ip);
240 }
241 
242 /*
243  * Adding a ref to an inode is only legal if the inode already has at least
244  * one ref.
245  *
246  * (can be called with spinlock held)
247  */
void
hammer2_inode_ref(hammer2_inode_t *ip)
{
	/* unconditional atomic bump; caller guarantees refs >= 1 already */
	atomic_add_int(&ip->refs, 1);
}
253 
254 /*
255  * Drop an inode reference, freeing the inode when the last reference goes
256  * away.
257  */
258 void
259 hammer2_inode_drop(hammer2_inode_t *ip)
260 {
261 	hammer2_pfs_t *pmp;
262 	hammer2_inode_t *pip;
263 	u_int refs;
264 
265 	while (ip) {
266 		refs = ip->refs;
267 		cpu_ccfence();
268 		if (refs == 1) {
269 			/*
270 			 * Transition to zero, must interlock with
271 			 * the inode inumber lookup tree (if applicable).
272 			 * It should not be possible for anyone to race
273 			 * the transition to 0.
274 			 *
275 			 */
276 			pmp = ip->pmp;
277 			KKASSERT(pmp);
278 			hammer2_spin_ex(&pmp->inum_spin);
279 
280 			if (atomic_cmpset_int(&ip->refs, 1, 0)) {
281 				KKASSERT(hammer2_mtx_refs(&ip->lock) == 0);
282 				if (ip->flags & HAMMER2_INODE_ONRBTREE) {
283 					atomic_clear_int(&ip->flags,
284 						     HAMMER2_INODE_ONRBTREE);
285 					RB_REMOVE(hammer2_inode_tree,
286 						  &pmp->inum_tree, ip);
287 				}
288 				hammer2_spin_unex(&pmp->inum_spin);
289 
290 				pip = ip->pip;
291 				ip->pip = NULL;
292 				ip->pmp = NULL;
293 
294 				/*
295 				 * Cleaning out ip->cluster isn't entirely
296 				 * trivial.
297 				 */
298 				hammer2_inode_repoint(ip, NULL, NULL);
299 
300 				/*
301 				 * We have to drop pip (if non-NULL) to
302 				 * dispose of our implied reference from
303 				 * ip->pip.  We can simply loop on it.
304 				 */
305 				kfree(ip, pmp->minode);
306 				atomic_add_long(&pmp->inmem_inodes, -1);
307 				ip = pip;
308 				/* continue with pip (can be NULL) */
309 			} else {
310 				hammer2_spin_unex(&ip->pmp->inum_spin);
311 			}
312 		} else {
313 			/*
314 			 * Non zero transition
315 			 */
316 			if (atomic_cmpset_int(&ip->refs, refs, refs - 1))
317 				break;
318 		}
319 	}
320 }
321 
322 /*
323  * Get the vnode associated with the given inode, allocating the vnode if
324  * necessary.  The vnode will be returned exclusively locked.
325  *
326  * The caller must lock the inode (shared or exclusive).
327  *
328  * Great care must be taken to avoid deadlocks and vnode acquisition/reclaim
329  * races.
330  */
331 struct vnode *
332 hammer2_igetv(hammer2_inode_t *ip, hammer2_cluster_t *cparent, int *errorp)
333 {
334 	const hammer2_inode_data_t *ripdata;
335 	hammer2_pfs_t *pmp;
336 	struct vnode *vp;
337 
338 	pmp = ip->pmp;
339 	KKASSERT(pmp != NULL);
340 	*errorp = 0;
341 
342 	ripdata = &hammer2_cluster_rdata(cparent)->ipdata;
343 
344 	for (;;) {
345 		/*
346 		 * Attempt to reuse an existing vnode assignment.  It is
347 		 * possible to race a reclaim so the vget() may fail.  The
348 		 * inode must be unlocked during the vget() to avoid a
349 		 * deadlock against a reclaim.
350 		 */
351 		int wasexclusive;
352 
353 		vp = ip->vp;
354 		if (vp) {
355 			/*
356 			 * Inode must be unlocked during the vget() to avoid
357 			 * possible deadlocks, but leave the ip ref intact.
358 			 *
359 			 * vnode is held to prevent destruction during the
360 			 * vget().  The vget() can still fail if we lost
361 			 * a reclaim race on the vnode.
362 			 */
363 			hammer2_mtx_state_t ostate;
364 
365 			vhold(vp);
366 			ostate = hammer2_inode_lock_temp_release(ip);
367 			if (vget(vp, LK_EXCLUSIVE)) {
368 				vdrop(vp);
369 				hammer2_inode_lock_temp_restore(ip, ostate);
370 				continue;
371 			}
372 			hammer2_inode_lock_temp_restore(ip, ostate);
373 			vdrop(vp);
374 			/* vp still locked and ref from vget */
375 			if (ip->vp != vp) {
376 				kprintf("hammer2: igetv race %p/%p\n",
377 					ip->vp, vp);
378 				vput(vp);
379 				continue;
380 			}
381 			*errorp = 0;
382 			break;
383 		}
384 
385 		/*
386 		 * No vnode exists, allocate a new vnode.  Beware of
387 		 * allocation races.  This function will return an
388 		 * exclusively locked and referenced vnode.
389 		 */
390 		*errorp = getnewvnode(VT_HAMMER2, pmp->mp, &vp, 0, 0);
391 		if (*errorp) {
392 			kprintf("hammer2: igetv getnewvnode failed %d\n",
393 				*errorp);
394 			vp = NULL;
395 			break;
396 		}
397 
398 		/*
399 		 * Lock the inode and check for an allocation race.
400 		 */
401 		wasexclusive = hammer2_inode_lock_upgrade(ip);
402 		if (ip->vp != NULL) {
403 			vp->v_type = VBAD;
404 			vx_put(vp);
405 			hammer2_inode_lock_downgrade(ip, wasexclusive);
406 			continue;
407 		}
408 
409 		switch (ripdata->type) {
410 		case HAMMER2_OBJTYPE_DIRECTORY:
411 			vp->v_type = VDIR;
412 			break;
413 		case HAMMER2_OBJTYPE_REGFILE:
414 			vp->v_type = VREG;
415 			vinitvmio(vp, ripdata->size,
416 				  HAMMER2_LBUFSIZE,
417 				  (int)ripdata->size & HAMMER2_LBUFMASK);
418 			break;
419 		case HAMMER2_OBJTYPE_SOFTLINK:
420 			/*
421 			 * XXX for now we are using the generic file_read
422 			 * and file_write code so we need a buffer cache
423 			 * association.
424 			 */
425 			vp->v_type = VLNK;
426 			vinitvmio(vp, ripdata->size,
427 				  HAMMER2_LBUFSIZE,
428 				  (int)ripdata->size & HAMMER2_LBUFMASK);
429 			break;
430 		case HAMMER2_OBJTYPE_CDEV:
431 			vp->v_type = VCHR;
432 			/* fall through */
433 		case HAMMER2_OBJTYPE_BDEV:
434 			vp->v_ops = &pmp->mp->mnt_vn_spec_ops;
435 			if (ripdata->type != HAMMER2_OBJTYPE_CDEV)
436 				vp->v_type = VBLK;
437 			addaliasu(vp, ripdata->rmajor, ripdata->rminor);
438 			break;
439 		case HAMMER2_OBJTYPE_FIFO:
440 			vp->v_type = VFIFO;
441 			vp->v_ops = &pmp->mp->mnt_vn_fifo_ops;
442 			break;
443 		default:
444 			panic("hammer2: unhandled objtype %d", ripdata->type);
445 			break;
446 		}
447 
448 		if (ip == pmp->iroot)
449 			vsetflags(vp, VROOT);
450 
451 		vp->v_data = ip;
452 		ip->vp = vp;
453 		hammer2_inode_ref(ip);		/* vp association */
454 		hammer2_inode_lock_downgrade(ip, wasexclusive);
455 		break;
456 	}
457 
458 	/*
459 	 * Return non-NULL vp and *errorp == 0, or NULL vp and *errorp != 0.
460 	 */
461 	if (hammer2_debug & 0x0002) {
462 		kprintf("igetv vp %p refs 0x%08x aux 0x%08x\n",
463 			vp, vp->v_refcnt, vp->v_auxrefs);
464 	}
465 	return (vp);
466 }
467 
468 /*
469  * Returns the inode associated with the passed-in cluster, creating the
470  * inode if necessary and synchronizing it to the passed-in cluster otherwise.
471  *
472  * The passed-in cluster must be locked and will remain locked on return.
473  * The returned inode will be locked and the caller may dispose of both
474  * via hammer2_inode_unlock_ex().  However, if the caller needs to resolve
475  * a hardlink it must ref/unlock/relock/drop the inode.
476  *
477  * The hammer2_inode structure regulates the interface between the high level
478  * kernel VNOPS API and the filesystem backend (the chains).
479  *
480  * On return the inode is locked with the supplied cluster.
481  */
hammer2_inode_t *
hammer2_inode_get(hammer2_pfs_t *pmp, hammer2_inode_t *dip,
		  hammer2_cluster_t *cluster)
{
	hammer2_inode_t *nip;
	const hammer2_inode_data_t *iptmp;
	const hammer2_inode_data_t *nipdata;

	KKASSERT(cluster == NULL ||
		 hammer2_cluster_type(cluster) == HAMMER2_BREF_TYPE_INODE);
	KKASSERT(pmp);

	/*
	 * Interlocked lookup/ref of the inode.  This code is only needed
	 * when looking up inodes with nlinks != 0 (TODO: optimize out
	 * otherwise and test for duplicates).
	 *
	 * Cluster can be NULL during the initial pfs allocation.
	 */
again:
	while (cluster) {
		iptmp = &hammer2_cluster_rdata(cluster)->ipdata;
		nip = hammer2_inode_lookup(pmp, iptmp->inum);
		if (nip == NULL)
			break;		/* not cached; allocate a new one */

		hammer2_mtx_ex(&nip->lock);

		/*
		 * Handle SMP race (not applicable to the super-root spmp
		 * which can't index inodes due to duplicative inode numbers).
		 * If the inode was removed from the tree while we waited
		 * for its lock, drop it and retry the lookup.
		 */
		if (pmp->spmp_hmp == NULL &&
		    (nip->flags & HAMMER2_INODE_ONRBTREE) == 0) {
			hammer2_mtx_unlock(&nip->lock);
			hammer2_inode_drop(nip);
			continue;
		}
		/* synchronize the cached inode to the passed-in cluster */
		hammer2_inode_repoint(nip, NULL, cluster);

		return nip;	/* returned referenced and exclusively locked */
	}

	/*
	 * We couldn't find the inode number, create a new inode.
	 */
	nip = kmalloc(sizeof(*nip), pmp->minode, M_WAITOK | M_ZERO);
	atomic_add_long(&pmp->inmem_inodes, 1);
	hammer2_pfs_memory_inc(pmp);
	hammer2_pfs_memory_wakeup(pmp);
	if (pmp->spmp_hmp)
		nip->flags = HAMMER2_INODE_SROOT;

	/*
	 * Initialize nip's cluster.  A cluster is provided for normal
	 * inodes but typically not for the super-root or PFS inodes.
	 */
	nip->cluster.refs = 1;
	nip->cluster.pmp = pmp;
	nip->cluster.flags |= HAMMER2_CLUSTER_INODE;
	if (cluster) {
		nipdata = &hammer2_cluster_rdata(cluster)->ipdata;
		nip->inum = nipdata->inum;
		nip->size = nipdata->size;
		nip->mtime = nipdata->mtime;
		hammer2_inode_repoint(nip, NULL, cluster);
	} else {
		nip->inum = 1;			/* PFS inum is always 1 XXX */
		/* mtime will be updated when a cluster is available */
	}

	nip->pip = dip;				/* can be NULL */
	if (dip)
		hammer2_inode_ref(dip);	/* ref dip for nip->pip */

	nip->pmp = pmp;

	/*
	 * ref and lock on nip gives it state compatible to after a
	 * hammer2_inode_lock() call.
	 */
	nip->refs = 1;
	hammer2_mtx_init(&nip->lock, "h2inode");
	hammer2_mtx_ex(&nip->lock);
	/* combination of thread lock and chain lock == inode lock */

	/*
	 * Attempt to add the inode.  If it fails we raced another inode
	 * get.  Undo all the work and try again.
	 */
	if (pmp->spmp_hmp == NULL) {
		hammer2_spin_ex(&pmp->inum_spin);
		if (RB_INSERT(hammer2_inode_tree, &pmp->inum_tree, nip)) {
			hammer2_spin_unex(&pmp->inum_spin);
			hammer2_mtx_unlock(&nip->lock);
			hammer2_inode_drop(nip);
			goto again;
		}
		atomic_set_int(&nip->flags, HAMMER2_INODE_ONRBTREE);
		hammer2_spin_unex(&pmp->inum_spin);
	}

	return (nip);
}
586 
587 /*
588  * Create a new inode in the specified directory using the vattr to
589  * figure out the type of inode.
590  *
591  * If no error occurs the new inode with its cluster locked is returned in
592  * *nipp, otherwise an error is returned and *nipp is set to NULL.
593  *
594  * If vap and/or cred are NULL the related fields are not set and the
595  * inode type defaults to a directory.  This is used when creating PFSs
596  * under the super-root, so the inode number is set to 1 in this case.
597  *
598  * dip is not locked on entry.
599  *
600  * NOTE: When used to create a snapshot, the inode is temporarily associated
601  *	 with the super-root spmp. XXX should pass new pmp for snapshot.
602  */
hammer2_inode_t *
hammer2_inode_create(hammer2_trans_t *trans, hammer2_inode_t *dip,
		     struct vattr *vap, struct ucred *cred,
		     const uint8_t *name, size_t name_len,
		     hammer2_cluster_t **clusterp,
		     int flags, int *errorp)
{
	const hammer2_inode_data_t *dipdata;
	hammer2_inode_data_t *nipdata;
	hammer2_cluster_t *cluster;
	hammer2_cluster_t *cparent;
	hammer2_inode_t *nip;
	hammer2_key_t key_dummy;
	hammer2_key_t lhc;
	int error;
	uid_t xuid;
	uuid_t dip_uid;
	uuid_t dip_gid;
	uint32_t dip_mode;
	uint8_t dip_comp_algo;
	uint8_t dip_check_algo;

	/* directory-hash key for the new name; iterated on collision below */
	lhc = hammer2_dirhash(name, name_len);
	*errorp = 0;

	/*
	 * Locate the inode or indirect block to create the new
	 * entry in.  At the same time check for key collisions
	 * and iterate until we don't get one.
	 *
	 * NOTE: hidden inodes do not have iterators.
	 */
retry:
	cparent = hammer2_inode_lock(dip, HAMMER2_RESOLVE_ALWAYS);
	dipdata = &hammer2_cluster_rdata(cparent)->ipdata;
	/*
	 * Snapshot the parent attributes we need so they remain usable
	 * after the parent is unlocked further below.
	 */
	dip_uid = dipdata->uid;
	dip_gid = dipdata->gid;
	dip_mode = dipdata->mode;
	dip_comp_algo = dipdata->comp_algo;
	dip_check_algo = dipdata->check_algo;

	error = 0;
	while (error == 0) {
		cluster = hammer2_cluster_lookup(cparent, &key_dummy,
						 lhc, lhc, 0);
		if (cluster == NULL)
			break;		/* free key found */
		/* cannot iterate past the visible-hash or low-mask limits */
		if ((lhc & HAMMER2_DIRHASH_VISIBLE) == 0)
			error = ENOSPC;
		if ((lhc & HAMMER2_DIRHASH_LOMASK) == HAMMER2_DIRHASH_LOMASK)
			error = ENOSPC;
		hammer2_cluster_unlock(cluster);
		hammer2_cluster_drop(cluster);
		cluster = NULL;
		++lhc;
	}

	if (error == 0) {
		error = hammer2_cluster_create(trans, cparent, &cluster,
					     lhc, 0,
					     HAMMER2_BREF_TYPE_INODE,
					     HAMMER2_INODE_BYTES,
					     flags);
	}
#if INODE_DEBUG
	kprintf("CREATE INODE %*.*s chain=%p\n",
		(int)name_len, (int)name_len, name,
		(cluster ? cluster->focus : NULL));
#endif

	/*
	 * Cleanup and handle retries.  EAGAIN means the cluster creation
	 * must be retried after waiting for the parent cluster to settle.
	 */
	if (error == EAGAIN) {
		hammer2_cluster_ref(cparent);
		hammer2_inode_unlock(dip, cparent);
		hammer2_cluster_wait(cparent);
		hammer2_cluster_drop(cparent);
		goto retry;
	}
	hammer2_inode_unlock(dip, cparent);
	cparent = NULL;

	if (error) {
		KKASSERT(cluster == NULL);
		*errorp = error;
		return (NULL);
	}

	/*
	 * Set up the new inode.
	 *
	 * NOTE: *_get() integrates chain's lock into the inode lock.
	 *
	 * NOTE: Only one new inode can currently be created per
	 *	 transaction.  If the need arises we can adjust
	 *	 hammer2_trans_init() to allow more.
	 *
	 * NOTE: nipdata will have chain's blockset data.
	 */
	KKASSERT(cluster->focus->flags & HAMMER2_CHAIN_MODIFIED);
	nipdata = &hammer2_cluster_wdata(cluster)->ipdata;
	nipdata->inum = trans->inode_tid;
	hammer2_cluster_modsync(cluster);
	nip = hammer2_inode_get(dip->pmp, dip, cluster);
	/* re-resolve nipdata; *_get() may have repointed the cluster */
	nipdata = &hammer2_cluster_wdata(cluster)->ipdata;

	if (vap) {
		KKASSERT(trans->inodes_created == 0);
		nipdata->type = hammer2_get_obj_type(vap->va_type);
		nipdata->inum = trans->inode_tid;
		++trans->inodes_created;

		switch (nipdata->type) {
		case HAMMER2_OBJTYPE_CDEV:
		case HAMMER2_OBJTYPE_BDEV:
			nipdata->rmajor = vap->va_rmajor;
			nipdata->rminor = vap->va_rminor;
			break;
		default:
			break;
		}
	} else {
		/* no vattr: defaults used when creating PFSs (see header) */
		nipdata->type = HAMMER2_OBJTYPE_DIRECTORY;
		nipdata->inum = 1;
	}

	/* Inherit parent's inode compression mode. */
	nip->comp_heuristic = 0;
	nipdata->comp_algo = dip_comp_algo;
	nipdata->check_algo = dip_check_algo;
	nipdata->version = HAMMER2_INODE_VERSION_ONE;
	hammer2_update_time(&nipdata->ctime);
	nipdata->mtime = nipdata->ctime;
	if (vap)
		nipdata->mode = vap->va_mode;
	nipdata->nlinks = 1;
	if (vap) {
		/*
		 * Resolve ownership from the parent directory and creds;
		 * explicit uuids/ids in the vattr take precedence.
		 */
		if (dip && dip->pmp) {
			xuid = hammer2_to_unix_xid(&dip_uid);
			xuid = vop_helper_create_uid(dip->pmp->mp,
						     dip_mode,
						     xuid,
						     cred,
						     &vap->va_mode);
		} else {
			/* super-root has no dip and/or pmp */
			xuid = 0;
		}
		if (vap->va_vaflags & VA_UID_UUID_VALID)
			nipdata->uid = vap->va_uid_uuid;
		else if (vap->va_uid != (uid_t)VNOVAL)
			hammer2_guid_to_uuid(&nipdata->uid, vap->va_uid);
		else
			hammer2_guid_to_uuid(&nipdata->uid, xuid);

		if (vap->va_vaflags & VA_GID_UUID_VALID)
			nipdata->gid = vap->va_gid_uuid;
		else if (vap->va_gid != (gid_t)VNOVAL)
			hammer2_guid_to_uuid(&nipdata->gid, vap->va_gid);
		else if (dip)
			nipdata->gid = dip_gid;
	}

	/*
	 * Regular files and softlinks allow a small amount of data to be
	 * directly embedded in the inode.  This flag will be cleared if
	 * the size is extended past the embedded limit.
	 */
	if (nipdata->type == HAMMER2_OBJTYPE_REGFILE ||
	    nipdata->type == HAMMER2_OBJTYPE_SOFTLINK) {
		nipdata->op_flags |= HAMMER2_OPFLAG_DIRECTDATA;
	}

	KKASSERT(name_len < HAMMER2_INODE_MAXNAME);
	bcopy(name, nipdata->filename, name_len);
	nipdata->name_key = lhc;
	nipdata->name_len = name_len;
	hammer2_cluster_modsync(cluster);
	*clusterp = cluster;	/* locked cluster returned to caller */

	return (nip);
}
786 
787 /*
788  * The cluster has been removed from the original directory and replaced
789  * with a hardlink pointer.  Move the cluster to the specified parent
790  * directory, change the filename to "0xINODENUMBER", and adjust the key.
791  * The cluster becomes our invisible hardlink target.
792  *
793  * The original cluster must be deleted on entry.
794  */
static
void
hammer2_hardlink_shiftup(hammer2_trans_t *trans, hammer2_cluster_t *cluster,
			hammer2_inode_t *dip, hammer2_cluster_t *dcluster,
			int nlinks, int *errorp)
{
	const hammer2_inode_data_t *iptmp;
	hammer2_inode_data_t *nipdata;
	hammer2_cluster_t *xcluster;
	hammer2_key_t key_dummy;
	hammer2_key_t lhc;
	hammer2_blockref_t bref;

	/* the hidden target is keyed by its inode number, not a dirhash */
	iptmp = &hammer2_cluster_rdata(cluster)->ipdata;
	lhc = iptmp->inum;
	KKASSERT((lhc & HAMMER2_DIRHASH_VISIBLE) == 0);

	/*
	 * Locate the inode or indirect block to create the new
	 * entry in.  lhc represents the inode number so there is
	 * no collision iteration.
	 *
	 * There should be no key collisions with invisible inode keys.
	 *
	 * WARNING! Must use inode_lock_ex() on dip to handle a stale
	 *	    dip->cluster cache.
	 */
	*errorp = 0;
	xcluster = hammer2_cluster_lookup(dcluster, &key_dummy,
				      lhc, lhc, 0);
	if (xcluster) {
		/* unexpected collision on an invisible inode key */
		kprintf("X3 chain %p dip %p dchain %p dip->chain %p\n",
			xcluster->focus, dip, dcluster->focus,
			dip->cluster.focus);
		hammer2_cluster_unlock(xcluster);
		hammer2_cluster_drop(xcluster);
		xcluster = NULL;
		*errorp = ENOSPC;
#if 0
		Debugger("X3");
#endif
	}

	/*
	 * Handle the error case.
	 *
	 * NOTE(review): the panic() makes the ENOSPC set above fatal, so
	 *	 this path never returns to the caller in practice —
	 *	 presumably a should-not-happen debugging trap; confirm
	 *	 before relying on the error return.
	 */
	if (*errorp) {
		panic("error2");
		KKASSERT(xcluster == NULL);
		return;
	}

	/*
	 * Use xcluster as a placeholder for (lhc).  Duplicate cluster to the
	 * same target bref as xcluster and then delete xcluster.  The
	 * duplication occurs after xcluster in flush order even though
	 * xcluster is deleted after the duplication. XXX
	 *
	 * WARNING! Duplications (to a different parent) can cause indirect
	 *	    blocks to be inserted, refactor xcluster.
	 *
	 * WARNING! Only key and keybits is extracted from a passed-in bref.
	 */
	hammer2_cluster_bref(cluster, &bref);
	bref.key = lhc;			/* invisible dir entry key */
	bref.keybits = 0;
	hammer2_cluster_rename(trans, &bref, dcluster, cluster, 0);

	/*
	 * cluster is now 'live' again.. adjust the filename.
	 *
	 * Directory entries are inodes but this is a hidden hardlink
	 * target.  The name isn't used but to ease debugging give it
	 * a name after its inode number.
	 */
	hammer2_cluster_modify(trans, cluster, 0);
	nipdata = &hammer2_cluster_wdata(cluster)->ipdata;
	ksnprintf(nipdata->filename, sizeof(nipdata->filename),
		  "0x%016jx", (intmax_t)nipdata->inum);
	nipdata->name_len = strlen(nipdata->filename);
	nipdata->name_key = lhc;
	nipdata->nlinks += nlinks;	/* account the new hardlink(s) */
	hammer2_cluster_modsync(cluster);
}
879 
880 /*
881  * Connect the target inode represented by (cluster) to the media topology
882  * at (dip, name, len).  The caller can pass a rough *chainp, this function
883  * will issue lookup()s to position the parent chain properly for the
884  * chain insertion.
885  *
886  * If hlink is TRUE this function creates an OBJTYPE_HARDLINK directory
887  * entry instead of connecting (cluster).
888  *
889  * If hlink is FALSE this function expects (cluster) to be unparented.
890  */
891 int
892 hammer2_inode_connect(hammer2_trans_t *trans,
893 		      hammer2_cluster_t **clusterp, int hlink,
894 		      hammer2_inode_t *dip, hammer2_cluster_t *dcluster,
895 		      const uint8_t *name, size_t name_len,
896 		      hammer2_key_t lhc)
897 {
898 	hammer2_inode_data_t *wipdata;
899 	hammer2_cluster_t *ocluster;
900 	hammer2_cluster_t *ncluster;
901 	hammer2_key_t key_dummy;
902 	int error;
903 
904 	/*
905 	 * Since ocluster is either disconnected from the topology or
906 	 * represents a hardlink terminus which is always a parent of or
907 	 * equal to dip, we should be able to safely lock dip->chain for
908 	 * our setup.
909 	 *
910 	 * WARNING! Must use inode_lock_ex() on dip to handle a stale
911 	 *	    dip->cluster.
912 	 *
913 	 * If name is non-NULL we calculate lhc, else we use the passed-in
914 	 * lhc.
915 	 */
916 	ocluster = *clusterp;
917 
918 	if (name) {
919 		lhc = hammer2_dirhash(name, name_len);
920 
921 		/*
922 		 * Locate the inode or indirect block to create the new
923 		 * entry in.  At the same time check for key collisions
924 		 * and iterate until we don't get one.
925 		 */
926 		error = 0;
927 		while (error == 0) {
928 			ncluster = hammer2_cluster_lookup(dcluster, &key_dummy,
929 						      lhc, lhc, 0);
930 			if (ncluster == NULL)
931 				break;
932 			if ((lhc & HAMMER2_DIRHASH_LOMASK) ==
933 			    HAMMER2_DIRHASH_LOMASK) {
934 				error = ENOSPC;
935 			}
936 			hammer2_cluster_unlock(ncluster);
937 			hammer2_cluster_drop(ncluster);
938 			ncluster = NULL;
939 			++lhc;
940 		}
941 	} else {
942 		/*
943 		 * Reconnect to specific key (used when moving
944 		 * unlinked-but-open files into the hidden directory).
945 		 */
946 		ncluster = hammer2_cluster_lookup(dcluster, &key_dummy,
947 						  lhc, lhc, 0);
948 		KKASSERT(ncluster == NULL);
949 	}
950 
951 	if (error == 0) {
952 		if (hlink) {
953 			/*
954 			 * Hardlink pointer needed, create totally fresh
955 			 * directory entry.
956 			 *
957 			 * We must refactor ocluster because it might have
958 			 * been shifted into an indirect cluster by the
959 			 * create.
960 			 */
961 			KKASSERT(ncluster == NULL);
962 			error = hammer2_cluster_create(trans,
963 						       dcluster, &ncluster,
964 						       lhc, 0,
965 						       HAMMER2_BREF_TYPE_INODE,
966 						       HAMMER2_INODE_BYTES,
967 						       0);
968 		} else {
969 			/*
970 			 * Reconnect the original cluster under the new name.
971 			 * Original cluster must have already been deleted by
972 			 * the caller.
973 			 *
974 			 * WARNING! Can cause held-over clusters to require a
975 			 *	    refactor.  Fortunately we have none (our
976 			 *	    locked clusters are passed into and
977 			 *	    modified by the call).
978 			 */
979 			ncluster = ocluster;
980 			ocluster = NULL;
981 			error = hammer2_cluster_create(trans,
982 						       dcluster, &ncluster,
983 						       lhc, 0,
984 						       HAMMER2_BREF_TYPE_INODE,
985 						       HAMMER2_INODE_BYTES,
986 						       0);
987 		}
988 	}
989 
990 	/*
991 	 * Unlock stuff.
992 	 */
993 	KKASSERT(error != EAGAIN);
994 
995 	/*
996 	 * ncluster should be NULL on error, leave ocluster
997 	 * (ocluster == *clusterp) alone.
998 	 */
999 	if (error) {
1000 		KKASSERT(ncluster == NULL);
1001 		return (error);
1002 	}
1003 
1004 	/*
1005 	 * Directory entries are inodes so if the name has changed we have
1006 	 * to update the inode.
1007 	 *
1008 	 * When creating an OBJTYPE_HARDLINK entry remember to unlock the
1009 	 * cluster, the caller will access the hardlink via the actual hardlink
1010 	 * target file and not the hardlink pointer entry, so we must still
1011 	 * return ocluster.
1012 	 */
1013 	if (hlink && hammer2_hardlink_enable >= 0) {
1014 		/*
1015 		 * Create the HARDLINK pointer.  oip represents the hardlink
1016 		 * target in this situation.
1017 		 *
1018 		 * We will return ocluster (the hardlink target).
1019 		 */
1020 		hammer2_cluster_modify(trans, ncluster, 0);
1021 		hammer2_cluster_clr_chainflags(ncluster,
1022 					       HAMMER2_CHAIN_UNLINKED);
1023 		KKASSERT(name_len < HAMMER2_INODE_MAXNAME);
1024 		wipdata = &hammer2_cluster_wdata(ncluster)->ipdata;
1025 		bcopy(name, wipdata->filename, name_len);
1026 		wipdata->name_key = lhc;
1027 		wipdata->name_len = name_len;
1028 		wipdata->target_type =
1029 				hammer2_cluster_rdata(ocluster)->ipdata.type;
1030 		wipdata->type = HAMMER2_OBJTYPE_HARDLINK;
1031 		wipdata->inum = hammer2_cluster_rdata(ocluster)->ipdata.inum;
1032 		wipdata->version = HAMMER2_INODE_VERSION_ONE;
1033 		wipdata->nlinks = 1;
1034 		wipdata->op_flags = HAMMER2_OPFLAG_DIRECTDATA;
1035 		hammer2_cluster_modsync(ncluster);
1036 		hammer2_cluster_unlock(ncluster);
1037 		hammer2_cluster_drop(ncluster);
1038 		ncluster = ocluster;
1039 		ocluster = NULL;
1040 	} else {
1041 		/*
1042 		 * ncluster is a duplicate of ocluster at the new location.
1043 		 * We must fixup the name stored in the inode data.
1044 		 * The bref key has already been adjusted by inode_connect().
1045 		 */
1046 		hammer2_cluster_modify(trans, ncluster, 0);
1047 		hammer2_cluster_clr_chainflags(ncluster,
1048 					       HAMMER2_CHAIN_UNLINKED);
1049 		wipdata = &hammer2_cluster_wdata(ncluster)->ipdata;
1050 
1051 		KKASSERT(name_len < HAMMER2_INODE_MAXNAME);
1052 		bcopy(name, wipdata->filename, name_len);
1053 		wipdata->name_key = lhc;
1054 		wipdata->name_len = name_len;
1055 		wipdata->nlinks = 1;
1056 		hammer2_cluster_modsync(ncluster);
1057 	}
1058 
1059 	/*
1060 	 * We are replacing ocluster with ncluster, unlock ocluster.  In the
1061 	 * case where ocluster is left unchanged the code above sets
1062 	 * ncluster to ocluster and ocluster to NULL, resulting in a NOP here.
1063 	 */
1064 	if (ocluster) {
1065 		hammer2_cluster_unlock(ocluster);
1066 		hammer2_cluster_drop(ocluster);
1067 	}
1068 	*clusterp = ncluster;
1069 
1070 	return (0);
1071 }
1072 
1073 /*
1074  * Repoint ip->cluster's chains to cluster's chains and fixup the default
1075  * focus.
1076  *
1077  * Caller must hold the inode and cluster exclusive locked, if not NULL,
1078  * must also be locked.
1079  *
1080  * Cluster may be NULL to clean out any chains in ip->cluster.
1081  */
void
hammer2_inode_repoint(hammer2_inode_t *ip, hammer2_inode_t *pip,
		      hammer2_cluster_t *cluster)
{
	hammer2_chain_t *ochain;	/* chain being replaced */
	hammer2_chain_t *nchain;	/* replacement chain */
	hammer2_inode_t *opip;		/* previous parent inode, if any */
	int i;

	/*
	 * Replace chains in ip->cluster with chains from cluster and
	 * adjust the focus if necessary.
	 *
	 * NOTE: nchain and/or ochain can be NULL due to gaps
	 *	 in the cluster arrays.
	 */
	for (i = 0; cluster && i < cluster->nchains; ++i) {
		nchain = cluster->array[i].chain;
		if (i < ip->cluster.nchains) {
			ochain = ip->cluster.array[i].chain;
			if (ochain == nchain)
				continue;	/* slot already up to date */
		} else {
			ochain = NULL;
		}

		/*
		 * Make adjustments.  The new chain is referenced before
		 * the old chain is dropped.
		 */
		ip->cluster.array[i].chain = nchain;
		if (nchain)
			hammer2_chain_ref(nchain);
		if (ochain)
			hammer2_chain_drop(ochain);
	}

	/*
	 * Release any left-over chains in ip->cluster (slots beyond the
	 * new cluster's nchains).
	 */
	while (i < ip->cluster.nchains) {
		nchain = ip->cluster.array[i].chain;
		if (nchain) {
			ip->cluster.array[i].chain = NULL;
			hammer2_chain_drop(nchain);
		}
		++i;
	}

	/*
	 * Fixup fields.  Note that the inode-embedded cluster is never
	 * directly locked.
	 */
	if (cluster) {
		ip->cluster.nchains = cluster->nchains;
		ip->cluster.focus = cluster->focus;
		ip->cluster.flags = cluster->flags & ~HAMMER2_CLUSTER_LOCKED;
	} else {
		/* NULL cluster: clean out all state in ip->cluster */
		ip->cluster.nchains = 0;
		ip->cluster.focus = NULL;
		ip->cluster.flags &= ~HAMMER2_CLUSTER_ZFLAGS;
	}

	/*
	 * Repoint ip->pip if requested (non-NULL pip).  Ref the new
	 * parent before dropping the old one.
	 */
	if (pip && ip->pip != pip) {
		opip = ip->pip;
		hammer2_inode_ref(pip);
		ip->pip = pip;
		if (opip)
			hammer2_inode_drop(opip);
	}
}
1155 
1156 /*
1157  * Unlink the file from the specified directory inode.  The directory inode
1158  * does not need to be locked.
1159  *
1160  * isdir determines whether a directory/non-directory check should be made.
1161  * No check is made if isdir is set to -1.
1162  *
1163  * isopen specifies whether special unlink-with-open-descriptor handling
1164  * must be performed.  If set to -1 the caller is deleting a PFS and we
1165  * check whether the chain is mounted or not (chain->pmp != NULL).  1 is
1166  * implied if it is mounted.
1167  *
1168  * If isopen is 1 and nlinks drops to 0 this function must move the chain
1169  * to a special hidden directory until last-close occurs on the file.
1170  *
1171  * NOTE!  The underlying file can still be active with open descriptors
1172  *	  or if the chain is being manually held (e.g. for rename).
1173  *
1174  *	  The caller is responsible for fixing up ip->chain if e.g. a
1175  *	  rename occurs (see chain_duplicate()).
1176  *
1177  * NOTE!  The chain is not deleted if it is moved to the hidden directory,
1178  *	  but otherwise will be deleted.
1179  */
int
hammer2_unlink_file(hammer2_trans_t *trans, hammer2_inode_t *dip,
		    const uint8_t *name, size_t name_len,
		    int isdir, int *hlinkp, struct nchandle *nch,
		    int nlinks)
{
	const hammer2_inode_data_t *ripdata;
	hammer2_inode_data_t *wipdata;
	hammer2_cluster_t *cparent;	/* locked parent of the entry */
	hammer2_cluster_t *hcluster;	/* hardlink target cluster */
	hammer2_cluster_t *hparent;	/* parent of hardlink target */
	hammer2_cluster_t *cluster;	/* matched directory entry */
	hammer2_cluster_t *dparent;	/* for empty-directory check */
	hammer2_cluster_t *dcluster;	/* for empty-directory check */
	hammer2_key_t key_dummy;
	hammer2_key_t key_next;
	hammer2_key_t lhc;
	int error;
	int hlink;			/* 1 if entry is a hardlink pointer */
	uint8_t type;

	error = 0;
	hlink = 0;
	hcluster = NULL;
	hparent = NULL;
	lhc = hammer2_dirhash(name, name_len);

again:
	/*
	 * Search for the filename in the directory.  The dirhash gives a
	 * key range; collisions are resolved by comparing the stored name.
	 */
	cparent = hammer2_inode_lock(dip, HAMMER2_RESOLVE_ALWAYS);
	cluster = hammer2_cluster_lookup(cparent, &key_next,
				     lhc, lhc + HAMMER2_DIRHASH_LOMASK, 0);
	while (cluster) {
		if (hammer2_cluster_type(cluster) == HAMMER2_BREF_TYPE_INODE) {
			ripdata = &hammer2_cluster_rdata(cluster)->ipdata;
			if (ripdata->name_len == name_len &&
			    bcmp(ripdata->filename, name, name_len) == 0) {
				break;
			}
		}
		cluster = hammer2_cluster_next(cparent, cluster, &key_next,
					       key_next,
					       lhc + HAMMER2_DIRHASH_LOMASK,
					       0);
	}
	hammer2_inode_unlock(dip, NULL);	/* retain cparent */

	/*
	 * Not found or wrong type (isdir < 0 disables the type check).
	 * If a hardlink pointer, type checks use the hardlink target.
	 */
	if (cluster == NULL) {
		error = ENOENT;
		goto done;
	}
	ripdata = &hammer2_cluster_rdata(cluster)->ipdata;
	type = ripdata->type;
	if (type == HAMMER2_OBJTYPE_HARDLINK) {
		hlink = 1;
		type = ripdata->target_type;
	}

	if (type == HAMMER2_OBJTYPE_DIRECTORY && isdir == 0) {
		error = ENOTDIR;
		goto done;
	}
	if (type != HAMMER2_OBJTYPE_DIRECTORY && isdir >= 1) {
		error = EISDIR;
		goto done;
	}

	/*
	 * Hardlink must be resolved.  We can't hold the parent locked
	 * while we do this or we could deadlock.  The physical file will
	 * be located at or above the current directory.
	 *
	 * We loop to reacquire the hardlink origination.
	 *
	 * NOTE: hammer2_hardlink_find() will locate the hardlink target,
	 *	 returning a modified hparent and hcluster.
	 */
	if (ripdata->type == HAMMER2_OBJTYPE_HARDLINK) {
		if (hcluster == NULL) {
			hcluster = cluster;
			cluster = NULL;	/* safety */
			hammer2_cluster_unlock(cparent);
			hammer2_cluster_drop(cparent);
			cparent = NULL; /* safety */
			ripdata = NULL;	/* safety (associated w/cparent) */
			error = hammer2_hardlink_find(dip, &hparent, &hcluster);

			/*
			 * If we couldn't find the hardlink target then some
			 * parent directory containing the hardlink pointer
			 * probably got renamed to above the original target,
			 * a case not yet handled by H2.
			 */
			if (error) {
				kprintf("H2 unlink_file: hardlink target for "
					"\"%s\" not found\n",
					name);
				kprintf("(likely due to known directory "
					"rename bug)\n");
				goto done;
			}
			goto again;
		}
	}

	/*
	 * If this is a directory the directory must be empty.  However, if
	 * isdir < 0 we are doing a rename and the directory does not have
	 * to be empty, and if isdir > 1 we are deleting a PFS/snapshot
	 * and the directory does not have to be empty.
	 *
	 * NOTE: We check the full key range here which covers both visible
	 *	 and invisible entries.  Theoretically there should be no
	 *	 invisible (hardlink target) entries if there are no visible
	 *	 entries.
	 */
	if (type == HAMMER2_OBJTYPE_DIRECTORY && isdir == 1) {
		dparent = hammer2_cluster_lookup_init(cluster, 0);
		dcluster = hammer2_cluster_lookup(dparent, &key_dummy,
					          0, (hammer2_key_t)-1,
					          HAMMER2_LOOKUP_NODATA);
		if (dcluster) {
			/* any entry at all means the directory is not empty */
			hammer2_cluster_unlock(dcluster);
			hammer2_cluster_drop(dcluster);
			hammer2_cluster_lookup_done(dparent);
			error = ENOTEMPTY;
			goto done;
		}
		hammer2_cluster_lookup_done(dparent);
		dparent = NULL;
		/* dcluster NULL */
	}

	/*
	 * If this was a hardlink then (cparent, cluster) is the hardlink
	 * pointer, which we can simply destroy outright.  Discard the
	 * clusters and replace with the hardlink target.
	 */
	if (hcluster) {
		hammer2_cluster_delete(trans, cparent, cluster,
				       HAMMER2_DELETE_PERMANENT);
		hammer2_cluster_unlock(cparent);
		hammer2_cluster_drop(cparent);
		hammer2_cluster_unlock(cluster);
		hammer2_cluster_drop(cluster);
		cparent = hparent;
		cluster = hcluster;
		hparent = NULL;
		hcluster = NULL;
	}

	/*
	 * This leaves us with the hardlink target or non-hardlinked file
	 * or directory in (cparent, cluster).
	 *
	 * Delete the target when nlinks reaches 0 with special handling
	 * if (isopen) is set.
	 *
	 * NOTE! In DragonFly the vnops function calls cache_unlink() after
	 *	 calling us here to clean out the namecache association,
	 *	 (which does not represent a ref for the open-test), and to
	 *	 force finalization of the vnode if/when the last ref gets
	 *	 dropped.
	 *
	 * NOTE! Files are unlinked by rename and then relinked.  nch will be
	 *	 passed as NULL in this situation.  hammer2_inode_connect()
	 *	 will bump nlinks.
	 */
	KKASSERT(cluster != NULL);
	hammer2_cluster_modify(trans, cluster, 0);
	wipdata = &hammer2_cluster_wdata(cluster)->ipdata;
	ripdata = wipdata;
	wipdata->nlinks += nlinks;	/* nlinks adjustment may be negative */
	if ((int64_t)wipdata->nlinks < 0) {	/* XXX debugging */
		wipdata->nlinks = 0;
	}
	hammer2_cluster_modsync(cluster);

	if (wipdata->nlinks == 0) {
		/*
		 * Target nlinks has reached 0, file now unlinked (but may
		 * still be open).
		 */
		/* XXX need interlock if mounted
		if ((cluster->focus->flags & HAMMER2_CHAIN_PFSROOT) &&
		    cluster->pmp) {
			error = EINVAL;
			kprintf("hammer2: PFS \"%s\" cannot be deleted "
				"while still mounted\n",
				wipdata->filename);
			goto done;
		}
		*/
		hammer2_cluster_set_chainflags(cluster, HAMMER2_CHAIN_UNLINKED);
		if (nch && cache_isopen(nch)) {
			/*
			 * Still open: retain backing store by parking the
			 * inode in the hidden directory until last-close.
			 */
			hammer2_inode_move_to_hidden(trans, &cparent, &cluster,
						     wipdata->inum);
		} else {
			/*
			 * This won't get everything if a vnode is still
			 * present, but the cache_unlink() call the caller
			 * makes will.
			 */
			hammer2_cluster_delete(trans, cparent, cluster,
					       HAMMER2_DELETE_PERMANENT);
		}
	} else if (hlink == 0) {
		/*
		 * In this situation a normal non-hardlinked file (which can
		 * only have nlinks == 1) still has a non-zero nlinks, the
		 * caller must be doing a RENAME operation and so is passing
		 * a nlinks adjustment of 0, and only wishes to remove file
		 * in order to be able to reconnect it under a different name.
		 *
		 * In this situation we do a non-permanent deletion of the
		 * chain in order to allow the file to be reconnected in
		 * a different location.
		 */
		KKASSERT(nlinks == 0);
		hammer2_cluster_delete(trans, cparent, cluster, 0);
	}
	error = 0;
done:
	/* Unlock/drop whatever is still held, in acquisition order */
	if (cparent) {
		hammer2_cluster_unlock(cparent);
		hammer2_cluster_drop(cparent);
	}
	if (cluster) {
		hammer2_cluster_unlock(cluster);
		hammer2_cluster_drop(cluster);
	}
	if (hparent) {
		hammer2_cluster_unlock(hparent);
		hammer2_cluster_drop(hparent);
	}
	if (hcluster) {
		hammer2_cluster_unlock(hcluster);
		hammer2_cluster_drop(hcluster);
	}
	if (hlinkp)
		*hlinkp = hlink;	/* report hardlink-ness to caller */

	return error;
}
1430 
1431 /*
1432  * This is called from the mount code to initialize pmp->ihidden
1433  */
1434 void
1435 hammer2_inode_install_hidden(hammer2_pfs_t *pmp)
1436 {
1437 	hammer2_trans_t trans;
1438 	hammer2_cluster_t *cparent;
1439 	hammer2_cluster_t *cluster;
1440 	hammer2_cluster_t *scan;
1441 	const hammer2_inode_data_t *ripdata;
1442 	hammer2_inode_data_t *wipdata;
1443 	hammer2_key_t key_dummy;
1444 	hammer2_key_t key_next;
1445 	int error;
1446 	int count;
1447 	int dip_check_algo;
1448 	int dip_comp_algo;
1449 
1450 	if (pmp->ihidden)
1451 		return;
1452 
1453 	/*
1454 	 * Find the hidden directory
1455 	 */
1456 	bzero(&key_dummy, sizeof(key_dummy));
1457 	hammer2_trans_init(&trans, pmp, 0);
1458 
1459 	/*
1460 	 * Setup for lookup, retrieve iroot's check and compression
1461 	 * algorithm request which was likely generated by newfs_hammer2.
1462 	 *
1463 	 * The check/comp fields will probably never be used since inodes
1464 	 * are renamed into the hidden directory and not created relative to
1465 	 * the hidden directory, chain creation inherits from bref.methods,
1466 	 * and data chains inherit from their respective file inode *_algo
1467 	 * fields.
1468 	 */
1469 	cparent = hammer2_inode_lock(pmp->iroot, HAMMER2_RESOLVE_ALWAYS);
1470 	ripdata = &hammer2_cluster_rdata(cparent)->ipdata;
1471 	dip_check_algo = ripdata->check_algo;
1472 	dip_comp_algo = ripdata->comp_algo;
1473 	ripdata = NULL;
1474 
1475 	cluster = hammer2_cluster_lookup(cparent, &key_dummy,
1476 					 HAMMER2_INODE_HIDDENDIR,
1477 					 HAMMER2_INODE_HIDDENDIR,
1478 					 0);
1479 	if (cluster) {
1480 		pmp->ihidden = hammer2_inode_get(pmp, pmp->iroot, cluster);
1481 		hammer2_inode_ref(pmp->ihidden);
1482 
1483 		/*
1484 		 * Remove any unlinked files which were left open as-of
1485 		 * any system crash.
1486 		 *
1487 		 * Don't pass NODATA, we need the inode data so the delete
1488 		 * can do proper statistics updates.
1489 		 */
1490 		count = 0;
1491 		scan = hammer2_cluster_lookup(cluster, &key_next,
1492 					      0, HAMMER2_TID_MAX, 0);
1493 		while (scan) {
1494 			if (hammer2_cluster_type(scan) ==
1495 			    HAMMER2_BREF_TYPE_INODE) {
1496 				hammer2_cluster_delete(&trans, cluster, scan,
1497 						   HAMMER2_DELETE_PERMANENT);
1498 				++count;
1499 			}
1500 			scan = hammer2_cluster_next(cluster, scan, &key_next,
1501 						    0, HAMMER2_TID_MAX, 0);
1502 		}
1503 
1504 		hammer2_inode_unlock(pmp->ihidden, cluster);
1505 		hammer2_inode_unlock(pmp->iroot, cparent);
1506 		hammer2_trans_done(&trans);
1507 		kprintf("hammer2: PFS loaded hidden dir, "
1508 			"removed %d dead entries\n", count);
1509 		return;
1510 	}
1511 
1512 	/*
1513 	 * Create the hidden directory
1514 	 */
1515 	error = hammer2_cluster_create(&trans, cparent, &cluster,
1516 				       HAMMER2_INODE_HIDDENDIR, 0,
1517 				       HAMMER2_BREF_TYPE_INODE,
1518 				       HAMMER2_INODE_BYTES,
1519 				       0);
1520 	hammer2_inode_unlock(pmp->iroot, cparent);
1521 
1522 	hammer2_cluster_modify(&trans, cluster, 0);
1523 	wipdata = &hammer2_cluster_wdata(cluster)->ipdata;
1524 	wipdata->type = HAMMER2_OBJTYPE_DIRECTORY;
1525 	wipdata->inum = HAMMER2_INODE_HIDDENDIR;
1526 	wipdata->nlinks = 1;
1527 	wipdata->comp_algo = dip_comp_algo;
1528 	wipdata->check_algo = dip_check_algo;
1529 	hammer2_cluster_modsync(cluster);
1530 	kprintf("hammer2: PFS root missing hidden directory, creating\n");
1531 
1532 	pmp->ihidden = hammer2_inode_get(pmp, pmp->iroot, cluster);
1533 	hammer2_inode_ref(pmp->ihidden);
1534 	hammer2_inode_unlock(pmp->ihidden, cluster);
1535 	hammer2_trans_done(&trans);
1536 }
1537 
1538 /*
1539  * If an open file is unlinked H2 needs to retain the file in the topology
1540  * to ensure that its backing store is not recovered by the bulk free scan.
1541  * This also allows us to avoid having to special-case the CHAIN_DELETED flag.
1542  *
1543  * To do this the file is moved to a hidden directory in the PFS root and
1544  * renamed.  The hidden directory must be created if it does not exist.
1545  */
1546 static
1547 void
1548 hammer2_inode_move_to_hidden(hammer2_trans_t *trans,
1549 			     hammer2_cluster_t **cparentp,
1550 			     hammer2_cluster_t **clusterp,
1551 			     hammer2_tid_t inum)
1552 {
1553 	hammer2_cluster_t *dcluster;
1554 	hammer2_pfs_t *pmp;
1555 	int error;
1556 
1557 	pmp = (*clusterp)->pmp;
1558 	KKASSERT(pmp != NULL);
1559 	KKASSERT(pmp->ihidden != NULL);
1560 
1561 	hammer2_cluster_delete(trans, *cparentp, *clusterp, 0);
1562 	dcluster = hammer2_inode_lock(pmp->ihidden, HAMMER2_RESOLVE_ALWAYS);
1563 	error = hammer2_inode_connect(trans, clusterp, 0,
1564 				      pmp->ihidden, dcluster,
1565 				      NULL, 0, inum);
1566 	hammer2_inode_unlock(pmp->ihidden, dcluster);
1567 	KKASSERT(error == 0);
1568 }
1569 
1570 /*
1571  * Given an exclusively locked inode and cluster we consolidate the cluster
1572  * for hardlink creation, adding (nlinks) to the file's link count and
1573  * potentially relocating the inode to (cdip) which is a parent directory
1574  * common to both the current location of the inode and the intended new
1575  * hardlink.
1576  *
1577  * Replaces (*clusterp) if consolidation occurred, unlocking the old cluster
1578  * and returning a new locked cluster.
1579  *
1580  * NOTE!  This function will also replace ip->cluster.
1581  */
1582 int
1583 hammer2_hardlink_consolidate(hammer2_trans_t *trans,
1584 			     hammer2_inode_t *ip,
1585 			     hammer2_cluster_t **clusterp,
1586 			     hammer2_inode_t *cdip,
1587 			     hammer2_cluster_t *cdcluster,
1588 			     int nlinks)
1589 {
1590 	const hammer2_inode_data_t *ripdata;
1591 	hammer2_inode_data_t *wipdata;
1592 	hammer2_cluster_t *cluster;
1593 	hammer2_cluster_t *cparent;
1594 	int error;
1595 
1596 	cluster = *clusterp;
1597 	ripdata = &hammer2_cluster_rdata(cluster)->ipdata;
1598 	if (nlinks == 0 &&			/* no hardlink needed */
1599 	    (ripdata->name_key & HAMMER2_DIRHASH_VISIBLE)) {
1600 		return (0);
1601 	}
1602 
1603 	if (hammer2_hardlink_enable == 0) {	/* disallow hardlinks */
1604 		hammer2_cluster_unlock(cluster);
1605 		hammer2_cluster_drop(cluster);
1606 		*clusterp = NULL;
1607 		return (ENOTSUP);
1608 	}
1609 
1610 	cparent = NULL;
1611 
1612 	/*
1613 	 * If no change in the hardlink's target directory is required and
1614 	 * this is already a hardlink target, all we need to do is adjust
1615 	 * the link count.
1616 	 */
1617 	ripdata = &hammer2_cluster_rdata(cluster)->ipdata;
1618 	if (cdip == ip->pip &&
1619 	    (ripdata->name_key & HAMMER2_DIRHASH_VISIBLE) == 0) {
1620 		if (nlinks) {
1621 			hammer2_cluster_modify(trans, cluster, 0);
1622 			wipdata = &hammer2_cluster_wdata(cluster)->ipdata;
1623 			wipdata->nlinks += nlinks;
1624 			hammer2_cluster_modsync(cluster);
1625 			ripdata = wipdata;
1626 		}
1627 		error = 0;
1628 		goto done;
1629 	}
1630 
1631 	/*
1632 	 * Cluster is the real inode.  The originating directory is locked
1633 	 * by the caller so we can manipulate it without worrying about races
1634 	 * against other lookups.
1635 	 *
1636 	 * If cluster is visible we need to delete it from the current
1637 	 * location and create a hardlink pointer in its place.  If it is
1638 	 * not visible we need only delete it.  Then later cluster will be
1639 	 * renamed to a parent directory and converted (if necessary) to
1640 	 * a hidden inode (via shiftup).
1641 	 *
1642 	 * NOTE! We must hold cparent locked through the delete/create/rename
1643 	 *	 operation to ensure that other threads block resolving to
1644 	 *	 the same hardlink, otherwise the other threads may not see
1645 	 *	 the hardlink.
1646 	 */
1647 	KKASSERT((cluster->focus->flags & HAMMER2_CHAIN_DELETED) == 0);
1648 	cparent = hammer2_cluster_parent(cluster);
1649 
1650 	hammer2_cluster_delete(trans, cparent, cluster, 0);
1651 
1652 	ripdata = &hammer2_cluster_rdata(cluster)->ipdata;
1653 	KKASSERT(ripdata->type != HAMMER2_OBJTYPE_HARDLINK);
1654 	if (ripdata->name_key & HAMMER2_DIRHASH_VISIBLE) {
1655 		hammer2_cluster_t *ncluster;
1656 		hammer2_key_t lhc;
1657 
1658 		ncluster = NULL;
1659 		lhc = cluster->focus->bref.key;
1660 		error = hammer2_cluster_create(trans, cparent, &ncluster,
1661 					     lhc, 0,
1662 					     HAMMER2_BREF_TYPE_INODE,
1663 					     HAMMER2_INODE_BYTES,
1664 					     0);
1665 		hammer2_cluster_modify(trans, ncluster, 0);
1666 		wipdata = &hammer2_cluster_wdata(ncluster)->ipdata;
1667 
1668 		/* wipdata->comp_algo = ripdata->comp_algo; */
1669 		wipdata->comp_algo = 0;
1670 		wipdata->check_algo = 0;
1671 		wipdata->version = HAMMER2_INODE_VERSION_ONE;
1672 		wipdata->inum = ripdata->inum;
1673 		wipdata->target_type = ripdata->type;
1674 		wipdata->type = HAMMER2_OBJTYPE_HARDLINK;
1675 		wipdata->uflags = 0;
1676 		wipdata->rmajor = 0;
1677 		wipdata->rminor = 0;
1678 		wipdata->ctime = 0;
1679 		wipdata->mtime = 0;
1680 		wipdata->atime = 0;
1681 		wipdata->btime = 0;
1682 		bzero(&wipdata->uid, sizeof(wipdata->uid));
1683 		bzero(&wipdata->gid, sizeof(wipdata->gid));
1684 		wipdata->op_flags = HAMMER2_OPFLAG_DIRECTDATA;
1685 		wipdata->cap_flags = 0;
1686 		wipdata->mode = 0;
1687 		wipdata->size = 0;
1688 		wipdata->nlinks = 1;
1689 		wipdata->iparent = 0;	/* XXX */
1690 		wipdata->pfs_type = 0;
1691 		wipdata->pfs_inum = 0;
1692 		bzero(&wipdata->pfs_clid, sizeof(wipdata->pfs_clid));
1693 		bzero(&wipdata->pfs_fsid, sizeof(wipdata->pfs_fsid));
1694 		wipdata->data_quota = 0;
1695 		wipdata->data_count = 0;
1696 		wipdata->inode_quota = 0;
1697 		wipdata->inode_count = 0;
1698 		wipdata->attr_tid = 0;
1699 		wipdata->dirent_tid = 0;
1700 		bzero(&wipdata->u, sizeof(wipdata->u));
1701 		bcopy(ripdata->filename, wipdata->filename, ripdata->name_len);
1702 		wipdata->name_key = ncluster->focus->bref.key;
1703 		wipdata->name_len = ripdata->name_len;
1704 		/* XXX transaction ids */
1705 		hammer2_cluster_modsync(ncluster);
1706 		hammer2_cluster_unlock(ncluster);
1707 		hammer2_cluster_drop(ncluster);
1708 	}
1709 	ripdata = wipdata;
1710 
1711 	/*
1712 	 * cluster represents the hardlink target and is now flagged deleted.
1713 	 * duplicate it to the parent directory and adjust nlinks.
1714 	 *
1715 	 * WARNING! The shiftup() call can cause ncluster to be moved into
1716 	 *	    an indirect block, and our ncluster will wind up pointing
1717 	 *	    to the older/original version.
1718 	 */
1719 	KKASSERT(cluster->focus->flags & HAMMER2_CHAIN_DELETED);
1720 	hammer2_hardlink_shiftup(trans, cluster, cdip, cdcluster,
1721 				 nlinks, &error);
1722 
1723 	if (error == 0)
1724 		hammer2_inode_repoint(ip, cdip, cluster);
1725 
1726 done:
1727 	/*
1728 	 * Cleanup, cluster/ncluster already dealt with.
1729 	 *
1730 	 * Return the shifted cluster in *clusterp.
1731 	 */
1732 	if (cparent) {
1733 		hammer2_cluster_unlock(cparent);
1734 		hammer2_cluster_drop(cparent);
1735 	}
1736 	*clusterp = cluster;
1737 
1738 	return (error);
1739 }
1740 
1741 /*
1742  * If (*ochainp) is non-NULL it points to the forward OBJTYPE_HARDLINK
1743  * inode while (*chainp) points to the resolved (hidden hardlink
1744  * target) inode.  In this situation when nlinks is 1 we wish to
1745  * deconsolidate the hardlink, moving it back to the directory that now
1746  * represents the only remaining link.
1747  */
1748 int
1749 hammer2_hardlink_deconsolidate(hammer2_trans_t *trans,
1750 			       hammer2_inode_t *dip,
1751 			       hammer2_chain_t **chainp,
1752 			       hammer2_chain_t **ochainp)
1753 {
1754 	if (*ochainp == NULL)
1755 		return (0);
1756 	/* XXX */
1757 	return (0);
1758 }
1759 
1760 /*
1761  * The caller presents a locked cluster with an obj_type of
1762  * HAMMER2_OBJTYPE_HARDLINK in (*clusterp).  This routine will locate
1763  * the inode and replace (*clusterp) with a new locked cluster containing
1764  * the target hardlink, also locked.  The original cluster will be
1765  * unlocked and released.
1766  *
1767  * If cparentp is not NULL a locked cluster representing the hardlink's
1768  * parent is also returned.
1769  *
1770  * If we are unable to locate the hardlink target EIO is returned,
1771  * (*cparentp) is set to NULL, the original passed-in (*clusterp)
1772  * will be unlocked and released and (*clusterp) will be set to NULL
1773  * as well.
1774  */
int
hammer2_hardlink_find(hammer2_inode_t *dip,
		      hammer2_cluster_t **cparentp,
		      hammer2_cluster_t **clusterp)
{
	const hammer2_inode_data_t *ipdata;
	hammer2_cluster_t *cluster;	/* incoming hardlink pointer */
	hammer2_cluster_t *cparent;	/* locked parent during walk */
	hammer2_cluster_t *rcluster;	/* resolved hardlink target */
	hammer2_inode_t *ip;
	hammer2_inode_t *pip;
	hammer2_key_t key_dummy;
	hammer2_key_t lhc;		/* target key == target inum */

	cluster = *clusterp;
	pip = dip;
	hammer2_inode_ref(pip);		/* for loop */

	/*
	 * Locate the hardlink.  pip is referenced and not locked.
	 * Unlock and release (*clusterp) after extracting the needed
	 * data.
	 */
	ipdata = &hammer2_cluster_rdata(cluster)->ipdata;
	lhc = ipdata->inum;
	ipdata = NULL;			/* safety */
	hammer2_cluster_unlock(cluster);
	hammer2_cluster_drop(cluster);
	*clusterp = NULL;		/* safety */

	rcluster = NULL;
	cparent = NULL;

	/*
	 * Walk up the directory topology from dip, looking up the hidden
	 * hardlink target (keyed by lhc) in each ancestor.  Each iteration
	 * transfers the loop ref from pip to ip and locks ip.
	 */
	while ((ip = pip) != NULL) {
		cparent = hammer2_inode_lock(ip, HAMMER2_RESOLVE_ALWAYS);
		hammer2_inode_drop(ip);			/* loop */
		KKASSERT(hammer2_cluster_type(cparent) ==
			 HAMMER2_BREF_TYPE_INODE);
		rcluster = hammer2_cluster_lookup(cparent, &key_dummy,
					     lhc, lhc, 0);
		if (rcluster)
			break;			/* found the target */
		hammer2_cluster_lookup_done(cparent);	/* discard parent */
		cparent = NULL;				/* safety */
		pip = ip->pip;		/* safe, ip held locked */
		if (pip)
			hammer2_inode_ref(pip);		/* loop */
		hammer2_inode_unlock(ip, NULL);
	}

	/*
	 * chain is locked, ip is locked.  Unlock ip, return the locked
	 * chain.  *ipp is already set w/a ref count and not locked.
	 *
	 * (cparent is already unlocked).
	 */
	*clusterp = rcluster;
	if (rcluster) {
		if (cparentp) {
			/* hand the locked parent back to the caller */
			*cparentp = cparent;
			hammer2_inode_unlock(ip, NULL);
		} else {
			hammer2_inode_unlock(ip, cparent);
		}
		return (0);
	} else {
		/* target not found anywhere up to the PFS root */
		if (cparentp)
			*cparentp = NULL;
		if (ip)
			hammer2_inode_unlock(ip, cparent);
		return (EIO);
	}
}
1848 
1849 /*
1850  * Find the directory common to both fdip and tdip.
1851  *
1852  * Returns a held but not locked inode.  Caller typically locks the inode,
1853  * and when through unlocks AND drops it.
1854  */
1855 hammer2_inode_t *
1856 hammer2_inode_common_parent(hammer2_inode_t *fdip, hammer2_inode_t *tdip)
1857 {
1858 	hammer2_inode_t *scan1;
1859 	hammer2_inode_t *scan2;
1860 
1861 	/*
1862 	 * We used to have a depth field but it complicated matters too
1863 	 * much for directory renames.  So now its ugly.  Check for
1864 	 * simple cases before giving up and doing it the expensive way.
1865 	 *
1866 	 * XXX need a bottom-up topology stability lock
1867 	 */
1868 	if (fdip == tdip || fdip == tdip->pip) {
1869 		hammer2_inode_ref(fdip);
1870 		return(fdip);
1871 	}
1872 	if (fdip->pip == tdip) {
1873 		hammer2_inode_ref(tdip);
1874 		return(tdip);
1875 	}
1876 
1877 	/*
1878 	 * XXX not MPSAFE
1879 	 */
1880 	for (scan1 = fdip; scan1->pmp == fdip->pmp; scan1 = scan1->pip) {
1881 		scan2 = tdip;
1882 		while (scan2->pmp == tdip->pmp) {
1883 			if (scan1 == scan2) {
1884 				hammer2_inode_ref(scan1);
1885 				return(scan1);
1886 			}
1887 			scan2 = scan2->pip;
1888 			if (scan2 == NULL)
1889 				break;
1890 		}
1891 	}
1892 	panic("hammer2_inode_common_parent: no common parent %p %p\n",
1893 	      fdip, tdip);
1894 	/* NOT REACHED */
1895 	return(NULL);
1896 }
1897 
1898 /*
1899  * Synchronize the inode's frontend state with the chain state prior
1900  * to any explicit flush of the inode or any strategy write call.
1901  *
1902  * Called with a locked inode.
1903  */
void
hammer2_inode_fsync(hammer2_trans_t *trans, hammer2_inode_t *ip,
		    hammer2_cluster_t *cparent)
{
	const hammer2_inode_data_t *ripdata;
	hammer2_inode_data_t *wipdata;
	hammer2_cluster_t *dparent;
	hammer2_cluster_t *cluster;
	hammer2_key_t lbase;		/* first block past new EOF */
	hammer2_key_t key_next;
	int dosync = 0;			/* set when cparent needs modsync */

	ripdata = &hammer2_cluster_rdata(cparent)->ipdata;    /* target file */

	/*
	 * Propagate a cached mtime into the media inode.
	 */
	if (ip->flags & HAMMER2_INODE_MTIME) {
		wipdata = hammer2_cluster_modify_ip(trans, ip, cparent, 0);
		atomic_clear_int(&ip->flags, HAMMER2_INODE_MTIME);
		wipdata->mtime = ip->mtime;
		dosync = 1;
		ripdata = wipdata;
	}
	/*
	 * Shrinking resize: update the media size then delete any data
	 * chains wholly beyond the new EOF.
	 */
	if ((ip->flags & HAMMER2_INODE_RESIZED) && ip->size < ripdata->size) {
		wipdata = hammer2_cluster_modify_ip(trans, ip, cparent, 0);
		wipdata->size = ip->size;
		dosync = 1;
		ripdata = wipdata;
		atomic_clear_int(&ip->flags, HAMMER2_INODE_RESIZED);

		/*
		 * We must delete any chains beyond the EOF.  The chain
		 * straddling the EOF will be pending in the bioq.
		 */
		lbase = (ripdata->size + HAMMER2_PBUFMASK64) &
			~HAMMER2_PBUFMASK64;
		dparent = hammer2_cluster_lookup_init(&ip->cluster, 0);
		cluster = hammer2_cluster_lookup(dparent, &key_next,
					         lbase, (hammer2_key_t)-1,
						 HAMMER2_LOOKUP_NODATA);
		while (cluster) {
			/*
			 * Degenerate embedded case, nothing to loop on
			 */
			switch (hammer2_cluster_type(cluster)) {
			case HAMMER2_BREF_TYPE_INODE:
				hammer2_cluster_unlock(cluster);
				hammer2_cluster_drop(cluster);
				cluster = NULL;
				break;
			case HAMMER2_BREF_TYPE_DATA:
				hammer2_cluster_delete(trans, dparent, cluster,
						   HAMMER2_DELETE_PERMANENT);
				/* fall through */
			default:
				cluster = hammer2_cluster_next(dparent, cluster,
						   &key_next,
						   key_next, (hammer2_key_t)-1,
						   HAMMER2_LOOKUP_NODATA);
				break;
			}
		}
		hammer2_cluster_lookup_done(dparent);
	} else
	/*
	 * Growing resize: update the media size; direct-data mode must be
	 * abandoned once the file exceeds the embedded-data capacity.
	 */
	if ((ip->flags & HAMMER2_INODE_RESIZED) && ip->size > ripdata->size) {
		wipdata = hammer2_cluster_modify_ip(trans, ip, cparent, 0);
		wipdata->size = ip->size;
		atomic_clear_int(&ip->flags, HAMMER2_INODE_RESIZED);

		/*
		 * When resizing larger we may not have any direct-data
		 * available.
		 */
		if ((wipdata->op_flags & HAMMER2_OPFLAG_DIRECTDATA) &&
		    ip->size > HAMMER2_EMBEDDED_BYTES) {
			wipdata->op_flags &= ~HAMMER2_OPFLAG_DIRECTDATA;
			bzero(&wipdata->u.blockset,
			      sizeof(wipdata->u.blockset));
		}
		dosync = 1;
		ripdata = wipdata;
	}
	if (dosync)
		hammer2_cluster_modsync(cparent);
}
1987