xref: /dragonfly/sys/vfs/hammer2/hammer2_inode.c (revision 31c7ac8b)
1 /*
2  * Copyright (c) 2011-2014 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in
16  *    the documentation and/or other materials provided with the
17  *    distribution.
18  * 3. Neither the name of The DragonFly Project nor the names of its
19  *    contributors may be used to endorse or promote products derived
20  *    from this software without specific, prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
26  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 #include <sys/cdefs.h>
36 #include <sys/param.h>
37 #include <sys/systm.h>
38 #include <sys/types.h>
39 #include <sys/lock.h>
40 #include <sys/uuid.h>
41 
42 #include "hammer2.h"
43 
44 #define INODE_DEBUG	0
45 
/*
 * Forward declaration of a file-local helper.
 */
static void hammer2_inode_move_to_hidden(hammer2_trans_t *trans,
					 hammer2_chain_t **chainp,
					 hammer2_tid_t inum);

/*
 * Instantiate the red-black tree support code for hammer2_inode_tree.
 * Nodes are hammer2_inode structures linked through their rbnode field,
 * ordered by hammer2_inode_cmp(), and looked up by the inum field
 * (a hammer2_tid_t).
 */
RB_GENERATE2(hammer2_inode_tree, hammer2_inode, rbnode, hammer2_inode_cmp,
	     hammer2_tid_t, inum);
52 
53 int
54 hammer2_inode_cmp(hammer2_inode_t *ip1, hammer2_inode_t *ip2)
55 {
56 	if (ip1->inum < ip2->inum)
57 		return(-1);
58 	if (ip1->inum > ip2->inum)
59 		return(1);
60 	return(0);
61 }
62 
/*
 * HAMMER2 inode locks
 *
 * HAMMER2 offers shared locks and exclusive locks on inodes.
 *
 * An inode's ip->chain pointer is resolved and stable while an inode is
 * locked, and can be cleaned out at any time (become NULL) when an inode
 * is not locked.
 *
 * This function handles duplication races and hardlink replacement races
 * which can cause ip's cached chain to become stale.
 *
 * The underlying chain is also locked and returned.
 *
 * On return the caller holds an additional ref on ip, the exclusive
 * topo_cst thread lock, and the lock on the returned chain.  All three
 * are released by hammer2_inode_unlock_ex().
 *
 * NOTE: We don't combine the inode/chain lock because putting away an
 *       inode would otherwise confuse multiple lock holders of the inode.
 */
hammer2_chain_t *
hammer2_inode_lock_ex(hammer2_inode_t *ip)
{
	hammer2_chain_t *chain;
	hammer2_chain_t *ochain;
	hammer2_chain_core_t *core;
	int error;

	hammer2_inode_ref(ip);
	ccms_thread_lock(&ip->topo_cst, CCMS_STATE_EXCLUSIVE);

	/*
	 * NOTE(review): ip->chain is dereferenced without a NULL check
	 * here, whereas hammer2_inode_lock_sh() asserts it is non-NULL.
	 * Confirm callers never lock an inode whose chain was cleaned out.
	 */
	chain = ip->chain;
	core = chain->core;
	for (;;) {
		/*
		 * If the cached chain is flagged DUPLICATED, walk forward
		 * on core_entry (under the core spinlock) to the first
		 * chain without the flag and repoint the inode at it.
		 * The temporary ref keeps that chain alive between
		 * releasing the spinlock and the repoint.
		 */
		if (chain->flags & HAMMER2_CHAIN_DUPLICATED) {
			spin_lock(&core->cst.spin);
			while (chain->flags & HAMMER2_CHAIN_DUPLICATED)
				chain = TAILQ_NEXT(chain, core_entry);
			hammer2_chain_ref(chain);
			spin_unlock(&core->cst.spin);
			hammer2_inode_repoint(ip, NULL, chain);
			hammer2_chain_drop(chain);
		}

		/*
		 * Lock the candidate and re-test the flag now that the
		 * lock is held; if it is set, unlock and start over.
		 */
		hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
		if ((chain->flags & HAMMER2_CHAIN_DUPLICATED) == 0)
			break;
		hammer2_chain_unlock(chain);
	}

	/*
	 * The chain is an OBJTYPE_HARDLINK entry that has not been deleted:
	 * let hammer2_hardlink_find() update chain (presumably to the
	 * hardlink target -- verify against that function).  ochain is
	 * dropped unconditionally, before the error is asserted.
	 */
	if (chain->data->ipdata.type == HAMMER2_OBJTYPE_HARDLINK &&
	    (chain->flags & HAMMER2_CHAIN_DELETED) == 0) {
		error = hammer2_hardlink_find(ip->pip, &chain, &ochain);
		hammer2_chain_drop(ochain);
		KKASSERT(error == 0);
		/* XXX error handling */
	}
	return (chain);
}
117 
118 void
119 hammer2_inode_unlock_ex(hammer2_inode_t *ip, hammer2_chain_t *chain)
120 {
121 	/*
122 	 * XXX this will catch parent directories too which we don't
123 	 *     really want.
124 	 */
125 	if (chain)
126 		hammer2_chain_unlock(chain);
127 	ccms_thread_unlock(&ip->topo_cst);
128 	hammer2_inode_drop(ip);
129 }
130 
/*
 * Acquire a shared lock on the inode and on its chain, returning the
 * locked chain.  A ref on ip is taken and held until the matching
 * hammer2_inode_unlock_sh().
 *
 * NOTE: We don't combine the inode/chain lock because putting away an
 *       inode would otherwise confuse multiple lock holders of the inode.
 *
 *	 Shared locks are especially sensitive to having too many shared
 *	 lock counts (from the same thread) on certain paths which might
 *	 need to upgrade them.  Only one count of a shared lock can be
 *	 upgraded.
 */
hammer2_chain_t *
hammer2_inode_lock_sh(hammer2_inode_t *ip)
{
	hammer2_chain_t *chain;

	hammer2_inode_ref(ip);
	for (;;) {
		ccms_thread_lock(&ip->topo_cst, CCMS_STATE_SHARED);

		chain = ip->chain;
		KKASSERT(chain != NULL);	/* for now */
		hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS |
					  HAMMER2_RESOLVE_SHARED);

		/*
		 * Resolve duplication races, resolve hardlinks by giving
		 * up and cycling an exclusive lock.
		 */
		if ((chain->flags & HAMMER2_CHAIN_DUPLICATED) == 0 &&
		    chain->data->ipdata.type != HAMMER2_OBJTYPE_HARDLINK) {
			break;
		}

		/*
		 * Release both shared locks, then take and immediately
		 * release the exclusive lock so hammer2_inode_lock_ex()
		 * can repoint ip->chain past duplicated chains.  lock_ex
		 * and unlock_ex take and drop their own ref, so the ref
		 * acquired at the top of this function is unaffected.
		 *
		 * NOTE(review): for the HARDLINK case, forward progress
		 * depends on hammer2_hardlink_find() (not visible here)
		 * changing what ip->chain refers to -- confirm.
		 */
		hammer2_chain_unlock(chain);
		ccms_thread_unlock(&ip->topo_cst);
		chain = hammer2_inode_lock_ex(ip);
		hammer2_inode_unlock_ex(ip, chain);
	}
	return (chain);
}
169 
170 void
171 hammer2_inode_unlock_sh(hammer2_inode_t *ip, hammer2_chain_t *chain)
172 {
173 	if (chain)
174 		hammer2_chain_unlock(chain);
175 	ccms_thread_unlock(&ip->topo_cst);
176 	hammer2_inode_drop(ip);
177 }
178 
179 ccms_state_t
180 hammer2_inode_lock_temp_release(hammer2_inode_t *ip)
181 {
182 	return(ccms_thread_lock_temp_release(&ip->topo_cst));
183 }
184 
/*
 * Reacquire the inode's topo_cst lock after a temporary release.
 *
 * ostate is the value previously returned by
 * hammer2_inode_lock_temp_release() for the same inode.
 */
void
hammer2_inode_lock_temp_restore(hammer2_inode_t *ip, ccms_state_t ostate)
{
	ccms_thread_lock_temp_restore(&ip->topo_cst, ostate);
}
190 
191 ccms_state_t
192 hammer2_inode_lock_upgrade(hammer2_inode_t *ip)
193 {
194 	return(ccms_thread_lock_upgrade(&ip->topo_cst));
195 }
196 
/*
 * Undo a prior hammer2_inode_lock_upgrade() on the inode's topo_cst.
 *
 * ostate is the value that hammer2_inode_lock_upgrade() returned.
 */
void
hammer2_inode_lock_downgrade(hammer2_inode_t *ip, ccms_state_t ostate)
{
	ccms_thread_lock_downgrade(&ip->topo_cst, ostate);
}
202 
203 /*
204  * Lookup an inode by inode number
205  */
206 hammer2_inode_t *
207 hammer2_inode_lookup(hammer2_pfsmount_t *pmp, hammer2_tid_t inum)
208 {
209 	hammer2_inode_t *ip;
210 
211 	if (pmp) {
212 		spin_lock(&pmp->inum_spin);
213 		ip = RB_LOOKUP(hammer2_inode_tree, &pmp->inum_tree, inum);
214 		if (ip)
215 			hammer2_inode_ref(ip);
216 		spin_unlock(&pmp->inum_spin);
217 	} else {
218 		ip = NULL;
219 	}
220 	return(ip);
221 }
222 
/*
 * Adding a ref to an inode is only legal if the inode already has at least
 * one ref.
 *
 * The count is bumped atomically; it is released with
 * hammer2_inode_drop().
 */
void
hammer2_inode_ref(hammer2_inode_t *ip)
{
	atomic_add_int(&ip->refs, 1);
}
232 
/*
 * Drop an inode reference, freeing the inode when the last reference goes
 * away.
 *
 * Freeing an inode also releases the reference it held on its parent
 * (ip->pip), so the loop continues with the parent and can free a run of
 * ancestors.  Passing ip == NULL is a no-op.
 */
void
hammer2_inode_drop(hammer2_inode_t *ip)
{
	hammer2_pfsmount_t *pmp;
	hammer2_inode_t *pip;
	u_int refs;

	while (ip) {
		/*
		 * Sample the count once.  Both branches below commit with
		 * atomic_cmpset_int() and simply loop again if the count
		 * changed in the meantime.
		 */
		refs = ip->refs;
		cpu_ccfence();
		if (refs == 1) {
			/*
			 * Transition to zero, must interlock with
			 * the inode inumber lookup tree (if applicable).
			 *
			 * NOTE: The super-root inode has no pmp.
			 */
			pmp = ip->pmp;
			if (pmp)
				spin_lock(&pmp->inum_spin);

			if (atomic_cmpset_int(&ip->refs, 1, 0)) {
				/*
				 * We own the final teardown.  Remove the
				 * inode from the lookup tree while still
				 * holding inum_spin.
				 */
				KKASSERT(ip->topo_cst.count == 0);
				if (ip->flags & HAMMER2_INODE_ONRBTREE) {
					atomic_clear_int(&ip->flags,
						     HAMMER2_INODE_ONRBTREE);
					RB_REMOVE(hammer2_inode_tree,
						  &pmp->inum_tree, ip);
				}
				if (pmp)
					spin_unlock(&pmp->inum_spin);

				/*
				 * Detach from the parent and the mount.
				 * The local pip/pmp copies are used from
				 * here on.
				 */
				pip = ip->pip;
				ip->pip = NULL;
				ip->pmp = NULL;

				/*
				 * Cleaning out ip->chain isn't entirely
				 * trivial.
				 */
				hammer2_inode_repoint(ip, NULL, NULL);

				/*
				 * We have to drop pip (if non-NULL) to
				 * dispose of our implied reference from
				 * ip->pip.  We can simply loop on it.
				 *
				 * Inodes with a pmp come from pmp->minode
				 * and are counted in pmp->inmem_inodes;
				 * the pmp-less inode must be the super-root
				 * and comes from M_HAMMER2.
				 */
				if (pmp) {
					KKASSERT((ip->flags &
						  HAMMER2_INODE_SROOT) == 0);
					kfree(ip, pmp->minode);
					atomic_add_long(&pmp->inmem_inodes, -1);
				} else {
					KKASSERT(ip->flags &
						 HAMMER2_INODE_SROOT);
					kfree(ip, M_HAMMER2);
				}
				ip = pip;
				/* continue with pip (can be NULL) */
			} else {
				/*
				 * The count is no longer 1; release the
				 * spinlock and retry.  ip->pmp has not been
				 * modified on this path, so it names the
				 * same spinlock that was locked via pmp.
				 */
				if (pmp)
					spin_unlock(&ip->pmp->inum_spin);
			}
		} else {
			/*
			 * Non zero transition
			 */
			if (atomic_cmpset_int(&ip->refs, refs, refs - 1))
				break;
		}
	}
}
309 
310 /*
311  * Get the vnode associated with the given inode, allocating the vnode if
312  * necessary.  The vnode will be returned exclusively locked.
313  *
314  * The caller must lock the inode (shared or exclusive).
315  *
316  * Great care must be taken to avoid deadlocks and vnode acquisition/reclaim
317  * races.
318  */
319 struct vnode *
320 hammer2_igetv(hammer2_inode_t *ip, int *errorp)
321 {
322 	hammer2_inode_data_t *ipdata;
323 	hammer2_pfsmount_t *pmp;
324 	struct vnode *vp;
325 	ccms_state_t ostate;
326 
327 	pmp = ip->pmp;
328 	KKASSERT(pmp != NULL);
329 	*errorp = 0;
330 	ipdata = &ip->chain->data->ipdata;
331 
332 	for (;;) {
333 		/*
334 		 * Attempt to reuse an existing vnode assignment.  It is
335 		 * possible to race a reclaim so the vget() may fail.  The
336 		 * inode must be unlocked during the vget() to avoid a
337 		 * deadlock against a reclaim.
338 		 */
339 		vp = ip->vp;
340 		if (vp) {
341 			/*
342 			 * Inode must be unlocked during the vget() to avoid
343 			 * possible deadlocks, but leave the ip ref intact.
344 			 *
345 			 * vnode is held to prevent destruction during the
346 			 * vget().  The vget() can still fail if we lost
347 			 * a reclaim race on the vnode.
348 			 */
349 			vhold(vp);
350 			ostate = hammer2_inode_lock_temp_release(ip);
351 			if (vget(vp, LK_EXCLUSIVE)) {
352 				vdrop(vp);
353 				hammer2_inode_lock_temp_restore(ip, ostate);
354 				continue;
355 			}
356 			hammer2_inode_lock_temp_restore(ip, ostate);
357 			vdrop(vp);
358 			/* vp still locked and ref from vget */
359 			if (ip->vp != vp) {
360 				kprintf("hammer2: igetv race %p/%p\n",
361 					ip->vp, vp);
362 				vput(vp);
363 				continue;
364 			}
365 			*errorp = 0;
366 			break;
367 		}
368 
369 		/*
370 		 * No vnode exists, allocate a new vnode.  Beware of
371 		 * allocation races.  This function will return an
372 		 * exclusively locked and referenced vnode.
373 		 */
374 		*errorp = getnewvnode(VT_HAMMER2, pmp->mp, &vp, 0, 0);
375 		if (*errorp) {
376 			kprintf("hammer2: igetv getnewvnode failed %d\n",
377 				*errorp);
378 			vp = NULL;
379 			break;
380 		}
381 
382 		/*
383 		 * Lock the inode and check for an allocation race.
384 		 */
385 		ostate = hammer2_inode_lock_upgrade(ip);
386 		if (ip->vp != NULL) {
387 			vp->v_type = VBAD;
388 			vx_put(vp);
389 			hammer2_inode_lock_downgrade(ip, ostate);
390 			continue;
391 		}
392 
393 		switch (ipdata->type) {
394 		case HAMMER2_OBJTYPE_DIRECTORY:
395 			vp->v_type = VDIR;
396 			break;
397 		case HAMMER2_OBJTYPE_REGFILE:
398 			vp->v_type = VREG;
399 			vinitvmio(vp, ipdata->size,
400 				  HAMMER2_LBUFSIZE,
401 				  (int)ipdata->size & HAMMER2_LBUFMASK);
402 			break;
403 		case HAMMER2_OBJTYPE_SOFTLINK:
404 			/*
405 			 * XXX for now we are using the generic file_read
406 			 * and file_write code so we need a buffer cache
407 			 * association.
408 			 */
409 			vp->v_type = VLNK;
410 			vinitvmio(vp, ipdata->size,
411 				  HAMMER2_LBUFSIZE,
412 				  (int)ipdata->size & HAMMER2_LBUFMASK);
413 			break;
414 		case HAMMER2_OBJTYPE_CDEV:
415 			vp->v_type = VCHR;
416 			/* fall through */
417 		case HAMMER2_OBJTYPE_BDEV:
418 			vp->v_ops = &pmp->mp->mnt_vn_spec_ops;
419 			if (ipdata->type != HAMMER2_OBJTYPE_CDEV)
420 				vp->v_type = VBLK;
421 			addaliasu(vp, ipdata->rmajor, ipdata->rminor);
422 			break;
423 		case HAMMER2_OBJTYPE_FIFO:
424 			vp->v_type = VFIFO;
425 			vp->v_ops = &pmp->mp->mnt_vn_fifo_ops;
426 			break;
427 		default:
428 			panic("hammer2: unhandled objtype %d", ipdata->type);
429 			break;
430 		}
431 
432 		if (ip == pmp->iroot)
433 			vsetflags(vp, VROOT);
434 
435 		vp->v_data = ip;
436 		ip->vp = vp;
437 		hammer2_inode_ref(ip);		/* vp association */
438 		hammer2_inode_lock_downgrade(ip, ostate);
439 		break;
440 	}
441 
442 	/*
443 	 * Return non-NULL vp and *errorp == 0, or NULL vp and *errorp != 0.
444 	 */
445 	if (hammer2_debug & 0x0002) {
446 		kprintf("igetv vp %p refs 0x%08x aux 0x%08x\n",
447 			vp, vp->v_refcnt, vp->v_auxrefs);
448 	}
449 	return (vp);
450 }
451 
/*
 * The passed-in chain must be locked and the returned inode will also be
 * locked.  This routine typically locates or allocates the inode, assigns
 * ip->chain (adding a ref to chain if necessary), and returns the inode.
 *
 * The hammer2_inode structure regulates the interface between the high level
 * kernel VNOPS API and the filesystem backend (the chains).
 *
 * pmp   - PFS mount whose inode tree is searched/updated (may be NULL).
 * dip   - parent directory inode recorded in nip->pip (may be NULL).
 * chain - locked chain of type HAMMER2_BREF_TYPE_INODE.
 *
 * The returned inode carries one ref for the caller and has its topo_cst
 * locked exclusively.
 *
 * WARNING!  This routine sucks up the chain's lock (makes it part of the
 *	     inode lock from the point of view of the inode lock API),
 *	     so callers need to be careful.
 *
 * WARNING!  The mount code is allowed to pass dip == NULL for iroot and
 *	     is allowed to pass pmp == NULL and dip == NULL for sroot.
 */
hammer2_inode_t *
hammer2_inode_get(hammer2_pfsmount_t *pmp, hammer2_inode_t *dip,
		  hammer2_chain_t *chain)
{
	hammer2_inode_t *nip;

	KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_INODE);

	/*
	 * Interlocked lookup/ref of the inode.  This code is only needed
	 * when looking up inodes with nlinks != 0 (TODO: optimize out
	 * otherwise and test for duplicates).
	 */
again:
	for (;;) {
		/* hammer2_inode_lookup() returns NULL when pmp is NULL */
		nip = hammer2_inode_lookup(pmp, chain->data->ipdata.inum);
		if (nip == NULL)
			break;
		ccms_thread_lock(&nip->topo_cst, CCMS_STATE_EXCLUSIVE);

		/*
		 * The inode was removed from the tree between the lookup
		 * and acquiring the lock; discard it and look again.
		 */
		if ((nip->flags & HAMMER2_INODE_ONRBTREE) == 0) { /* race */
			ccms_thread_unlock(&nip->topo_cst);
			hammer2_inode_drop(nip);
			continue;
		}
		if (nip->chain != chain)
			hammer2_inode_repoint(nip, NULL, chain);

		/*
		 * Consolidated nip/nip->chain is locked (chain locked
		 * by caller).
		 */
		return nip;
	}

	/*
	 * We couldn't find the inode number, create a new inode.
	 *
	 * Inodes belonging to a PFS are allocated from pmp->minode and
	 * counted in pmp->inmem_inodes.  Without a pmp the inode is
	 * allocated from M_HAMMER2 and flagged as the super-root.
	 */
	if (pmp) {
		nip = kmalloc(sizeof(*nip), pmp->minode, M_WAITOK | M_ZERO);
		atomic_add_long(&pmp->inmem_inodes, 1);
		hammer2_chain_memory_inc(pmp);
		hammer2_chain_memory_wakeup(pmp);
	} else {
		nip = kmalloc(sizeof(*nip), M_HAMMER2, M_WAITOK | M_ZERO);
		nip->flags = HAMMER2_INODE_SROOT;
	}

	/* Seed the cached fields from the media inode data */
	nip->inum = chain->data->ipdata.inum;
	nip->size = chain->data->ipdata.size;
	nip->mtime = chain->data->ipdata.mtime;
	hammer2_inode_repoint(nip, NULL, chain);
	nip->pip = dip;				/* can be NULL */
	if (dip)
		hammer2_inode_ref(dip);	/* ref dip for nip->pip */

	nip->pmp = pmp;

	/*
	 * ref and lock on nip gives it state compatible to after a
	 * hammer2_inode_lock_ex() call.
	 */
	nip->refs = 1;
	ccms_cst_init(&nip->topo_cst, &nip->chain);
	ccms_thread_lock(&nip->topo_cst, CCMS_STATE_EXCLUSIVE);
	/* combination of thread lock and chain lock == inode lock */

	/*
	 * Attempt to add the inode.  If it fails we raced another inode
	 * get.  Undo all the work and try again.
	 *
	 * Dropping the only ref frees nip (and releases the ref taken on
	 * dip above) via hammer2_inode_drop().
	 */
	if (pmp) {
		spin_lock(&pmp->inum_spin);
		if (RB_INSERT(hammer2_inode_tree, &pmp->inum_tree, nip)) {
			spin_unlock(&pmp->inum_spin);
			ccms_thread_unlock(&nip->topo_cst);
			hammer2_inode_drop(nip);
			goto again;
		}
		atomic_set_int(&nip->flags, HAMMER2_INODE_ONRBTREE);
		spin_unlock(&pmp->inum_spin);
	}

	return (nip);
}
550 
/*
 * Create a new inode in the specified directory using the vattr to
 * figure out the type of inode.
 *
 * If no error occurs the new inode is returned locked, its locked chain
 * is stored in *chainp, and *errorp is 0.  Otherwise NULL is returned,
 * *errorp holds the error, and *chainp is left untouched.
 *
 * If vap and/or cred are NULL the related fields are not set and the
 * inode type defaults to a directory.  This is used when creating PFSs
 * under the super-root, so the inode number is set to 1 in this case.
 *
 * trans    - transaction; supplies inode_tid for the new inode number.
 * dip      - directory to create the entry in.
 * vap/cred - attributes and credentials for the new inode (may be NULL).
 * name     - entry name, name_len bytes; name_len must be less than
 *	      HAMMER2_INODE_MAXNAME.
 *
 * dip is not locked on entry.
 */
hammer2_inode_t *
hammer2_inode_create(hammer2_trans_t *trans, hammer2_inode_t *dip,
		     struct vattr *vap, struct ucred *cred,
		     const uint8_t *name, size_t name_len,
		     hammer2_chain_t **chainp, int *errorp)
{
	hammer2_inode_data_t *dipdata;
	hammer2_inode_data_t *nipdata;
	hammer2_chain_t *chain;
	hammer2_chain_t *parent;
	hammer2_inode_t *nip;
	hammer2_key_t key_dummy;
	hammer2_key_t lhc;
	int error;
	uid_t xuid;
	uuid_t dip_uid;
	uuid_t dip_gid;
	uint32_t dip_mode;
	uint8_t dip_algo;
	int cache_index = -1;

	lhc = hammer2_dirhash(name, name_len);
	*errorp = 0;

	/*
	 * Locate the inode or indirect block to create the new
	 * entry in.  At the same time check for key collisions
	 * and iterate until we don't get one.
	 *
	 * NOTE: hidden inodes do not have iterators.
	 */
retry:
	parent = hammer2_inode_lock_ex(dip);

	/*
	 * Snapshot the parent attributes that are needed after dip has
	 * been unlocked again.
	 */
	dipdata = &dip->chain->data->ipdata;
	dip_uid = dipdata->uid;
	dip_gid = dipdata->gid;
	dip_mode = dipdata->mode;
	dip_algo = dipdata->comp_algo;

	/*
	 * Probe for an existing entry at lhc.  On a collision advance lhc
	 * by one and probe again; give up with ENOSPC if the key is not a
	 * visible directory hash or the low bits were already all ones.
	 */
	error = 0;
	while (error == 0) {
		chain = hammer2_chain_lookup(&parent, &key_dummy,
					     lhc, lhc, &cache_index, 0);
		if (chain == NULL)
			break;
		if ((lhc & HAMMER2_DIRHASH_VISIBLE) == 0)
			error = ENOSPC;
		if ((lhc & HAMMER2_DIRHASH_LOMASK) == HAMMER2_DIRHASH_LOMASK)
			error = ENOSPC;
		hammer2_chain_unlock(chain);
		chain = NULL;
		++lhc;
	}

	if (error == 0) {
		error = hammer2_chain_create(trans, &parent, &chain,
					     lhc, 0,
					     HAMMER2_BREF_TYPE_INODE,
					     HAMMER2_INODE_BYTES);
	}
#if INODE_DEBUG
	kprintf("CREATE INODE %*.*s chain=%p\n",
		(int)name_len, (int)name_len, name, chain);
#endif

	/*
	 * Cleanup and handle retries.
	 *
	 * On EAGAIN hold a ref on parent across the unlock so it can be
	 * waited on, then start over from the top.
	 */
	if (error == EAGAIN) {
		hammer2_chain_ref(parent);
		hammer2_inode_unlock_ex(dip, parent);
		hammer2_chain_wait(parent);
		hammer2_chain_drop(parent);
		goto retry;
	}
	hammer2_inode_unlock_ex(dip, parent);

	if (error) {
		KKASSERT(chain == NULL);
		*errorp = error;
		return (NULL);
	}

	/*
	 * Set up the new inode.
	 *
	 * NOTE: *_get() integrates chain's lock into the inode lock.
	 *
	 * NOTE: Only one new inode can currently be created per
	 *	 transaction.  If the need arises we can adjust
	 *	 hammer2_trans_init() to allow more.
	 *
	 * NOTE: nipdata will have chain's blockset data.
	 */
	chain->data->ipdata.inum = trans->inode_tid;
	nip = hammer2_inode_get(dip->pmp, dip, chain);
	nipdata = &chain->data->ipdata;

	if (vap) {
		KKASSERT(trans->inodes_created == 0);
		nipdata->type = hammer2_get_obj_type(vap->va_type);
		nipdata->inum = trans->inode_tid;
		++trans->inodes_created;

		switch (nipdata->type) {
		case HAMMER2_OBJTYPE_CDEV:
		case HAMMER2_OBJTYPE_BDEV:
			nipdata->rmajor = vap->va_rmajor;
			nipdata->rminor = vap->va_rminor;
			break;
		default:
			break;
		}
	} else {
		/* No vattr: a directory with inode number 1 (PFS creation) */
		nipdata->type = HAMMER2_OBJTYPE_DIRECTORY;
		nipdata->inum = 1;
	}

	/* Inherit parent's inode compression mode. */
	nip->comp_heuristic = 0;
	nipdata->comp_algo = dip_algo;
	nipdata->version = HAMMER2_INODE_VERSION_ONE;
	hammer2_update_time(&nipdata->ctime);
	nipdata->mtime = nipdata->ctime;
	if (vap)
		nipdata->mode = vap->va_mode;
	nipdata->nlinks = 1;
	if (vap) {
		/*
		 * NOTE(review): nipdata->mode was copied from vap->va_mode
		 * above, before vop_helper_create_uid() is handed
		 * &vap->va_mode.  If that helper adjusts the mode, the
		 * adjustment is not reflected in nipdata->mode -- confirm
		 * this is intended.
		 */
		if (dip && dip->pmp) {
			xuid = hammer2_to_unix_xid(&dip_uid);
			xuid = vop_helper_create_uid(dip->pmp->mp,
						     dip_mode,
						     xuid,
						     cred,
						     &vap->va_mode);
		} else {
			/* super-root has no dip and/or pmp */
			xuid = 0;
		}

		/*
		 * Owner: an explicit UUID wins, then an explicit numeric
		 * uid, then the uid computed above.
		 */
		if (vap->va_vaflags & VA_UID_UUID_VALID)
			nipdata->uid = vap->va_uid_uuid;
		else if (vap->va_uid != (uid_t)VNOVAL)
			hammer2_guid_to_uuid(&nipdata->uid, vap->va_uid);
		else
			hammer2_guid_to_uuid(&nipdata->uid, xuid);

		/*
		 * Group: an explicit UUID wins, then an explicit numeric
		 * gid, then the parent directory's gid.
		 */
		if (vap->va_vaflags & VA_GID_UUID_VALID)
			nipdata->gid = vap->va_gid_uuid;
		else if (vap->va_gid != (gid_t)VNOVAL)
			hammer2_guid_to_uuid(&nipdata->gid, vap->va_gid);
		else if (dip)
			nipdata->gid = dip_gid;
	}

	/*
	 * Regular files and softlinks allow a small amount of data to be
	 * directly embedded in the inode.  This flag will be cleared if
	 * the size is extended past the embedded limit.
	 */
	if (nipdata->type == HAMMER2_OBJTYPE_REGFILE ||
	    nipdata->type == HAMMER2_OBJTYPE_SOFTLINK) {
		nipdata->op_flags |= HAMMER2_OPFLAG_DIRECTDATA;
	}

	/* Record the name and the (possibly iterated) directory key */
	KKASSERT(name_len < HAMMER2_INODE_MAXNAME);
	bcopy(name, nipdata->filename, name_len);
	nipdata->name_key = lhc;
	nipdata->name_len = name_len;
	*chainp = chain;

	return (nip);
}
736 
737 /*
738  * chain may have been moved around by the create.
739  */
740 void
741 hammer2_chain_refactor(hammer2_chain_t **chainp)
742 {
743 	hammer2_chain_t *chain = *chainp;
744 	hammer2_chain_core_t *core;
745 
746 	core = chain->core;
747 	while (chain->flags & HAMMER2_CHAIN_DUPLICATED) {
748 		spin_lock(&core->cst.spin);
749 		chain = TAILQ_NEXT(chain, core_entry);
750 		while (chain->flags & HAMMER2_CHAIN_DUPLICATED)
751 			chain = TAILQ_NEXT(chain, core_entry);
752 		hammer2_chain_ref(chain);
753 		spin_unlock(&core->cst.spin);
754 		KKASSERT(chain->core == core);
755 
756 		hammer2_chain_unlock(*chainp);
757 		hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS |
758 					  HAMMER2_RESOLVE_NOREF); /* eat ref */
759 		*chainp = chain;
760 	}
761 }
762 
/*
 * Shift *chainp up to the specified directory, change the filename
 * to "0xINODENUMBER", and adjust the key.  The chain becomes the
 * invisible hardlink target.
 *
 * The original *chainp has already been marked deleted.
 *
 * trans   - transaction passed through to the chain operations.
 * chainp  - in: chain to shift; out: the duplicated chain.
 * dip     - destination directory inode (only used for a diagnostic
 *	     message in this function).
 * dchainp - chain of the destination directory.
 * nlinks  - amount added to the inode's nlinks field.
 * errorp  - set to 0 on success.  Any non-zero error currently panics.
 */
static
void
hammer2_hardlink_shiftup(hammer2_trans_t *trans, hammer2_chain_t **chainp,
			hammer2_inode_t *dip, hammer2_chain_t **dchainp,
			int nlinks, int *errorp)
{
	hammer2_inode_data_t *nipdata;
	hammer2_chain_t *chain;
	hammer2_chain_t *xchain;
	hammer2_key_t key_dummy;
	hammer2_key_t lhc;
	hammer2_blockref_t bref;
	int cache_index = -1;

	/* The inode number doubles as the (invisible) directory key */
	chain = *chainp;
	lhc = chain->data->ipdata.inum;
	KKASSERT((lhc & HAMMER2_DIRHASH_VISIBLE) == 0);

	/*
	 * Locate the inode or indirect block to create the new
	 * entry in.  lhc represents the inode number so there is
	 * no collision iteration.
	 *
	 * There should be no key collisions with invisible inode keys.
	 *
	 * WARNING! Must use inode_lock_ex() on dip to handle a stale
	 *	    dip->chain cache.
	 */
retry:
	*errorp = 0;
	xchain = hammer2_chain_lookup(dchainp, &key_dummy,
				      lhc, lhc, &cache_index, 0);
	if (xchain) {
		/* Unexpected collision: report it and fail with ENOSPC */
		kprintf("X3 chain %p dip %p dchain %p dip->chain %p\n",
			xchain, dip, *dchainp, dip->chain);
		hammer2_chain_unlock(xchain);
		xchain = NULL;
		*errorp = ENOSPC;
#if 0
		Debugger("X3");
#endif
	}

	/*
	 * Create entry in common parent directory using the seek position
	 * calculated above.
	 *
	 * We must refactor chain because it might have been shifted into
	 * an indirect chain by the create.
	 *
	 * NOTE: the hammer2_chain_create() call is currently compiled out,
	 *	 so xchain remains NULL and *errorp is not changed here.
	 */
	if (*errorp == 0) {
		KKASSERT(xchain == NULL);
#if 0
		*errorp = hammer2_chain_create(trans, dchainp, &xchain,
					       lhc, 0,
					       HAMMER2_BREF_TYPE_INODE,/* n/a */
					       HAMMER2_INODE_BYTES);   /* n/a */
#endif
		/*XXX this somehow isn't working on chain XXX*/
		/*KKASSERT(xxx)*/
	}

	/*
	 * Cleanup and handle retries.
	 *
	 * NOTE(review): with the create above compiled out, nothing in
	 * this function sets *errorp to EAGAIN, so this branch appears
	 * to be unreachable at present.
	 */
	if (*errorp == EAGAIN) {
		kprintf("R");
		hammer2_chain_wait(*dchainp);
		hammer2_chain_drop(*dchainp);
		goto retry;
	}

	/*
	 * Handle the error case
	 *
	 * panic() is called first, so the statements after it only matter
	 * if panic() were to return.
	 */
	if (*errorp) {
		panic("error2");
		KKASSERT(xchain == NULL);
		return;
	}

	/*
	 * Use xchain as a placeholder for (lhc).  Duplicate chain to the
	 * same target bref as xchain and then delete xchain.  The duplication
	 * occurs after xchain in flush order even though xchain is deleted
	 * after the duplication. XXX
	 *
	 * NOTE: xchain is NULL at this point (see above); the code below
	 *	 builds the target bref from chain's own bref with the key
	 *	 replaced by lhc.
	 *
	 * WARNING! Duplications (to a different parent) can cause indirect
	 *	    blocks to be inserted, refactor xchain.
	 */
	bref = chain->bref;
	bref.key = lhc;			/* invisible dir entry key */
	bref.keybits = 0;
	hammer2_chain_duplicate(trans, dchainp, &chain, &bref, 0, 2);

	/*
	 * chain is now 'live' again.. adjust the filename.
	 *
	 * Directory entries are inodes but this is a hidden hardlink
	 * target.  The name isn't used but to ease debugging give it
	 * a name after its inode number.
	 */
	hammer2_chain_modify(trans, &chain, 0);
	nipdata = &chain->data->ipdata;
	ksnprintf(nipdata->filename, sizeof(nipdata->filename),
		  "0x%016jx", (intmax_t)nipdata->inum);
	nipdata->name_len = strlen(nipdata->filename);
	nipdata->name_key = lhc;
	nipdata->nlinks += nlinks;

	*chainp = chain;
}
882 
883 /*
884  * Connect the target inode represented by (*chainp) to the media topology
885  * at (dip, name, len).  The caller can pass a rough *chainp, this function
886  * will issue lookup()s to position the parent chain properly for the
887  * chain insertion.
888  *
889  * If hlink is TRUE this function creates an OBJTYPE_HARDLINK directory
890  * entry instead of connecting (*chainp).
891  *
892  * If hlink is FALSE this function uses chain_duplicate() to make a copy
893  * if (*chainp) in the directory entry.  (*chainp) is likely to be deleted
894  * by the caller in this case (e.g. rename).
895  */
896 int
897 hammer2_inode_connect(hammer2_trans_t *trans,
898 		      hammer2_chain_t **chainp, int hlink,
899 		      hammer2_inode_t *dip, hammer2_chain_t **dchainp,
900 		      const uint8_t *name, size_t name_len,
901 		      hammer2_key_t lhc)
902 {
903 	hammer2_inode_data_t *ipdata;
904 	hammer2_chain_t *nchain;
905 	hammer2_chain_t *ochain;
906 	hammer2_key_t key_dummy;
907 	int cache_index = -1;
908 	int error;
909 
910 	/*
911 	 * Since ochain is either disconnected from the topology or represents
912 	 * a hardlink terminus which is always a parent of or equal to dip,
913 	 * we should be able to safely lock dip->chain for our setup.
914 	 *
915 	 * WARNING! Must use inode_lock_ex() on dip to handle a stale
916 	 *	    dip->chain cache.
917 	 */
918 	ochain = *chainp;
919 
920 	/*
921 	 * If name is non-NULL we calculate lhc, else we use the passed-in
922 	 * lhc.
923 	 */
924 	if (name) {
925 		lhc = hammer2_dirhash(name, name_len);
926 
927 		/*
928 		 * Locate the inode or indirect block to create the new
929 		 * entry in.  At the same time check for key collisions
930 		 * and iterate until we don't get one.
931 		 */
932 		error = 0;
933 		while (error == 0) {
934 			nchain = hammer2_chain_lookup(dchainp, &key_dummy,
935 						      lhc, lhc,
936 						      &cache_index, 0);
937 			if (nchain == NULL)
938 				break;
939 			if ((lhc & HAMMER2_DIRHASH_LOMASK) ==
940 			    HAMMER2_DIRHASH_LOMASK) {
941 				error = ENOSPC;
942 			}
943 			hammer2_chain_unlock(nchain);
944 			nchain = NULL;
945 			++lhc;
946 		}
947 	} else {
948 		/*
949 		 * Reconnect to specific key (used when moving
950 		 * unlinked-but-open files into the hidden directory).
951 		 */
952 		nchain = hammer2_chain_lookup(dchainp, &key_dummy,
953 					      lhc, lhc, &cache_index, 0);
954 		KKASSERT(nchain == NULL);
955 	}
956 
957 	if (error == 0) {
958 		if (hlink) {
959 			/*
960 			 * Hardlink pointer needed, create totally fresh
961 			 * directory entry.
962 			 *
963 			 * We must refactor ochain because it might have
964 			 * been shifted into an indirect chain by the
965 			 * create.
966 			 */
967 			KKASSERT(nchain == NULL);
968 			error = hammer2_chain_create(trans, dchainp, &nchain,
969 						     lhc, 0,
970 						     HAMMER2_BREF_TYPE_INODE,
971 						     HAMMER2_INODE_BYTES);
972 			hammer2_chain_refactor(&ochain);
973 		} else {
974 			/*
975 			 * Reconnect the original chain and rename.  Use
976 			 * chain_duplicate().  The caller will likely delete
977 			 * or has already deleted the original chain in
978 			 * this case.
979 			 *
980 			 * NOTE: chain_duplicate() generates a new chain
981 			 *	 with CHAIN_DELETED cleared (ochain typically
982 			 *	 has it set from the file unlink).
983 			 *
984 			 * WARNING! Can cause held-over chains to require a
985 			 *	    refactor.  Fortunately we have none (our
986 			 *	    locked chains are passed into and
987 			 *	    modified by the call).
988 			 */
989 			nchain = ochain;
990 			ochain = NULL;
991 			hammer2_chain_duplicate(trans, NULL, &nchain, NULL,
992 						0, 3);
993 			error = hammer2_chain_create(trans, dchainp, &nchain,
994 						     lhc, 0,
995 						     HAMMER2_BREF_TYPE_INODE,
996 						     HAMMER2_INODE_BYTES);
997 		}
998 	}
999 
1000 	/*
1001 	 * Unlock stuff.
1002 	 */
1003 	KKASSERT(error != EAGAIN);
1004 
1005 	/*
1006 	 * nchain should be NULL on error, leave ochain (== *chainp) alone.
1007 	 */
1008 	if (error) {
1009 		KKASSERT(nchain == NULL);
1010 		return (error);
1011 	}
1012 
1013 	/*
1014 	 * Directory entries are inodes so if the name has changed we have
1015 	 * to update the inode.
1016 	 *
1017 	 * When creating an OBJTYPE_HARDLINK entry remember to unlock the
1018 	 * chain, the caller will access the hardlink via the actual hardlink
1019 	 * target file and not the hardlink pointer entry, so we must still
1020 	 * return ochain.
1021 	 */
1022 	if (hlink && hammer2_hardlink_enable >= 0) {
1023 		/*
1024 		 * Create the HARDLINK pointer.  oip represents the hardlink
1025 		 * target in this situation.
1026 		 *
1027 		 * We will return ochain (the hardlink target).
1028 		 */
1029 		hammer2_chain_modify(trans, &nchain, 0);
1030 		KKASSERT(name_len < HAMMER2_INODE_MAXNAME);
1031 		ipdata = &nchain->data->ipdata;
1032 		bcopy(name, ipdata->filename, name_len);
1033 		ipdata->name_key = lhc;
1034 		ipdata->name_len = name_len;
1035 		ipdata->target_type = ochain->data->ipdata.type;
1036 		ipdata->type = HAMMER2_OBJTYPE_HARDLINK;
1037 		ipdata->inum = ochain->data->ipdata.inum;
1038 		ipdata->nlinks = 1;
1039 		hammer2_chain_unlock(nchain);
1040 		nchain = ochain;
1041 		ochain = NULL;
1042 	} else if (hlink && hammer2_hardlink_enable < 0) {
1043 		/*
1044 		 * Create a snapshot (hardlink fake mode for debugging).
1045 		 * (ochain already flushed above so we can just copy the
1046 		 * bref XXX).
1047 		 *
1048 		 * Since this is a snapshot we return nchain in the fake
1049 		 * hardlink case.
1050 		 */
1051 		hammer2_chain_modify(trans, &nchain, 0);
1052 		KKASSERT(name_len < HAMMER2_INODE_MAXNAME);
1053 		ipdata = &nchain->data->ipdata;
1054 		*ipdata = ochain->data->ipdata;
1055 		bcopy(name, ipdata->filename, name_len);
1056 		ipdata->name_key = lhc;
1057 		ipdata->name_len = name_len;
1058 		atomic_clear_int(&nchain->core->flags,
1059 				 HAMMER2_CORE_COUNTEDBREFS);
1060 		kprintf("created fake hardlink %*.*s\n",
1061 			(int)name_len, (int)name_len, name);
1062 	} else {
1063 		/*
1064 		 * nchain is a duplicate of ochain at the new location.
1065 		 * We must fixup the name stored in oip.  The bref key
1066 		 * has already been set up.
1067 		 */
1068 		hammer2_chain_modify(trans, &nchain, 0);
1069 		ipdata = &nchain->data->ipdata;
1070 
1071 		KKASSERT(name_len < HAMMER2_INODE_MAXNAME);
1072 		bcopy(name, ipdata->filename, name_len);
1073 		ipdata->name_key = lhc;
1074 		ipdata->name_len = name_len;
1075 		ipdata->nlinks = 1;
1076 	}
1077 
1078 	/*
1079 	 * We are replacing ochain with nchain, unlock ochain.  In the
1080 	 * case where ochain is left unchanged the code above sets
1081 	 * nchain to ochain and ochain to NULL, resulting in a NOP here.
1082 	 */
1083 	if (ochain)
1084 		hammer2_chain_unlock(ochain);
1085 	*chainp = nchain;
1086 
1087 	return (0);
1088 }
1089 
1090 /*
1091  * Repoint ip->chain to nchain.  Caller must hold the inode exclusively
1092  * locked.
1093  *
1094  * ip->chain is set to nchain.  The prior chain in ip->chain is dropped
1095  * and nchain is ref'd.
1096  */
1097 void
1098 hammer2_inode_repoint(hammer2_inode_t *ip, hammer2_inode_t *pip,
1099 		      hammer2_chain_t *nchain)
1100 {
1101 	hammer2_chain_t *ochain;
1102 	hammer2_inode_t *opip;
1103 
1104 	/*
1105 	 * Repoint ip->chain if requested.
1106 	 */
1107 	ochain = ip->chain;
1108 	ip->chain = nchain;
1109 	if (nchain)
1110 		hammer2_chain_ref(nchain);
1111 	if (ochain)
1112 		hammer2_chain_drop(ochain);
1113 
1114 	/*
1115 	 * Repoint ip->pip if requested (non-NULL pip).
1116 	 */
1117 	if (pip && ip->pip != pip) {
1118 		opip = ip->pip;
1119 		hammer2_inode_ref(pip);
1120 		ip->pip = pip;
1121 		if (opip)
1122 			hammer2_inode_drop(opip);
1123 	}
1124 }
1125 
1126 /*
1127  * Unlink the file from the specified directory inode.  The directory inode
1128  * does not need to be locked.
1129  *
1130  * isdir determines whether a directory/non-directory check should be made.
1131  * No check is made if isdir is set to -1.
1132  *
1133  * isopen specifies whether special unlink-with-open-descriptor handling
1134  * must be performed.  If set to -1 the caller is deleting a PFS and we
1135  * check whether the chain is mounted or not (chain->pmp != NULL).  1 is
1136  * implied if it is mounted.
1137  *
1138  * If isopen is 1 and nlinks drops to 0 this function must move the chain
1139  * to a special hidden directory until last-close occurs on the file.
1140  *
1141  * NOTE!  The underlying file can still be active with open descriptors
1142  *	  or if the chain is being manually held (e.g. for rename).
1143  *
1144  *	  The caller is responsible for fixing up ip->chain if e.g. a
1145  *	  rename occurs (see chain_duplicate()).
1146  */
1147 int
1148 hammer2_unlink_file(hammer2_trans_t *trans, hammer2_inode_t *dip,
1149 		    const uint8_t *name, size_t name_len,
1150 		    int isdir, int *hlinkp, struct nchandle *nch)
1151 {
1152 	hammer2_inode_data_t *ipdata;
1153 	hammer2_chain_t *parent;
1154 	hammer2_chain_t *ochain;
1155 	hammer2_chain_t *chain;
1156 	hammer2_chain_t *dparent;
1157 	hammer2_chain_t *dchain;
1158 	hammer2_key_t key_dummy;
1159 	hammer2_key_t key_next;
1160 	hammer2_key_t lhc;
1161 	int error;
1162 	int cache_index = -1;
1163 	uint8_t type;
1164 
1165 	error = 0;
1166 	ochain = NULL;
1167 	lhc = hammer2_dirhash(name, name_len);
1168 
1169 	/*
1170 	 * Search for the filename in the directory
1171 	 */
1172 	if (hlinkp)
1173 		*hlinkp = 0;
1174 	parent = hammer2_inode_lock_ex(dip);
1175 	chain = hammer2_chain_lookup(&parent, &key_next,
1176 				     lhc, lhc + HAMMER2_DIRHASH_LOMASK,
1177 				     &cache_index, 0);
1178 	while (chain) {
1179 		if (chain->bref.type == HAMMER2_BREF_TYPE_INODE &&
1180 		    name_len == chain->data->ipdata.name_len &&
1181 		    bcmp(name, chain->data->ipdata.filename, name_len) == 0) {
1182 			break;
1183 		}
1184 		chain = hammer2_chain_next(&parent, chain, &key_next,
1185 					   key_next,
1186 					   lhc + HAMMER2_DIRHASH_LOMASK,
1187 					   &cache_index, 0);
1188 	}
1189 	hammer2_inode_unlock_ex(dip, NULL);	/* retain parent */
1190 
1191 	/*
1192 	 * Not found or wrong type (isdir < 0 disables the type check).
1193 	 * If a hardlink pointer, type checks use the hardlink target.
1194 	 */
1195 	if (chain == NULL) {
1196 		error = ENOENT;
1197 		goto done;
1198 	}
1199 	if ((type = chain->data->ipdata.type) == HAMMER2_OBJTYPE_HARDLINK) {
1200 		if (hlinkp)
1201 			*hlinkp = 1;
1202 		type = chain->data->ipdata.target_type;
1203 	}
1204 
1205 	if (type == HAMMER2_OBJTYPE_DIRECTORY && isdir == 0) {
1206 		error = ENOTDIR;
1207 		goto done;
1208 	}
1209 	if (type != HAMMER2_OBJTYPE_DIRECTORY && isdir >= 1) {
1210 		error = EISDIR;
1211 		goto done;
1212 	}
1213 
1214 	/*
1215 	 * Hardlink must be resolved.  We can't hold the parent locked
1216 	 * while we do this or we could deadlock.
1217 	 *
1218 	 * On success chain will be adjusted to point at the hardlink target
1219 	 * and ochain will point to the hardlink pointer in the original
1220 	 * directory.  Otherwise chain remains pointing to the original.
1221 	 */
1222 	if (chain->data->ipdata.type == HAMMER2_OBJTYPE_HARDLINK) {
1223 		hammer2_chain_unlock(parent);
1224 		parent = NULL;
1225 		error = hammer2_hardlink_find(dip, &chain, &ochain);
1226 	}
1227 
1228 	/*
1229 	 * If this is a directory the directory must be empty.  However, if
1230 	 * isdir < 0 we are doing a rename and the directory does not have
1231 	 * to be empty, and if isdir > 1 we are deleting a PFS/snapshot
1232 	 * and the directory does not have to be empty.
1233 	 *
1234 	 * NOTE: We check the full key range here which covers both visible
1235 	 *	 and invisible entries.  Theoretically there should be no
1236 	 *	 invisible (hardlink target) entries if there are no visible
1237 	 *	 entries.
1238 	 */
1239 	if (type == HAMMER2_OBJTYPE_DIRECTORY && isdir == 1) {
1240 		dparent = hammer2_chain_lookup_init(chain, 0);
1241 		dchain = hammer2_chain_lookup(&dparent, &key_dummy,
1242 					      0, (hammer2_key_t)-1,
1243 					      &cache_index,
1244 					      HAMMER2_LOOKUP_NODATA);
1245 		if (dchain) {
1246 			hammer2_chain_unlock(dchain);
1247 			hammer2_chain_lookup_done(dparent);
1248 			error = ENOTEMPTY;
1249 			goto done;
1250 		}
1251 		hammer2_chain_lookup_done(dparent);
1252 		dparent = NULL;
1253 		/* dchain NULL */
1254 	}
1255 
1256 	/*
1257 	 * Ok, we can now unlink the chain.  We always decrement nlinks even
1258 	 * if the entry can be deleted in case someone has the file open and
1259 	 * does an fstat().
1260 	 *
1261 	 * The chain itself will no longer be in the on-media topology but
1262 	 * can still be flushed to the media (e.g. if an open descriptor
1263 	 * remains).  When the last vnode/ip ref goes away the chain will
1264 	 * be marked unmodified, avoiding any further (now unnecesary) I/O.
1265 	 *
1266 	 * A non-NULL ochain indicates a hardlink.
1267 	 */
1268 	if (ochain) {
1269 		/*
1270 		 * Delete the original hardlink pointer unconditionally.
1271 		 * (any open descriptors will migrate to the hardlink
1272 		 * target and have no affect on this operation).
1273 		 *
1274 		 * NOTE: parent from above is NULL when ochain != NULL
1275 		 *	 so we can reuse it.
1276 		 */
1277 		hammer2_chain_lock(ochain, HAMMER2_RESOLVE_ALWAYS);
1278 		hammer2_chain_delete(trans, ochain, 0);
1279 		hammer2_chain_unlock(ochain);
1280 	}
1281 
1282 	/*
1283 	 * Decrement nlinks on the hardlink target (or original file if
1284 	 * there it was not hardlinked).  Delete the target when nlinks
1285 	 * reaches 0 with special handling if (isopen) is set.
1286 	 *
1287 	 * NOTE! In DragonFly the vnops function calls cache_unlink() after
1288 	 *	 calling us here to clean out the namecache association,
1289 	 *	 (which does not represent a ref for the open-test), and to
1290 	 *	 force finalization of the vnode if/when the last ref gets
1291 	 *	 dropped.
1292 	 *
1293 	 * NOTE! Files are unlinked by rename and then relinked.  nch will be
1294 	 *	 passed as NULL in this situation.  hammer2_inode_connect()
1295 	 *	 will bump nlinks.
1296 	 */
1297 	KKASSERT(chain != NULL);
1298 	hammer2_chain_modify(trans, &chain, 0);
1299 	ipdata = &chain->data->ipdata;
1300 	--ipdata->nlinks;
1301 	if ((int64_t)ipdata->nlinks < 0)	/* XXX debugging */
1302 		ipdata->nlinks = 0;
1303 	if (ipdata->nlinks == 0) {
1304 		if ((chain->flags & HAMMER2_CHAIN_PFSROOT) && chain->pmp) {
1305 			error = EINVAL;
1306 			kprintf("hammer2: PFS \"%s\" cannot be deleted "
1307 				"while still mounted\n",
1308 				ipdata->filename);
1309 			goto done;
1310 		}
1311 		if (nch && cache_isopen(nch)) {
1312 			kprintf("WARNING: unlinking open file\n");
1313 			atomic_set_int(&chain->flags, HAMMER2_CHAIN_UNLINKED);
1314 			hammer2_inode_move_to_hidden(trans, &chain,
1315 						     ipdata->inum);
1316 		} else {
1317 			hammer2_chain_delete(trans, chain, 0);
1318 		}
1319 	}
1320 	error = 0;
1321 done:
1322 	if (chain)
1323 		hammer2_chain_unlock(chain);
1324 	if (parent)
1325 		hammer2_chain_lookup_done(parent);
1326 	if (ochain)
1327 		hammer2_chain_drop(ochain);
1328 
1329 	return error;
1330 }
1331 
1332 /*
1333  * This is called from the mount code to initialize pmp->ihidden
1334  */
1335 void
1336 hammer2_inode_install_hidden(hammer2_pfsmount_t *pmp)
1337 {
1338 	hammer2_trans_t trans;
1339 	hammer2_chain_t *parent;
1340 	hammer2_chain_t *chain;
1341 	hammer2_chain_t *scan;
1342 	hammer2_inode_data_t *ipdata;
1343 	hammer2_key_t key_dummy;
1344 	hammer2_key_t key_next;
1345 	int cache_index;
1346 	int error;
1347 	int count;
1348 
1349 	if (pmp->ihidden)
1350 		return;
1351 
1352 	/*
1353 	 * Find the hidden directory
1354 	 */
1355 	bzero(&key_dummy, sizeof(key_dummy));
1356 	hammer2_trans_init(&trans, pmp, NULL, 0);
1357 
1358 	parent = hammer2_inode_lock_ex(pmp->iroot);
1359 	chain = hammer2_chain_lookup(&parent, &key_dummy,
1360 				     HAMMER2_INODE_HIDDENDIR,
1361 				     HAMMER2_INODE_HIDDENDIR,
1362 				     &cache_index, 0);
1363 	if (chain) {
1364 		pmp->ihidden = hammer2_inode_get(pmp, pmp->iroot, chain);
1365 		hammer2_inode_ref(pmp->ihidden);
1366 
1367 		/*
1368 		 * Remove any unlinked files which were left open as-of
1369 		 * any system crash.
1370 		 */
1371 		count = 0;
1372 		scan = hammer2_chain_lookup(&chain, &key_next,
1373 					    0, HAMMER2_MAX_TID,
1374 					    &cache_index,
1375 					    HAMMER2_LOOKUP_NODATA);
1376 		while (scan) {
1377 			if (scan->bref.type == HAMMER2_BREF_TYPE_INODE) {
1378 				hammer2_chain_delete(&trans, scan, 0);
1379 				++count;
1380 			}
1381 			scan = hammer2_chain_next(&chain, scan, &key_next,
1382 						   0, HAMMER2_MAX_TID,
1383 						   &cache_index,
1384 						   HAMMER2_LOOKUP_NODATA);
1385 		}
1386 
1387 		hammer2_inode_unlock_ex(pmp->ihidden, chain);
1388 		hammer2_inode_unlock_ex(pmp->iroot, parent);
1389 		hammer2_trans_done(&trans);
1390 		kprintf("hammer2: PFS loaded hidden dir, "
1391 			"removed %d dead entries\n", count);
1392 		return;
1393 	}
1394 
1395 	/*
1396 	 * Create the hidden directory
1397 	 */
1398 	error = hammer2_chain_create(&trans, &parent, &chain,
1399 				     HAMMER2_INODE_HIDDENDIR, 0,
1400 				     HAMMER2_BREF_TYPE_INODE,
1401 				     HAMMER2_INODE_BYTES);
1402 	hammer2_inode_unlock_ex(pmp->iroot, parent);
1403 	hammer2_chain_modify(&trans, &chain, 0);
1404 	ipdata = &chain->data->ipdata;
1405 	ipdata->type = HAMMER2_OBJTYPE_DIRECTORY;
1406 	ipdata->inum = HAMMER2_INODE_HIDDENDIR;
1407 	ipdata->nlinks = 1;
1408 	kprintf("hammer2: PFS root missing hidden directory, creating\n");
1409 
1410 	pmp->ihidden = hammer2_inode_get(pmp, pmp->iroot, chain);
1411 	hammer2_inode_ref(pmp->ihidden);
1412 	hammer2_inode_unlock_ex(pmp->ihidden, chain);
1413 	hammer2_trans_done(&trans);
1414 }
1415 
1416 /*
1417  * If an open file is unlinked H2 needs to retain the file in the topology
1418  * to ensure that its backing store is not recovered by the bulk free scan.
1419  * This also allows us to avoid having to special-case the CHAIN_DELETED flag.
1420  *
1421  * To do this the file is moved to a hidden directory in the PFS root and
1422  * renamed.  The hidden directory must be created if it does not exist.
1423  */
1424 static
1425 void
1426 hammer2_inode_move_to_hidden(hammer2_trans_t *trans, hammer2_chain_t **chainp,
1427 			     hammer2_tid_t inum)
1428 {
1429 	hammer2_chain_t *chain;
1430 	hammer2_chain_t *dchain;
1431 	hammer2_pfsmount_t *pmp;
1432 	int error;
1433 
1434 	chain = *chainp;
1435 	pmp = chain->pmp;
1436 	KKASSERT(pmp != NULL);
1437 	KKASSERT(pmp->ihidden != NULL);
1438 	hammer2_chain_delete(trans, chain, 0);
1439 
1440 	dchain = hammer2_inode_lock_ex(pmp->ihidden);
1441         error = hammer2_inode_connect(trans, chainp, 0,
1442                                       pmp->ihidden, &dchain,
1443 				      NULL, 0, inum);
1444 	hammer2_inode_unlock_ex(pmp->ihidden, dchain);
1445 	KKASSERT(error == 0);
1446 }
1447 
1448 /*
1449  * Given an exclusively locked inode and chain we consolidate its chain
1450  * for hardlink creation, adding (nlinks) to the file's link count and
1451  * potentially relocating the inode to a directory common to ip->pip and tdip.
1452  *
1453  * Replaces (*chainp) if consolidation occurred, unlocking the old chain
1454  * and returning a new locked chain.
1455  *
1456  * NOTE!  This function will also replace ip->chain.
1457  */
1458 int
1459 hammer2_hardlink_consolidate(hammer2_trans_t *trans,
1460 			     hammer2_inode_t *ip, hammer2_chain_t **chainp,
1461 			     hammer2_inode_t *cdip, hammer2_chain_t **cdchainp,
1462 			     int nlinks)
1463 {
1464 	hammer2_inode_data_t *ipdata;
1465 	hammer2_chain_t *chain;
1466 	hammer2_chain_t *nchain;
1467 	int error;
1468 
1469 	chain = *chainp;
1470 	if (nlinks == 0 &&			/* no hardlink needed */
1471 	    (chain->data->ipdata.name_key & HAMMER2_DIRHASH_VISIBLE)) {
1472 		return (0);
1473 	}
1474 	if (hammer2_hardlink_enable < 0) {	/* fake hardlinks */
1475 		return (0);
1476 	}
1477 
1478 	if (hammer2_hardlink_enable == 0) {	/* disallow hardlinks */
1479 		hammer2_chain_unlock(chain);
1480 		*chainp = NULL;
1481 		return (ENOTSUP);
1482 	}
1483 
1484 	/*
1485 	 * If no change in the hardlink's target directory is required and
1486 	 * this is already a hardlink target, all we need to do is adjust
1487 	 * the link count.
1488 	 */
1489 	if (cdip == ip->pip &&
1490 	    (chain->data->ipdata.name_key & HAMMER2_DIRHASH_VISIBLE) == 0) {
1491 		if (nlinks) {
1492 			hammer2_chain_modify(trans, &chain, 0);
1493 			chain->data->ipdata.nlinks += nlinks;
1494 		}
1495 		error = 0;
1496 		goto done;
1497 	}
1498 
1499 
1500 	/*
1501 	 * chain is the real inode.  If it's visible we have to convert it
1502 	 * to a hardlink pointer.  If it is not visible then it is already
1503 	 * a hardlink target and only needs to be deleted.
1504 	 */
1505 	KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0);
1506 	KKASSERT(chain->data->ipdata.type != HAMMER2_OBJTYPE_HARDLINK);
1507 	if (chain->data->ipdata.name_key & HAMMER2_DIRHASH_VISIBLE) {
1508 		/*
1509 		 * We are going to duplicate chain later, causing its
1510 		 * media block to be shifted to the duplicate.  Even though
1511 		 * we are delete-duplicating nchain here it might decide not
1512 		 * to reallocate the block.  Set FORCECOW to force it to.
1513 		 */
1514 		nchain = chain;
1515 		hammer2_chain_lock(nchain, HAMMER2_RESOLVE_ALWAYS);
1516 		atomic_set_int(&nchain->flags, HAMMER2_CHAIN_FORCECOW);
1517 		hammer2_chain_delete_duplicate(trans, &nchain,
1518 					       HAMMER2_DELDUP_RECORE);
1519 		KKASSERT((chain->flags & HAMMER2_CHAIN_DUPLICATED) == 0);
1520 
1521 		ipdata = &nchain->data->ipdata;
1522 		ipdata->target_type = ipdata->type;
1523 		ipdata->type = HAMMER2_OBJTYPE_HARDLINK;
1524 		ipdata->uflags = 0;
1525 		ipdata->rmajor = 0;
1526 		ipdata->rminor = 0;
1527 		ipdata->ctime = 0;
1528 		ipdata->mtime = 0;
1529 		ipdata->atime = 0;
1530 		ipdata->btime = 0;
1531 		bzero(&ipdata->uid, sizeof(ipdata->uid));
1532 		bzero(&ipdata->gid, sizeof(ipdata->gid));
1533 		ipdata->op_flags = HAMMER2_OPFLAG_DIRECTDATA;
1534 		ipdata->cap_flags = 0;
1535 		ipdata->mode = 0;
1536 		ipdata->size = 0;
1537 		ipdata->nlinks = 1;
1538 		ipdata->iparent = 0;	/* XXX */
1539 		ipdata->pfs_type = 0;
1540 		ipdata->pfs_inum = 0;
1541 		bzero(&ipdata->pfs_clid, sizeof(ipdata->pfs_clid));
1542 		bzero(&ipdata->pfs_fsid, sizeof(ipdata->pfs_fsid));
1543 		ipdata->data_quota = 0;
1544 		ipdata->data_count = 0;
1545 		ipdata->inode_quota = 0;
1546 		ipdata->inode_count = 0;
1547 		ipdata->attr_tid = 0;
1548 		ipdata->dirent_tid = 0;
1549 		bzero(&ipdata->u, sizeof(ipdata->u));
1550 		/* XXX transaction ids */
1551 	} else {
1552 		hammer2_chain_delete(trans, chain, 0);
1553 		nchain = NULL;
1554 	}
1555 
1556 	/*
1557 	 * chain represents the hardlink target and is now flagged deleted.
1558 	 * duplicate it to the parent directory and adjust nlinks.
1559 	 *
1560 	 * WARNING! The shiftup() call can cause nchain to be moved into
1561 	 *	    an indirect block, and our nchain will wind up pointing
1562 	 *	    to the older/original version.
1563 	 */
1564 	KKASSERT(chain->flags & HAMMER2_CHAIN_DELETED);
1565 	hammer2_hardlink_shiftup(trans, &chain, cdip, cdchainp, nlinks, &error);
1566 
1567 	if (error == 0)
1568 		hammer2_inode_repoint(ip, cdip, chain);
1569 
1570 	/*
1571 	 * Unlock the original chain last as the lock blocked races against
1572 	 * the creation of the new hardlink target.
1573 	 */
1574 	if (nchain)
1575 		hammer2_chain_unlock(nchain);
1576 
1577 done:
1578 	/*
1579 	 * Cleanup, chain/nchain already dealt with.
1580 	 */
1581 	*chainp = chain;
1582 	hammer2_inode_drop(cdip);
1583 
1584 	return (error);
1585 }
1586 
1587 /*
1588  * If (*ochainp) is non-NULL it points to the forward OBJTYPE_HARDLINK
1589  * inode while (*chainp) points to the resolved (hidden hardlink
1590  * target) inode.  In this situation when nlinks is 1 we wish to
1591  * deconsolidate the hardlink, moving it back to the directory that now
1592  * represents the only remaining link.
1593  */
1594 int
1595 hammer2_hardlink_deconsolidate(hammer2_trans_t *trans,
1596 			       hammer2_inode_t *dip,
1597 			       hammer2_chain_t **chainp,
1598 			       hammer2_chain_t **ochainp)
1599 {
1600 	if (*ochainp == NULL)
1601 		return (0);
1602 	/* XXX */
1603 	return (0);
1604 }
1605 
1606 /*
1607  * The caller presents a locked *chainp pointing to a HAMMER2_BREF_TYPE_INODE
1608  * with an obj_type of HAMMER2_OBJTYPE_HARDLINK.  This routine will gobble
1609  * the *chainp and return a new locked *chainp representing the file target
1610  * (the original *chainp will be unlocked).
1611  *
1612  * When a match is found the chain representing the original HARDLINK
1613  * will be returned in *ochainp with a ref, but not locked.
1614  *
1615  * When no match is found *chainp is set to NULL and EIO is returned.
1616  * (*ochainp) will still be set to the original chain with a ref but not
1617  * locked.
1618  */
1619 int
1620 hammer2_hardlink_find(hammer2_inode_t *dip, hammer2_chain_t **chainp,
1621 		      hammer2_chain_t **ochainp)
1622 {
1623 	hammer2_chain_t *chain = *chainp;
1624 	hammer2_chain_t *parent;
1625 	hammer2_inode_t *ip;
1626 	hammer2_inode_t *pip;
1627 	hammer2_key_t key_dummy;
1628 	hammer2_key_t lhc;
1629 	int cache_index = -1;
1630 
1631 	pip = dip;
1632 	hammer2_inode_ref(pip);		/* for loop */
1633 	hammer2_chain_ref(chain);	/* for (*ochainp) */
1634 	*ochainp = chain;
1635 
1636 	/*
1637 	 * Locate the hardlink.  pip is referenced and not locked,
1638 	 * ipp.
1639 	 *
1640 	 * chain is reused.
1641 	 */
1642 	lhc = chain->data->ipdata.inum;
1643 	hammer2_chain_unlock(chain);
1644 	chain = NULL;
1645 
1646 	while ((ip = pip) != NULL) {
1647 		parent = hammer2_inode_lock_ex(ip);
1648 		hammer2_inode_drop(ip);			/* loop */
1649 		KKASSERT(parent->bref.type == HAMMER2_BREF_TYPE_INODE);
1650 		chain = hammer2_chain_lookup(&parent, &key_dummy,
1651 					     lhc, lhc, &cache_index, 0);
1652 		hammer2_chain_lookup_done(parent);	/* discard parent */
1653 		if (chain)
1654 			break;
1655 		pip = ip->pip;		/* safe, ip held locked */
1656 		if (pip)
1657 			hammer2_inode_ref(pip);		/* loop */
1658 		hammer2_inode_unlock_ex(ip, NULL);
1659 	}
1660 
1661 	/*
1662 	 * chain is locked, ip is locked.  Unlock ip, return the locked
1663 	 * chain.  *ipp is already set w/a ref count and not locked.
1664 	 *
1665 	 * (parent is already unlocked).
1666 	 */
1667 	if (ip)
1668 		hammer2_inode_unlock_ex(ip, NULL);
1669 	*chainp = chain;
1670 	if (chain) {
1671 		KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_INODE);
1672 		/* already locked */
1673 		return (0);
1674 	} else {
1675 		return (EIO);
1676 	}
1677 }
1678 
1679 /*
1680  * Find the directory common to both fdip and tdip, hold and return
1681  * its inode.
1682  */
1683 hammer2_inode_t *
1684 hammer2_inode_common_parent(hammer2_inode_t *fdip, hammer2_inode_t *tdip)
1685 {
1686 	hammer2_inode_t *scan1;
1687 	hammer2_inode_t *scan2;
1688 
1689 	/*
1690 	 * We used to have a depth field but it complicated matters too
1691 	 * much for directory renames.  So now its ugly.  Check for
1692 	 * simple cases before giving up and doing it the expensive way.
1693 	 *
1694 	 * XXX need a bottom-up topology stability lock
1695 	 */
1696 	if (fdip == tdip || fdip == tdip->pip) {
1697 		hammer2_inode_ref(fdip);
1698 		return(fdip);
1699 	}
1700 	if (fdip->pip == tdip) {
1701 		hammer2_inode_ref(tdip);
1702 		return(tdip);
1703 	}
1704 
1705 	/*
1706 	 * XXX not MPSAFE
1707 	 */
1708 	for (scan1 = fdip; scan1->pmp == fdip->pmp; scan1 = scan1->pip) {
1709 		scan2 = tdip;
1710 		while (scan2->pmp == tdip->pmp) {
1711 			if (scan1 == scan2) {
1712 				hammer2_inode_ref(scan1);
1713 				return(scan1);
1714 			}
1715 			scan2 = scan2->pip;
1716 			if (scan2 == NULL)
1717 				break;
1718 		}
1719 	}
1720 	panic("hammer2_inode_common_parent: no common parent %p %p\n",
1721 	      fdip, tdip);
1722 	/* NOT REACHED */
1723 	return(NULL);
1724 }
1725 
1726 /*
1727  * Synchronize the inode's frontend state with the chain state prior
1728  * to any explicit flush of the inode or any strategy write call.
1729  *
1730  * Called with a locked inode.
1731  */
1732 void
1733 hammer2_inode_fsync(hammer2_trans_t *trans, hammer2_inode_t *ip,
1734 		    hammer2_chain_t **chainp)
1735 {
1736 	hammer2_inode_data_t *ipdata;
1737 	hammer2_chain_t *parent;
1738 	hammer2_chain_t *chain;
1739 	hammer2_key_t lbase;
1740 	hammer2_key_t key_next;
1741 	int cache_index;
1742 
1743 	ipdata = &ip->chain->data->ipdata;
1744 
1745 	if (ip->flags & HAMMER2_INODE_MTIME) {
1746 		ipdata = hammer2_chain_modify_ip(trans, ip, chainp, 0);
1747 		atomic_clear_int(&ip->flags, HAMMER2_INODE_MTIME);
1748 		ipdata->mtime = ip->mtime;
1749 	}
1750 	if ((ip->flags & HAMMER2_INODE_RESIZED) && ip->size < ipdata->size) {
1751 		ipdata = hammer2_chain_modify_ip(trans, ip, chainp, 0);
1752 		ipdata->size = ip->size;
1753 		atomic_clear_int(&ip->flags, HAMMER2_INODE_RESIZED);
1754 
1755 		/*
1756 		 * We must delete any chains beyond the EOF.  The chain
1757 		 * straddling the EOF will be pending in the bioq.
1758 		 */
1759 		lbase = (ipdata->size + HAMMER2_PBUFMASK64) &
1760 			~HAMMER2_PBUFMASK64;
1761 		parent = hammer2_chain_lookup_init(ip->chain, 0);
1762 		chain = hammer2_chain_lookup(&parent, &key_next,
1763 					     lbase, (hammer2_key_t)-1,
1764 					     &cache_index,
1765 					     HAMMER2_LOOKUP_NODATA);
1766 		while (chain) {
1767 			/*
1768 			 * Degenerate embedded case, nothing to loop on
1769 			 */
1770 			if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
1771 				hammer2_chain_unlock(chain);
1772 				break;
1773 			}
1774 			if (chain->bref.type == HAMMER2_BREF_TYPE_DATA) {
1775 				hammer2_chain_delete(trans, chain, 0);
1776 			}
1777 			chain = hammer2_chain_next(&parent, chain, &key_next,
1778 						   key_next, (hammer2_key_t)-1,
1779 						   &cache_index,
1780 						   HAMMER2_LOOKUP_NODATA);
1781 		}
1782 		hammer2_chain_lookup_done(parent);
1783 	} else
1784 	if ((ip->flags & HAMMER2_INODE_RESIZED) && ip->size > ipdata->size) {
1785 		ipdata = hammer2_chain_modify_ip(trans, ip, chainp, 0);
1786 		ipdata->size = ip->size;
1787 		atomic_clear_int(&ip->flags, HAMMER2_INODE_RESIZED);
1788 
1789 		/*
1790 		 * When resizing larger we may not have any direct-data
1791 		 * available.
1792 		 */
1793 		if ((ipdata->op_flags & HAMMER2_OPFLAG_DIRECTDATA) &&
1794 		    ip->size > HAMMER2_EMBEDDED_BYTES) {
1795 			ipdata->op_flags &= ~HAMMER2_OPFLAG_DIRECTDATA;
1796 			bzero(&ipdata->u.blockset, sizeof(ipdata->u.blockset));
1797 		}
1798 	}
1799 }
1800