xref: /dragonfly/sys/vfs/hammer2/hammer2_chain.c (revision 20c2db9a)
1 /*
2  * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in
16  *    the documentation and/or other materials provided with the
17  *    distribution.
18  * 3. Neither the name of The DragonFly Project nor the names of its
19  *    contributors may be used to endorse or promote products derived
20  *    from this software without specific, prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
26  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 /*
36  * This subsystem handles direct and indirect block searches, recursions,
37  * creation, and deletion.  Chains of blockrefs are tracked and modifications
38  * are flagged for propagation... eventually all the way back to the volume
39  * header.  Any chain except the volume header can be flushed to disk at
40  * any time... none of it matters until the volume header is dealt with
41  * (which is not here, see hammer2_vfsops.c for the volume header disk
42  * sequencing).
43  *
44  * Serialized flushes are not handled here, see hammer2_flush.c.  This module
45  * can essentially work on the current version of data, which can be in memory
46  * as well as on-disk due to the above.  However, we are responsible for
47  * making a copy of the state when a modified chain is part of a flush
48  * and we attempt to modify it again before the flush gets to it.  In that
49  * situation we create an allocated copy of the state that the flush can
50  * deal with.  If a chain undergoing deletion is part of a flush it is
51  * marked DELETED and its bref index is kept intact for the flush, but the
 * chain is thereafter ignored by this module because it is no longer
53  * current.
54  */
55 #include <sys/cdefs.h>
56 #include <sys/param.h>
57 #include <sys/systm.h>
58 #include <sys/types.h>
59 #include <sys/lock.h>
60 #include <sys/uuid.h>
61 
62 #include "hammer2.h"
63 
64 static int hammer2_indirect_optimize;	/* XXX SYSCTL */
65 
66 static hammer2_chain_t *hammer2_chain_create_indirect(
67 			hammer2_mount_t *hmp, hammer2_chain_t *parent,
68 			hammer2_key_t key, int keybits,
69 			int *errorp);
70 
71 /*
72  * We use a red-black tree to guarantee safe lookups under shared locks.
73  */
74 RB_GENERATE(hammer2_chain_tree, hammer2_chain, rbnode, hammer2_chain_cmp);
75 
76 int
77 hammer2_chain_cmp(hammer2_chain_t *chain1, hammer2_chain_t *chain2)
78 {
79 	return(chain2->index - chain1->index);
80 }
81 
/*
 * Recursively mark the parent chain elements so flushes can find
 * modified elements.  Stop when we hit a chain already flagged
 * SUBMODIFIED, but ignore the SUBMODIFIED bit that might be set
 * in chain itself.
 *
 * SUBMODIFIED is not set on the chain passed in.
 *
 * The chain->cst.spin lock can be held to stabilize the chain->parent
 * pointer.  The first parent is stabilized by virtue of chain being
 * fully locked.
 *
 * NOTE(review): the unlocked SUBMODIFIED pre-check is only applied to
 *		 the first parent; the interior loop re-flags every
 *		 ancestor up to the root unconditionally — confirm this
 *		 early-out behavior is as intended.
 */
void
hammer2_chain_parent_setsubmod(hammer2_mount_t *hmp, hammer2_chain_t *chain)
{
	hammer2_chain_t *parent;

	parent = chain->parent;
	if (parent && (parent->flags & HAMMER2_CHAIN_SUBMODIFIED) == 0) {
		spin_lock(&parent->cst.spin);
		for (;;) {
			/* flag this ancestor so the flusher descends into it */
			atomic_set_int(&parent->flags,
				       HAMMER2_CHAIN_SUBMODIFIED);
			if ((chain = parent->parent) == NULL)
				break;
			/*
			 * Hand-over-hand: acquire the grandparent's spin
			 * lock before releasing the parent's so the
			 * ->parent linkage stays stable across the step.
			 */
			spin_lock(&chain->cst.spin);	/* upward interlock */
			spin_unlock(&parent->cst.spin);
			parent = chain;
		}
		spin_unlock(&parent->cst.spin);
	}
}
114 
/*
 * Allocate a new disconnected chain element representing the specified
 * bref.  The chain element is locked exclusively and refs is set to 1.
 * Media data (data) and meta-structure (u) pointers are left NULL.
 *
 * This essentially allocates a system memory structure representing one
 * of the media structure types, including inodes.
 *
 * The returned chain is not yet attached to any parent tree
 * (chain->index is initialized to -1).
 */
hammer2_chain_t *
hammer2_chain_alloc(hammer2_mount_t *hmp, hammer2_blockref_t *bref)
{
	hammer2_chain_t *chain;
	/* allocation size is encoded as a radix in the low bits of data_off */
	u_int bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);

	/*
	 * Construct the appropriate system structure.
	 */
	switch(bref->type) {
	case HAMMER2_BREF_TYPE_INODE:
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_FREEMAP_ROOT:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_DATA:
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
		chain = kmalloc(sizeof(*chain), hmp->mchain, M_WAITOK | M_ZERO);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		/* volume header is embedded in the mount, never allocated here */
		chain = NULL;
		panic("hammer2_chain_alloc volume type illegal for op");
	default:
		chain = NULL;
		panic("hammer2_chain_alloc: unrecognized blockref type: %d",
		      bref->type);
	}

	/*
	 * Only set bref_flush if the bref has a real media offset, otherwise
	 * the caller has to wait for the chain to be modified/block-allocated
	 * before a blockref can be synchronized with its (future) parent.
	 */
	chain->bref = *bref;
	if (bref->data_off & ~HAMMER2_OFF_MASK_RADIX)
		chain->bref_flush = *bref;
	chain->index = -1;		/* not yet assigned */
	chain->refs = 1;
	chain->bytes = bytes;
	/* chain is returned exclusively locked for the caller */
	ccms_cst_init(&chain->cst, chain);
	ccms_thread_lock(&chain->cst, CCMS_STATE_EXCLUSIVE);

	return (chain);
}
166 
/*
 * Deallocate a chain (the step before freeing it).  Remove the chain from
 * its parent's tree.
 *
 * Caller must hold the parent and the chain exclusively locked, and
 * chain->refs must be 0.
 *
 * This function unlocks, removes, and destroys chain, and will recursively
 * destroy any sub-chains under chain (whose refs must also be 0 at this
 * point).
 *
 * parent can be NULL.
 */
static void
hammer2_chain_dealloc(hammer2_mount_t *hmp, hammer2_chain_t *chain)
{
	hammer2_chain_t *parent;
	hammer2_chain_t *child;

	KKASSERT(chain->refs == 0);
	KKASSERT(chain->flushing == 0);
	KKASSERT((chain->flags &
		  (HAMMER2_CHAIN_MOVED | HAMMER2_CHAIN_MODIFIED)) == 0);

	/*
	 * If the sub-tree is not empty all the elements on it must have
	 * 0 refs and be deallocatable.
	 */
	while ((child = RB_ROOT(&chain->rbhead)) != NULL) {
		/* lock each child exclusively before recursing into it */
		ccms_thread_lock(&child->cst, CCMS_STATE_EXCLUSIVE);
		hammer2_chain_dealloc(hmp, child);
	}

	/*
	 * If the DELETED flag is not set the chain must be removed from
	 * its parent's tree.
	 *
	 * WARNING! chain->cst.spin must be held when chain->parent is
	 *	    modified, even though we own the full blown lock,
	 *	    to deal with setsubmod and rename races.
	 */
	if (chain->flags & HAMMER2_CHAIN_ONRBTREE) {
		spin_lock(&chain->cst.spin);	/* shouldn't be needed */
		parent = chain->parent;
		RB_REMOVE(hammer2_chain_tree, &parent->rbhead, chain);
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
		chain->parent = NULL;
		spin_unlock(&chain->cst.spin);
	}
	/* releases embedded data, unlocks the cst, and kfree()s the chain */
	hammer2_chain_free(hmp, chain);
}
218 
219 /*
220  * Free a disconnected chain element
221  */
222 void
223 hammer2_chain_free(hammer2_mount_t *hmp, hammer2_chain_t *chain)
224 {
225 	switch(chain->bref.type) {
226 	case HAMMER2_BREF_TYPE_VOLUME:
227 		chain->data = NULL;
228 		break;
229 	case HAMMER2_BREF_TYPE_INODE:
230 		if (chain->data) {
231 			kfree(chain->data, hmp->minode);
232 			chain->data = NULL;
233 		}
234 		break;
235 	default:
236 		KKASSERT(chain->data == NULL);
237 		break;
238 	}
239 
240 	KKASSERT(chain->bp == NULL);
241 
242 	ccms_thread_unlock(&chain->cst);
243 	KKASSERT(chain->cst.count == 0);
244 	KKASSERT(chain->cst.upgrade == 0);
245 
246 	kfree(chain, hmp->mchain);
247 }
248 
249 /*
250  * Add a reference to a chain element, preventing its destruction.
251  *
252  * The parent chain must be locked shared or exclusive or otherwise be
253  * stable and already have a reference.
254  */
255 void
256 hammer2_chain_ref(hammer2_mount_t *hmp, hammer2_chain_t *chain)
257 {
258 	u_int refs;
259 
260 	while (chain) {
261 		refs = chain->refs;
262 		KKASSERT(chain->refs >= 0);
263 		cpu_ccfence();
264 		if (refs == 0) {
265 			/*
266 			 * 0 -> 1 transition must bump the refs on the parent
267 			 * too.  The caller has stabilized the parent.
268 			 */
269 			if (atomic_cmpset_int(&chain->refs, 0, 1)) {
270 				chain = chain->parent;
271 				KKASSERT(chain == NULL || chain->refs > 0);
272 			}
273 			/* retry or continue along the parent chain */
274 		} else {
275 			/*
276 			 * N -> N+1
277 			 */
278 			if (atomic_cmpset_int(&chain->refs, refs, refs + 1))
279 				break;
280 			/* retry */
281 		}
282 	}
283 }
284 
/*
 * Drop the callers reference to the chain element.  If the ref count
 * reaches zero we attempt to recursively drop the parent.
 *
 * MOVED and MODIFIED elements hold additional references so it should not
 * be possible for the count on a modified element to drop to 0.
 *
 * The chain element must NOT be locked by the caller on the 1->0 transition.
 *
 * The parent might or might not be locked by the caller.  If we are unable
 * to lock the parent on the 1->0 transition the destruction of the chain
 * will be deferred but we still recurse upward and drop the ref on the
 * parent (see the lastdrop() function)
 */
static hammer2_chain_t *hammer2_chain_lastdrop(hammer2_mount_t *hmp,
						hammer2_chain_t *chain);

void
hammer2_chain_drop(hammer2_mount_t *hmp, hammer2_chain_t *chain)
{
	u_int refs;

	while (chain) {
		refs = chain->refs;
		cpu_ccfence();	/* snapshot refs once per iteration */
		KKASSERT(refs > 0);
		if (refs == 1) {
			/*
			 * (1) lastdrop successfully drops the chain to 0
			 *     refs and may or may not have destroyed it.
			 *     lastdrop will return the parent so we can
			 *     recursively drop the implied ref from the
			 *     1->0 transition.
			 *
			 * (2) lastdrop fails to transition refs from 1 to 0
			 *     and returns the same chain, we retry.
			 */
			chain = hammer2_chain_lastdrop(hmp, chain);
		} else {
			if (atomic_cmpset_int(&chain->refs, refs, refs - 1)) {
				/*
				 * Succeeded, count did not reach zero so
				 * cut out of the loop.
				 */
				break;
			}
			/* cmpset raced a concurrent ref/drop; retry */
		}
	}
}
335 
/*
 * Handle SMP races during the last drop.  We must obtain a lock on
 * chain->parent to stabilize the last pointer reference to chain
 * (if applicable).  This reference does not have a parallel ref count,
 * that is idle chains in the topology can have a ref count of 0.
 *
 * The 1->0 transition implies a ref on the parent.
 *
 * Returns the parent (possibly NULL) if the 1->0 transition succeeded,
 * or the same chain if the transition raced and must be retried.
 */
static
hammer2_chain_t *
hammer2_chain_lastdrop(hammer2_mount_t *hmp, hammer2_chain_t *chain)
{
	hammer2_chain_t *parent;

	/*
	 * Stabilize chain->parent with the chain cst's spinlock.
	 * (parent can be NULL here).
	 *
	 * cst.spin locks are allowed to be nested bottom-up (the reverse
	 * of the normal top-down for full-blown cst locks), so this also
	 * allows us to attempt to obtain the parent's cst lock non-blocking
	 * (which must acquire the parent's spinlock unconditionally) while
	 * we are still holding the chain's spinlock.
	 */
	spin_lock(&chain->cst.spin);
	parent = chain->parent;

	/*
	 * If chain->flushing is non-zero we cannot deallocate the chain
	 * here.  The flushing field will be serialized for the inline
	 * unlock made by the flusher itself and we don't care about races
	 * in any other situation because the thread lock on the parent
	 * will fail in other situations.
	 *
	 * If we have a non-NULL parent but cannot acquire its thread
	 * lock, we also cannot deallocate the chain.
	 */
	if (chain->flushing ||
	    (parent && ccms_thread_lock_nonblock(&parent->cst,
						 CCMS_STATE_EXCLUSIVE))) {
		/* drop to 0 refs without destroying; destruction deferred */
		if (atomic_cmpset_int(&chain->refs, 1, 0)) {
			spin_unlock(&chain->cst.spin);	/* success */
			return(parent);
		} else {
			spin_unlock(&chain->cst.spin);	/* failure */
			return(chain);
		}
	}
	spin_unlock(&chain->cst.spin);

	/*
	 * With the parent now held we control the last pointer reference
	 * to chain ONLY IF this is the 1->0 drop.  If we fail to transition
	 * from 1->0 we raced a refs change and must retry at chain.
	 */
	if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) {
		/* failure */
		if (parent)
			ccms_thread_unlock(&parent->cst);
		return(chain);
	}

	/*
	 * Ok, we succeeded.  We now own the implied ref on the parent
	 * associated with the 1->0 transition of the child.  It should not
	 * be possible for ANYTHING to access the child now, as we own the
	 * lock on the parent, so we should be able to safely lock the
	 * child and destroy it.
	 */
	ccms_thread_lock(&chain->cst, CCMS_STATE_EXCLUSIVE);
	hammer2_chain_dealloc(hmp, chain);

	/*
	 * We want to return parent with its implied ref to the caller
	 * to recurse and drop the parent.
	 */
	if (parent)
		ccms_thread_unlock(&parent->cst);
	return (parent);
}
416 
417 /*
418  * Ref and lock a chain element, acquiring its data with I/O if necessary,
419  * and specify how you would like the data to be resolved.
420  *
421  * Returns 0 on success or an error code if the data could not be acquired.
422  * The chain element is locked either way.
423  *
424  * The lock is allowed to recurse, multiple locking ops will aggregate
425  * the requested resolve types.  Once data is assigned it will not be
426  * removed until the last unlock.
427  *
428  * HAMMER2_RESOLVE_NEVER - Do not resolve the data element.
429  *			   (typically used to avoid device/logical buffer
430  *			    aliasing for data)
431  *
432  * HAMMER2_RESOLVE_MAYBE - Do not resolve data elements for chains in
433  *			   the INITIAL-create state (indirect blocks only).
434  *
435  *			   Do not resolve data elements for DATA chains.
436  *			   (typically used to avoid device/logical buffer
437  *			    aliasing for data)
438  *
439  * HAMMER2_RESOLVE_ALWAYS- Always resolve the data element.
440  *
441  * HAMMER2_RESOLVE_SHARED- (flag) The chain is locked shared, otherwise
442  *			   it will be locked exclusive.
443  *
444  * NOTE: Embedded elements (volume header, inodes) are always resolved
445  *	 regardless.
446  *
447  * NOTE: Specifying HAMMER2_RESOLVE_ALWAYS on a newly-created non-embedded
448  *	 element will instantiate and zero its buffer, and flush it on
449  *	 release.
450  *
451  * NOTE: (data) elements are normally locked RESOLVE_NEVER or RESOLVE_MAYBE
452  *	 so as not to instantiate a device buffer, which could alias against
453  *	 a logical file buffer.  However, if ALWAYS is specified the
454  *	 device buffer will be instantiated anyway.
455  */
456 int
457 hammer2_chain_lock(hammer2_mount_t *hmp, hammer2_chain_t *chain, int how)
458 {
459 	hammer2_blockref_t *bref;
460 	hammer2_off_t pbase;
461 	hammer2_off_t peof;
462 	ccms_state_t ostate;
463 	size_t boff;
464 	size_t bbytes;
465 	int error;
466 	char *bdata;
467 
468 	/*
469 	 * Ref and lock the element.  Recursive locks are allowed.
470 	 */
471 	hammer2_chain_ref(hmp, chain);
472 	if (how & HAMMER2_RESOLVE_SHARED)
473 		ccms_thread_lock(&chain->cst, CCMS_STATE_SHARED);
474 	else
475 		ccms_thread_lock(&chain->cst, CCMS_STATE_EXCLUSIVE);
476 
477 	/*
478 	 * If we already have a valid data pointer no further action is
479 	 * necessary.
480 	 */
481 	if (chain->data)
482 		return (0);
483 
484 	/*
485 	 * Do we have to resolve the data?
486 	 */
487 	switch(how & HAMMER2_RESOLVE_MASK) {
488 	case HAMMER2_RESOLVE_NEVER:
489 		return(0);
490 	case HAMMER2_RESOLVE_MAYBE:
491 		if (chain->flags & HAMMER2_CHAIN_INITIAL)
492 			return(0);
493 		if (chain->bref.type == HAMMER2_BREF_TYPE_DATA)
494 			return(0);
495 		if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF)
496 			return(0);
497 		/* fall through */
498 	case HAMMER2_RESOLVE_ALWAYS:
499 		break;
500 	}
501 
502 	/*
503 	 * Upgrade to an exclusive lock so we can safely manipulate the
504 	 * buffer cache.  If another thread got to it before us we
505 	 * can just return.
506 	 */
507 	ostate = ccms_thread_lock_upgrade(&chain->cst);
508 	if (chain->data) {
509 		ccms_thread_lock_restore(&chain->cst, ostate);
510 		return (0);
511 	}
512 
513 	/*
514 	 * We must resolve to a device buffer, either by issuing I/O or
515 	 * by creating a zero-fill element.  We do not mark the buffer
516 	 * dirty when creating a zero-fill element (the hammer2_chain_modify()
517 	 * API must still be used to do that).
518 	 *
519 	 * The device buffer is variable-sized in powers of 2 down
520 	 * to HAMMER2_MINALLOCSIZE (typically 1K).  A 64K physical storage
521 	 * chunk always contains buffers of the same size. (XXX)
522 	 *
523 	 * The minimum physical IO size may be larger than the variable
524 	 * block size.
525 	 */
526 	bref = &chain->bref;
527 
528 	if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
529 		bbytes = HAMMER2_MINIOSIZE;
530 	pbase = bref->data_off & ~(hammer2_off_t)(bbytes - 1);
531 	peof = (pbase + HAMMER2_PBUFSIZE64) & ~HAMMER2_PBUFMASK64;
532 	boff = bref->data_off & HAMMER2_OFF_MASK & (bbytes - 1);
533 	KKASSERT(pbase != 0);
534 
535 	/*
536 	 * The getblk() optimization can only be used on newly created
537 	 * elements if the physical block size matches the request.
538 	 */
539 	if ((chain->flags & HAMMER2_CHAIN_INITIAL) &&
540 	    chain->bytes == bbytes) {
541 		chain->bp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
542 		error = 0;
543 	} else if (hammer2_cluster_enable) {
544 		error = cluster_read(hmp->devvp, peof, pbase, bbytes,
545 				     HAMMER2_PBUFSIZE, HAMMER2_PBUFSIZE,
546 				     &chain->bp);
547 	} else {
548 		error = bread(hmp->devvp, pbase, bbytes, &chain->bp);
549 	}
550 
551 	if (error) {
552 		kprintf("hammer2_chain_get: I/O error %016jx: %d\n",
553 			(intmax_t)pbase, error);
554 		bqrelse(chain->bp);
555 		chain->bp = NULL;
556 		ccms_thread_lock_restore(&chain->cst, ostate);
557 		return (error);
558 	}
559 
560 	/*
561 	 * Zero the data area if the chain is in the INITIAL-create state.
562 	 * Mark the buffer for bdwrite().
563 	 */
564 	bdata = (char *)chain->bp->b_data + boff;
565 	if (chain->flags & HAMMER2_CHAIN_INITIAL) {
566 		bzero(bdata, chain->bytes);
567 		atomic_set_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
568 	}
569 
570 	/*
571 	 * Setup the data pointer, either pointing it to an embedded data
572 	 * structure and copying the data from the buffer, or pointing it
573 	 * into the buffer.
574 	 *
575 	 * The buffer is not retained when copying to an embedded data
576 	 * structure in order to avoid potential deadlocks or recursions
577 	 * on the same physical buffer.
578 	 */
579 	switch (bref->type) {
580 	case HAMMER2_BREF_TYPE_VOLUME:
581 		/*
582 		 * Copy data from bp to embedded buffer
583 		 */
584 		panic("hammer2_chain_lock: called on unresolved volume header");
585 #if 0
586 		/* NOT YET */
587 		KKASSERT(pbase == 0);
588 		KKASSERT(chain->bytes == HAMMER2_PBUFSIZE);
589 		bcopy(bdata, &hmp->voldata, chain->bytes);
590 		chain->data = (void *)&hmp->voldata;
591 		bqrelse(chain->bp);
592 		chain->bp = NULL;
593 #endif
594 		break;
595 	case HAMMER2_BREF_TYPE_INODE:
596 		/*
597 		 * Copy data from bp to embedded buffer, do not retain the
598 		 * device buffer.
599 		 */
600 		KKASSERT(chain->bytes == sizeof(chain->data->ipdata));
601 		chain->data = kmalloc(sizeof(chain->data->ipdata),
602 				      hmp->minode, M_WAITOK | M_ZERO);
603 		bcopy(bdata, &chain->data->ipdata, chain->bytes);
604 		bqrelse(chain->bp);
605 		chain->bp = NULL;
606 		break;
607 	case HAMMER2_BREF_TYPE_INDIRECT:
608 	case HAMMER2_BREF_TYPE_DATA:
609 	case HAMMER2_BREF_TYPE_FREEMAP_ROOT:
610 	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
611 	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
612 	default:
613 		/*
614 		 * Point data at the device buffer and leave bp intact.
615 		 */
616 		chain->data = (void *)bdata;
617 		break;
618 	}
619 
620 	/*
621 	 * Make sure the bp is not specifically owned by this thread before
622 	 * restoring to a possibly shared lock, so another hammer2 thread
623 	 * can release it.
624 	 */
625 	if (chain->bp)
626 		BUF_KERNPROC(chain->bp);
627 	ccms_thread_lock_restore(&chain->cst, ostate);
628 	return (0);
629 }
630 
/*
 * Unlock and deref a chain element.
 *
 * On the last lock release any non-embedded data (chain->bp) will be
 * retired: written via bdwrite()/cluster_awrite() if DIRTYBP is set,
 * otherwise released back to the buffer cache.
 */
void
hammer2_chain_unlock(hammer2_mount_t *hmp, hammer2_chain_t *chain)
{
	long *counterp;

	/*
	 * Release the CST lock but with a special 1->0 transition case.
	 *
	 * Returns non-zero if lock references remain.  When zero is
	 * returned the last lock reference is retained and any shared
	 * lock is upgraded to an exclusive lock for final disposition.
	 */
	if (ccms_thread_unlock_zero(&chain->cst)) {
		KKASSERT(chain->refs > 1);
		atomic_add_int(&chain->refs, -1);
		return;
	}

	/*
	 * Shortcut the case if the data is embedded or not resolved.
	 *
	 * Do NOT NULL out chain->data (e.g. inode data), it might be
	 * dirty.
	 *
	 * The DIRTYBP flag is non-applicable in this situation and can
	 * be cleared to keep the flags state clean.
	 */
	if (chain->bp == NULL) {
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
		ccms_thread_unlock(&chain->cst);
		hammer2_chain_drop(hmp, chain);
		return;
	}

	/*
	 * Statistics: bump the appropriate write counter by bref type,
	 * distinguishing async-flush (IOFLUSH) writes from normal delayed
	 * writes.  Clean buffers are not counted.
	 */
	if ((chain->flags & HAMMER2_CHAIN_DIRTYBP) == 0) {
		;
	} else if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
		switch(chain->bref.type) {
		case HAMMER2_BREF_TYPE_DATA:
			counterp = &hammer2_ioa_file_write;
			break;
		case HAMMER2_BREF_TYPE_INODE:
			counterp = &hammer2_ioa_meta_write;
			break;
		case HAMMER2_BREF_TYPE_INDIRECT:
			counterp = &hammer2_ioa_indr_write;
			break;
		case HAMMER2_BREF_TYPE_FREEMAP_ROOT:
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
			counterp = &hammer2_ioa_fmap_write;
			break;
		default:
			counterp = &hammer2_ioa_volu_write;
			break;
		}
		++*counterp;
	} else {
		switch(chain->bref.type) {
		case HAMMER2_BREF_TYPE_DATA:
			counterp = &hammer2_iod_file_write;
			break;
		case HAMMER2_BREF_TYPE_INODE:
			counterp = &hammer2_iod_meta_write;
			break;
		case HAMMER2_BREF_TYPE_INDIRECT:
			counterp = &hammer2_iod_indr_write;
			break;
		case HAMMER2_BREF_TYPE_FREEMAP_ROOT:
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
			counterp = &hammer2_iod_fmap_write;
			break;
		default:
			counterp = &hammer2_iod_volu_write;
			break;
		}
		++*counterp;
	}

	/*
	 * Clean out the bp.
	 *
	 * If a device buffer was used for data be sure to destroy the
	 * buffer when we are done to avoid aliases (XXX what about the
	 * underlying VM pages?).
	 *
	 * NOTE: Freemap leaf's use reserved blocks and thus no aliasing
	 *	 is possible.
	 */
	if (chain->bref.type == HAMMER2_BREF_TYPE_DATA)
		chain->bp->b_flags |= B_RELBUF;

	/*
	 * The DIRTYBP flag tracks whether we have to bdwrite() the buffer
	 * or not.  The flag will get re-set when chain_modify() is called,
	 * even if MODIFIED is already set, allowing the OS to retire the
	 * buffer independent of a hammer2 flush.
	 */
	chain->data = NULL;
	if (chain->flags & HAMMER2_CHAIN_DIRTYBP) {
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
		if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
			/* async-flush: write now and toss the buffer */
			atomic_clear_int(&chain->flags,
					 HAMMER2_CHAIN_IOFLUSH);
			chain->bp->b_flags |= B_RELBUF;
			cluster_awrite(chain->bp);
		} else {
			/* normal delayed write, clusterable */
			chain->bp->b_flags |= B_CLUSTEROK;
			bdwrite(chain->bp);
		}
	} else {
		if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
			/* clean but flagged for discard */
			atomic_clear_int(&chain->flags,
					 HAMMER2_CHAIN_IOFLUSH);
			chain->bp->b_flags |= B_RELBUF;
			brelse(chain->bp);
		} else {
			/* bp might still be dirty */
			bqrelse(chain->bp);
		}
	}
	chain->bp = NULL;
	ccms_thread_unlock(&chain->cst);
	hammer2_chain_drop(hmp, chain);
}
766 
/*
 * Resize the chain's physical storage allocation.  Chains can be resized
 * smaller without reallocating the storage.  Resizing larger will reallocate
 * the storage.
 *
 * Must be passed a locked chain.
 *
 * If you want the resize code to copy the data to the new block then the
 * caller should lock the chain RESOLVE_MAYBE or RESOLVE_ALWAYS.
 *
 * If the caller already holds a logical buffer containing the data and
 * intends to bdwrite() that buffer resolve with RESOLVE_NEVER.  The resize
 * operation will then not copy the data.
 *
 * This function is mostly used with DATA blocks locked RESOLVE_NEVER in order
 * to avoid instantiating a device buffer that conflicts with the vnode
 * data buffer.
 *
 * nradix is log2 of the new size in bytes.
 *
 * XXX flags currently ignored, uses chain->bp to detect data/no-data.
 */
void
hammer2_chain_resize(hammer2_inode_t *ip, hammer2_chain_t *chain,
		     int nradix, int flags)
{
	hammer2_mount_t *hmp = ip->hmp;
	struct buf *nbp;
	hammer2_off_t pbase;
	size_t obytes;
	size_t nbytes;
	size_t bbytes;
	int boff;
	char *bdata;
	int error;

	/*
	 * Only data and indirect blocks can be resized for now.
	 * (The volu root, inodes, and freemap elements use a fixed size).
	 */
	KKASSERT(chain != &hmp->vchain);
	KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA ||
		 chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT);

	/*
	 * Nothing to do if the element is already the proper size
	 */
	obytes = chain->bytes;
	nbytes = 1U << nradix;
	if (obytes == nbytes)
		return;

	/*
	 * Set MODIFIED and add a chain ref to prevent destruction.  Both
	 * modified flags share the same ref.
	 *
	 * If the chain is already marked MODIFIED then we can safely
	 * return the previous allocation to the pool without having to
	 * worry about snapshots.
	 */
	if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0) {
		atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED);
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED |
					      HAMMER2_CHAIN_MODIFY_TID);
		hammer2_chain_ref(hmp, chain);
	} else {
		hammer2_freemap_free(hmp, chain->bref.data_off,
				     chain->bref.type);
	}

	/*
	 * Relocate the block, even if making it smaller (because different
	 * block sizes may be in different regions).
	 */
	chain->bref.data_off = hammer2_freemap_alloc(hmp, chain->bref.type,
						     nbytes);
	chain->bytes = nbytes;
	/*ip->delta_dcount += (ssize_t)(nbytes - obytes);*/ /* XXX atomic */

	/*
	 * The device buffer may be larger than the allocation size.
	 */
	if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
		bbytes = HAMMER2_MINIOSIZE;
	pbase = chain->bref.data_off & ~(hammer2_off_t)(bbytes - 1);
	boff = chain->bref.data_off & HAMMER2_OFF_MASK & (bbytes - 1);

	/*
	 * Only copy the data if resolved, otherwise the caller is
	 * responsible.
	 */
	if (chain->bp) {
		KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
			 chain->bref.type == HAMMER2_BREF_TYPE_DATA);
		KKASSERT(chain != &hmp->vchain);	/* safety */

		/*
		 * The getblk() optimization can only be used if the
		 * physical block size matches the request.
		 */
		if (nbytes == bbytes) {
			nbp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
			error = 0;
		} else {
			error = bread(hmp->devvp, pbase, bbytes, &nbp);
			KKASSERT(error == 0);
		}
		bdata = (char *)nbp->b_data + boff;

		/* copy old contents, zero-fill any growth */
		if (nbytes < obytes) {
			bcopy(chain->data, bdata, nbytes);
		} else {
			bcopy(chain->data, bdata, obytes);
			bzero(bdata + obytes, nbytes - obytes);
		}

		/*
		 * NOTE: The INITIAL state of the chain is left intact.
		 *	 We depend on hammer2_chain_modify() to do the
		 *	 right thing.
		 *
		 * NOTE: We set B_NOCACHE to throw away the previous bp and
		 *	 any VM backing store, even if it was dirty.
		 *	 Otherwise we run the risk of a logical/device
		 *	 conflict on reallocation.
		 */
		chain->bp->b_flags |= B_RELBUF | B_NOCACHE;
		brelse(chain->bp);
		chain->bp = nbp;
		chain->data = (void *)bdata;
		hammer2_chain_modify(hmp, chain, 0);
	}

	/*
	 * Make sure the chain is marked MOVED and SUBMOD is set in the
	 * parent(s) so the adjustments are picked up by flush.
	 */
	if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
		hammer2_chain_ref(hmp, chain);
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
	}
	hammer2_chain_parent_setsubmod(hmp, chain);
}
908 
909 /*
910  * Convert a locked chain that was retrieved read-only to read-write.
911  *
912  * If not already marked modified a new physical block will be allocated
913  * and assigned to the bref.
914  *
915  * Non-data blocks - The chain should be locked to at least the RESOLVE_MAYBE
916  *		     level or the COW operation will not work.
917  *
918  * Data blocks	   - The chain is usually locked RESOLVE_NEVER so as not to
919  *		     run the data through the device buffers.
920  */
void
hammer2_chain_modify(hammer2_mount_t *hmp, hammer2_chain_t *chain, int flags)
{
	struct buf *nbp;	/* replacement device buffer (COW target) */
	int error;
	hammer2_off_t pbase;	/* physical base offset of the device buffer */
	size_t bbytes;		/* device buffer size (>= chain->bytes) */
	size_t boff;		/* offset of chain data within the buffer */
	void *bdata;		/* pointer to chain data within nbp */

	/*
	 * Tells flush that modify_tid must be updated, otherwise only
	 * mirror_tid is updated.  This is the default.
	 */
	if ((flags & HAMMER2_MODIFY_NO_MODIFY_TID) == 0)
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFY_TID);

	/*
	 * If the chain is already marked MODIFIED we can just return.
	 *
	 * However, it is possible that a prior lock/modify sequence
	 * retired the buffer.  During this lock/modify sequence MODIFIED
	 * may still be set but the buffer could wind up clean.  Since
	 * the caller is going to modify the buffer further we have to
	 * be sure that DIRTYBP is set again.
	 *
	 * The skip1 path re-instantiates the buffer (without reallocating
	 * media space) when the caller requires data but no bp is
	 * currently associated with the chain.
	 */
	if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
		if ((flags & HAMMER2_MODIFY_OPTDATA) == 0 &&
		    chain->bp == NULL) {
			goto skip1;
		}
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
		return;
	}

	/*
	 * Set MODIFIED and add a chain ref to prevent destruction.  Both
	 * modified flags share the same ref.
	 */
	atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
	hammer2_chain_ref(hmp, chain);

	/*
	 * We must allocate the copy-on-write block.
	 *
	 * If the data is embedded no other action is required.
	 *
	 * If the data is not embedded we acquire and clear the
	 * new block.  If chain->data is not NULL we then do the
	 * copy-on-write.  chain->data will then be repointed to the new
	 * buffer and the old buffer will be released.
	 *
	 * For newly created elements with no prior allocation we go
	 * through the copy-on-write steps except without the copying part.
	 */
	if (chain != &hmp->vchain) {
		if ((hammer2_debug & 0x0001) &&
		    (chain->bref.data_off & HAMMER2_OFF_MASK)) {
			kprintf("Replace %d\n", chain->bytes);
		}
		/* allocate fresh media space for the COW copy */
		chain->bref.data_off =
			hammer2_freemap_alloc(hmp, chain->bref.type,
					      chain->bytes);
		/* XXX failed allocation */
	}

	/*
	 * If data instantiation is optional and the chain has no current
	 * data association (typical for DATA and newly-created INDIRECT
	 * elements), don't instantiate the buffer now.
	 */
	if ((flags & HAMMER2_MODIFY_OPTDATA) && chain->bp == NULL)
		goto skip2;

skip1:
	/*
	 * Setting the DIRTYBP flag will cause the buffer to be dirtied or
	 * written-out on unlock.  This bit is independent of the MODIFIED
	 * bit because the chain may still need meta-data adjustments done
	 * by virtue of MODIFIED for its parent, and the buffer can be
	 * flushed out (possibly multiple times) by the OS before that.
	 *
	 * Clearing the INITIAL flag (for indirect blocks) indicates that
	 * a zero-fill buffer has been instantiated.
	 */
	atomic_set_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
	atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);

	/*
	 * We currently should never instantiate a device buffer for a
	 * file data chain.  (We definitely can for a freemap chain).
	 */
	KKASSERT(chain->bref.type != HAMMER2_BREF_TYPE_DATA);

	/*
	 * Execute COW operation
	 */
	switch(chain->bref.type) {
	case HAMMER2_BREF_TYPE_VOLUME:
	case HAMMER2_BREF_TYPE_INODE:
		/*
		 * The data is embedded, no copy-on-write operation is
		 * needed.
		 */
		KKASSERT(chain->bp == NULL);
		break;
	case HAMMER2_BREF_TYPE_DATA:
		/*
		 * NOTE(review): this case appears unreachable given the
		 * KKASSERT on BREF_TYPE_DATA above -- confirm before
		 * relying on it.
		 */
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_FREEMAP_ROOT:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
		/*
		 * Perform the copy-on-write operation
		 */
		KKASSERT(chain != &hmp->vchain);	/* safety */
		/*
		 * The device buffer may be larger than the allocation size.
		 */
		if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
			bbytes = HAMMER2_MINIOSIZE;
		pbase = chain->bref.data_off & ~(hammer2_off_t)(bbytes - 1);
		boff = chain->bref.data_off & HAMMER2_OFF_MASK & (bbytes - 1);

		/*
		 * The getblk() optimization can only be used if the
		 * physical block size matches the request.  getblk()
		 * avoids a device read because we will overwrite the
		 * entire buffer; otherwise bread() must fill it in.
		 */
		if (chain->bytes == bbytes) {
			nbp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
			error = 0;
		} else {
			error = bread(hmp->devvp, pbase, bbytes, &nbp);
			KKASSERT(error == 0);
		}
		bdata = (char *)nbp->b_data + boff;

		/*
		 * Copy or zero-fill on write depending on whether
		 * chain->data exists or not.
		 */
		if (chain->data) {
			bcopy(chain->data, bdata, chain->bytes);
			KKASSERT(chain->bp != NULL);
		} else {
			bzero(bdata, chain->bytes);
		}
		/* release the old buffer and point the chain at the copy */
		if (chain->bp) {
			chain->bp->b_flags |= B_RELBUF;
			brelse(chain->bp);
		}
		chain->bp = nbp;
		chain->data = bdata;
		break;
	default:
		panic("hammer2_chain_modify: illegal non-embedded type %d",
		      chain->bref.type);
		break;

	}
skip2:
	/* propagate SUBMODIFIED up the parent chain unless suppressed */
	if ((flags & HAMMER2_MODIFY_NOSUB) == 0)
		hammer2_chain_parent_setsubmod(hmp, chain);
}
1084 
1085 /*
1086  * Mark the volume as having been modified.  This short-cut version
1087  * does not have to lock the volume's chain, which allows the ioctl
1088  * code to make adjustments to connections without deadlocking.
1089  */
void
hammer2_modify_volume(hammer2_mount_t *hmp)
{
	/* MODIFIED_AUX is set under the voldata lock to serialize with flush */
	hammer2_voldata_lock(hmp);
	atomic_set_int(&hmp->vchain.flags, HAMMER2_CHAIN_MODIFIED_AUX);
	hammer2_voldata_unlock(hmp);
}
1097 
1098 /*
1099  * Locate an in-memory chain.  The parent must be locked.  The in-memory
1100  * chain is returned or NULL if no in-memory chain is present.
1101  *
1102  * NOTE: A chain on-media might exist for this index when NULL is returned.
1103  */
1104 hammer2_chain_t *
1105 hammer2_chain_find(hammer2_mount_t *hmp, hammer2_chain_t *parent, int index)
1106 {
1107 	hammer2_chain_t dummy;
1108 	hammer2_chain_t *chain;
1109 
1110 	dummy.index = index;
1111 	chain = RB_FIND(hammer2_chain_tree, &parent->rbhead, &dummy);
1112 	return (chain);
1113 }
1114 
1115 /*
1116  * Return a locked chain structure with all associated data acquired.
1117  *
1118  * Caller must lock the parent on call, the returned child will be locked.
1119  */
hammer2_chain_t *
hammer2_chain_get(hammer2_mount_t *hmp, hammer2_chain_t *parent,
		  int index, int flags)
{
	hammer2_blockref_t *bref;
	hammer2_chain_t *chain;
	hammer2_chain_t dummy;	/* stack key for RB_FIND */
	int how;		/* resolve mode for the child lock */
	ccms_state_t ostate;	/* saved parent lock state for restore */

	/*
	 * Figure out how to lock.  MAYBE can be used to optimize
	 * the initial-create state for indirect blocks.
	 */
	if (flags & (HAMMER2_LOOKUP_NODATA | HAMMER2_LOOKUP_NOLOCK))
		how = HAMMER2_RESOLVE_NEVER;
	else
		how = HAMMER2_RESOLVE_MAYBE;
	if (flags & (HAMMER2_LOOKUP_SHARED | HAMMER2_LOOKUP_NOLOCK))
		how |= HAMMER2_RESOLVE_SHARED;

	/*
	 * First see if we have a (possibly modified) chain element cached
	 * for this (parent, index).  Acquire the data if necessary.
	 *
	 * If chain->data is non-NULL the chain should already be marked
	 * modified.
	 */
	dummy.index = index;
	chain = RB_FIND(hammer2_chain_tree, &parent->rbhead, &dummy);
	if (chain) {
		if (flags & HAMMER2_LOOKUP_NOLOCK)
			hammer2_chain_ref(hmp, chain);
		else
			hammer2_chain_lock(hmp, chain, how);
		return(chain);
	}

	/*
	 * Upgrade our thread lock and handle any race that may have
	 * occurred.  Leave the lock upgraded for the rest of the get.
	 * We have to do this because we will be modifying the chain
	 * structure.
	 */
	ostate = ccms_thread_lock_upgrade(&parent->cst);
	/* re-check: another thread may have instantiated it during upgrade */
	chain = RB_FIND(hammer2_chain_tree, &parent->rbhead, &dummy);
	if (chain) {
		if (flags & HAMMER2_LOOKUP_NOLOCK)
			hammer2_chain_ref(hmp, chain);
		else
			hammer2_chain_lock(hmp, chain, how);
		ccms_thread_lock_restore(&parent->cst, ostate);
		return(chain);
	}

	/*
	 * The get function must always succeed, panic if there's no
	 * data to index.
	 */
	if (parent->flags & HAMMER2_CHAIN_INITIAL) {
		ccms_thread_lock_restore(&parent->cst, ostate);
		panic("hammer2_chain_get: Missing bref(1)");
		/* NOT REACHED */
	}

	/*
	 * Otherwise lookup the bref and issue I/O (switch on the parent)
	 */
	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		KKASSERT(index >= 0 && index < HAMMER2_SET_COUNT);
		bref = &parent->data->ipdata.u.blockset.blockref[index];
		break;
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_FREEMAP_ROOT:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		KKASSERT(parent->data != NULL);
		KKASSERT(index >= 0 &&
			 index < parent->bytes / sizeof(hammer2_blockref_t));
		bref = &parent->data->npdata.blockref[index];
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		KKASSERT(index >= 0 && index < HAMMER2_SET_COUNT);
		bref = &hmp->voldata.sroot_blockset.blockref[index];
		break;
	default:
		bref = NULL;
		panic("hammer2_chain_get: unrecognized blockref type: %d",
		      parent->bref.type);
	}
	if (bref->type == 0) {
		panic("hammer2_chain_get: Missing bref(2)");
		/* NOT REACHED */
	}

	/*
	 * Allocate a chain structure representing the existing media
	 * entry.
	 *
	 * The locking operation we do later will issue I/O to read it.
	 */
	chain = hammer2_chain_alloc(hmp, bref);

	/*
	 * Link the chain into its parent.  Caller is expected to hold an
	 * exclusive lock on the parent.
	 */
	chain->parent = parent;
	chain->index = index;
	if (RB_INSERT(hammer2_chain_tree, &parent->rbhead, chain))
		panic("hammer2_chain_link: collision");
	atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
	KKASSERT(parent->refs > 0);
	atomic_add_int(&parent->refs, 1);	/* for red-black entry */
	ccms_thread_lock_restore(&parent->cst, ostate);

	/*
	 * Our new chain structure has already been referenced and locked
	 * but the lock code handles the I/O so call it to resolve the data.
	 * Then release one of our two exclusive locks.
	 *
	 * If NOLOCK is set the release will release the one-and-only lock.
	 */
	if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0) {
		hammer2_chain_lock(hmp, chain, how);	/* recursive lock */
		hammer2_chain_drop(hmp, chain);		/* excess ref */
	}
	ccms_thread_unlock(&chain->cst);			/* from alloc */

	return (chain);
}
1251 
1252 /*
1253  * Locate any key between key_beg and key_end inclusive.  (*parentp)
1254  * typically points to an inode but can also point to a related indirect
1255  * block and this function will recurse upwards and find the inode again.
1256  *
1257  * WARNING!  THIS DOES NOT RETURN KEYS IN LOGICAL KEY ORDER!  ANY KEY
1258  *	     WITHIN THE RANGE CAN BE RETURNED.  HOWEVER, AN ITERATION
1259  *	     WHICH PICKS UP WHERE WE LEFT OFF WILL CONTINUE THE SCAN.
1260  *
1261  * (*parentp) must be exclusively locked and referenced and can be an inode
1262  * or an existing indirect block within the inode.
1263  *
1264  * On return (*parentp) will be modified to point at the deepest parent chain
1265  * element encountered during the search, as a helper for an insertion or
1266  * deletion.   The new (*parentp) will be locked and referenced and the old
1267  * will be unlocked and dereferenced (no change if they are both the same).
1268  *
1269  * The matching chain will be returned exclusively locked and referenced.
1270  *
1271  * NULL is returned if no match was found, but (*parentp) will still
1272  * potentially be adjusted.
1273  *
1274  * This function will also recurse up the chain if the key is not within the
1275  * current parent's range.  (*parentp) can never be set to NULL.  An iteration
1276  * can simply allow (*parentp) to float inside the loop.
1277  */
hammer2_chain_t *
hammer2_chain_lookup(hammer2_mount_t *hmp, hammer2_chain_t **parentp,
		     hammer2_key_t key_beg, hammer2_key_t key_end,
		     int flags)
{
	hammer2_chain_t *parent;
	hammer2_chain_t *chain;
	hammer2_chain_t *tmp;		/* in-memory child overriding media */
	hammer2_blockref_t *base;	/* parent's on-media blockref array */
	hammer2_blockref_t *bref;	/* candidate blockref under test */
	hammer2_key_t scan_beg;
	hammer2_key_t scan_end;
	int count = 0;
	int i;
	int how_always = HAMMER2_RESOLVE_ALWAYS;
	int how_maybe = HAMMER2_RESOLVE_MAYBE;

	if (flags & (HAMMER2_LOOKUP_SHARED | HAMMER2_LOOKUP_NOLOCK)) {
		how_maybe |= HAMMER2_RESOLVE_SHARED;
		how_always |= HAMMER2_RESOLVE_SHARED;
	}

	/*
	 * Recurse (*parentp) upward if necessary until the parent completely
	 * encloses the key range or we hit the inode.
	 */
	parent = *parentp;
	while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
	       parent->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
		scan_beg = parent->bref.key;
		scan_end = scan_beg +
			   ((hammer2_key_t)1 << parent->bref.keybits) - 1;
		if (key_beg >= scan_beg && key_end <= scan_end)
			break;
		/*
		 * Hold a temporary ref so the old parent cannot be
		 * destroyed while we acquire the new parent's lock.
		 */
		hammer2_chain_ref(hmp, parent);		/* ref old parent */
		hammer2_chain_unlock(hmp, parent);	/* unlock old parent */
		parent = parent->parent;
							/* lock new parent */
		hammer2_chain_lock(hmp, parent, how_maybe);
		hammer2_chain_drop(hmp, *parentp);	/* drop old parent */
		*parentp = parent;			/* new parent */
	}

again:
	/*
	 * Locate the blockref array.  Currently we do a fully associative
	 * search through the array.
	 */
	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		/*
		 * Special shortcut for embedded data returns the inode
		 * itself.  Callers must detect this condition and access
		 * the embedded data (the strategy code does this for us).
		 *
		 * This is only applicable to regular files and softlinks.
		 */
		if (parent->data->ipdata.op_flags & HAMMER2_OPFLAG_DIRECTDATA) {
			if (flags & HAMMER2_LOOKUP_NOLOCK)
				hammer2_chain_ref(hmp, parent);
			else
				hammer2_chain_lock(hmp, parent, how_always);
			return (parent);
		}
		base = &parent->data->ipdata.u.blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_FREEMAP_ROOT:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		/*
		 * Optimize indirect blocks in the INITIAL state to avoid
		 * I/O.
		 */
		if (parent->flags & HAMMER2_CHAIN_INITIAL) {
			base = NULL;
		} else {
			if (parent->data == NULL)
				panic("parent->data is NULL");
			base = &parent->data->npdata.blockref[0];
		}
		count = parent->bytes / sizeof(hammer2_blockref_t);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		base = &hmp->voldata.sroot_blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	default:
		panic("hammer2_chain_lookup: unrecognized blockref type: %d",
		      parent->bref.type);
		base = NULL;	/* safety */
		count = 0;	/* safety */
	}

	/*
	 * If the element and key overlap we use the element.
	 *
	 * NOTE!  Deleted elements are effectively invisible.  A deleted
	 *	  element covers (makes invisible) any original media
	 *	  data.
	 */
	bref = NULL;
	for (i = 0; i < count; ++i) {
		/* an in-memory chain, if present, overrides the media bref */
		tmp = hammer2_chain_find(hmp, parent, i);
		if (tmp) {
			if (tmp->flags & HAMMER2_CHAIN_DELETED)
				continue;
			bref = &tmp->bref;
			KKASSERT(bref->type != 0);
		} else if (base == NULL || base[i].type == 0) {
			continue;
		} else {
			bref = &base[i];
		}
		scan_beg = bref->key;
		scan_end = scan_beg + ((hammer2_key_t)1 << bref->keybits) - 1;
		if (key_beg <= scan_end && key_end >= scan_beg)
			break;
	}
	if (i == count) {
		if (key_beg == key_end)
			return (NULL);
		/* ranged lookup: continue the scan in the next parent */
		return (hammer2_chain_next(hmp, parentp, NULL,
					   key_beg, key_end, flags));
	}

	/*
	 * Acquire the new chain element.  If the chain element is an
	 * indirect block we must search recursively.
	 */
	chain = hammer2_chain_get(hmp, parent, i, flags);
	if (chain == NULL)
		return (NULL);

	/*
	 * If the chain element is an indirect block it becomes the new
	 * parent and we loop on it.
	 *
	 * The parent always has to be locked with at least RESOLVE_MAYBE,
	 * so it might need a fixup if the caller passed incompatible flags.
	 */
	if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
	    chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
		hammer2_chain_unlock(hmp, parent);
		*parentp = parent = chain;
		if (flags & HAMMER2_LOOKUP_NOLOCK) {
			hammer2_chain_lock(hmp, chain, how_maybe);
			hammer2_chain_drop(hmp, chain);	/* excess ref */
		} else if (flags & HAMMER2_LOOKUP_NODATA) {
			hammer2_chain_lock(hmp, chain, how_maybe);
			hammer2_chain_unlock(hmp, chain);
		}
		goto again;
	}

	/*
	 * All done, return chain
	 */
	return (chain);
}
1438 
1439 /*
1440  * After having issued a lookup we can iterate all matching keys.
1441  *
 * If chain is non-NULL we continue the iteration from just after its index.
1443  *
1444  * If chain is NULL we assume the parent was exhausted and continue the
1445  * iteration at the next parent.
1446  *
1447  * parent must be locked on entry and remains locked throughout.  chain's
1448  * lock status must match flags.
1449  */
hammer2_chain_t *
hammer2_chain_next(hammer2_mount_t *hmp, hammer2_chain_t **parentp,
		   hammer2_chain_t *chain,
		   hammer2_key_t key_beg, hammer2_key_t key_end,
		   int flags)
{
	hammer2_chain_t *parent;
	hammer2_chain_t *tmp;		/* in-memory child overriding media */
	hammer2_blockref_t *base;	/* parent's on-media blockref array */
	hammer2_blockref_t *bref;	/* candidate blockref under test */
	hammer2_key_t scan_beg;
	hammer2_key_t scan_end;
	int i;				/* next blockref index to examine */
	int how_maybe = HAMMER2_RESOLVE_MAYBE;
	int count;

	if (flags & (HAMMER2_LOOKUP_SHARED | HAMMER2_LOOKUP_NOLOCK))
		how_maybe |= HAMMER2_RESOLVE_SHARED;

	parent = *parentp;

again:
	/*
	 * Calculate the next index and recalculate the parent if necessary.
	 */
	if (chain) {
		/*
		 * Continue iteration within current parent.  If not NULL
		 * the passed-in chain may or may not be locked, based on
		 * the LOOKUP_NOLOCK flag (passed in as returned from lookup
		 * or a prior next).
		 */
		i = chain->index + 1;
		if (flags & HAMMER2_LOOKUP_NOLOCK)
			hammer2_chain_drop(hmp, chain);
		else
			hammer2_chain_unlock(hmp, chain);

		/*
		 * Any scan where the lookup returned degenerate data embedded
		 * in the inode has an invalid index and must terminate.
		 */
		if (chain == parent)
			return(NULL);
		chain = NULL;
	} else if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT &&
		   parent->bref.type != HAMMER2_BREF_TYPE_FREEMAP_NODE) {
		/*
		 * We reached the end of the iteration.
		 */
		return (NULL);
	} else {
		/*
		 * Continue iteration with next parent unless the current
		 * parent covers the range.
		 */
		hammer2_chain_t *nparent;

		scan_beg = parent->bref.key;
		scan_end = scan_beg +
			    ((hammer2_key_t)1 << parent->bref.keybits) - 1;
		if (key_beg >= scan_beg && key_end <= scan_end)
			return (NULL);

		/* pop up one level, resuming just past the old parent */
		i = parent->index + 1;
		nparent = parent->parent;
		hammer2_chain_ref(hmp, nparent);	/* ref new parent */
		hammer2_chain_unlock(hmp, parent);	/* unlock old parent */
							/* lock new parent */
		hammer2_chain_lock(hmp, nparent, how_maybe);
		hammer2_chain_drop(hmp, nparent);	/* drop excess ref */
		*parentp = parent = nparent;
	}

again2:
	/*
	 * Locate the blockref array.  Currently we do a fully associative
	 * search through the array.
	 */
	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		base = &parent->data->ipdata.u.blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_FREEMAP_ROOT:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		/* INITIAL indirect blocks have no media data yet */
		if (parent->flags & HAMMER2_CHAIN_INITIAL) {
			base = NULL;
		} else {
			KKASSERT(parent->data != NULL);
			base = &parent->data->npdata.blockref[0];
		}
		count = parent->bytes / sizeof(hammer2_blockref_t);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		base = &hmp->voldata.sroot_blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	default:
		panic("hammer2_chain_next: unrecognized blockref type: %d",
		      parent->bref.type);
		base = NULL;	/* safety */
		count = 0;	/* safety */
		break;
	}
	KKASSERT(i <= count);

	/*
	 * Look for the key.  If we are unable to find a match and an exact
	 * match was requested we return NULL.  If a range was requested we
	 * run hammer2_chain_next() to iterate.
	 *
	 * NOTE!  Deleted elements are effectively invisible.  A deleted
	 *	  element covers (makes invisible) any original media
	 *	  data.
	 */
	bref = NULL;
	while (i < count) {
		/* an in-memory chain, if present, overrides the media bref */
		tmp = hammer2_chain_find(hmp, parent, i);
		if (tmp) {
			if (tmp->flags & HAMMER2_CHAIN_DELETED) {
				++i;
				continue;
			}
			bref = &tmp->bref;
		} else if (base == NULL || base[i].type == 0) {
			++i;
			continue;
		} else {
			bref = &base[i];
		}
		scan_beg = bref->key;
		scan_end = scan_beg + ((hammer2_key_t)1 << bref->keybits) - 1;
		if (key_beg <= scan_end && key_end >= scan_beg)
			break;
		++i;
	}

	/*
	 * If we couldn't find a match recurse up a parent to continue the
	 * search.
	 */
	if (i == count)
		goto again;

	/*
	 * Acquire the new chain element.  If the chain element is an
	 * indirect block we must search recursively.
	 */
	chain = hammer2_chain_get(hmp, parent, i, flags);
	if (chain == NULL)
		return (NULL);

	/*
	 * If the chain element is an indirect block it becomes the new
	 * parent and we loop on it.
	 *
	 * The parent always has to be locked with at least RESOLVE_MAYBE,
	 * so it might need a fixup if the caller passed incompatible flags.
	 */
	if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
	    chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
		hammer2_chain_unlock(hmp, parent);
		*parentp = parent = chain;
		chain = NULL;
		if (flags & HAMMER2_LOOKUP_NOLOCK) {
			hammer2_chain_lock(hmp, parent, how_maybe);
			hammer2_chain_drop(hmp, parent);	/* excess ref */
		} else if (flags & HAMMER2_LOOKUP_NODATA) {
			hammer2_chain_lock(hmp, parent, how_maybe);
			hammer2_chain_unlock(hmp, parent);
		}
		i = 0;
		goto again2;
	}

	/*
	 * All done, return chain
	 */
	return (chain);
}
1632 
1633 /*
1634  * Create and return a new hammer2 system memory structure of the specified
1635  * key, type and size and insert it RELATIVE TO (PARENT).
1636  *
 * (parent) is typically either an inode or an indirect block, acquired
 * as a side effect of issuing a prior failed lookup.  parent
1639  * must be locked and held.  Do not pass the inode chain to this function
1640  * unless that is the chain returned by the failed lookup.
1641  *
1642  * Non-indirect types will automatically allocate indirect blocks as required
1643  * if the new item does not fit in the current (parent).
1644  *
1645  * Indirect types will move a portion of the existing blockref array in
1646  * (parent) into the new indirect type and then use one of the free slots
1647  * to emplace the new indirect type.
1648  *
1649  * A new locked, referenced chain element is returned of the specified type.
1650  * The element may or may not have a data area associated with it:
1651  *
1652  *	VOLUME		not allowed here
1653  *	INODE		kmalloc()'d data area is set up
1654  *	INDIRECT	not allowed here
1655  *	DATA		no data area will be set-up (caller is expected
1656  *			to have logical buffers, we don't want to alias
1657  *			the data onto device buffers!).
1658  *
1659  * Requires an exclusively locked parent.
1660  */
1661 hammer2_chain_t *
1662 hammer2_chain_create(hammer2_mount_t *hmp, hammer2_chain_t *parent,
1663 		     hammer2_chain_t *chain,
1664 		     hammer2_key_t key, int keybits, int type, size_t bytes,
1665 		     int *errorp)
1666 {
1667 	hammer2_blockref_t dummy;
1668 	hammer2_blockref_t *base;
1669 	hammer2_chain_t dummy_chain;
1670 	int unlock_parent = 0;
1671 	int allocated = 0;
1672 	int count;
1673 	int i;
1674 
1675 	KKASSERT(ccms_thread_lock_owned(&parent->cst));
1676 	*errorp = 0;
1677 
1678 	if (chain == NULL) {
1679 		/*
1680 		 * First allocate media space and construct the dummy bref,
1681 		 * then allocate the in-memory chain structure.
1682 		 */
1683 		bzero(&dummy, sizeof(dummy));
1684 		dummy.type = type;
1685 		dummy.key = key;
1686 		dummy.keybits = keybits;
1687 		dummy.data_off = hammer2_allocsize(bytes);
1688 		dummy.methods = parent->bref.methods;
1689 		chain = hammer2_chain_alloc(hmp, &dummy);
1690 		allocated = 1;
1691 
1692 		/*
1693 		 * We do NOT set INITIAL here (yet).  INITIAL is only
1694 		 * used for indirect blocks.
1695 		 *
1696 		 * Recalculate bytes to reflect the actual media block
1697 		 * allocation.
1698 		 */
1699 		bytes = (hammer2_off_t)1 <<
1700 			(int)(chain->bref.data_off & HAMMER2_OFF_MASK_RADIX);
1701 		chain->bytes = bytes;
1702 
1703 		switch(type) {
1704 		case HAMMER2_BREF_TYPE_VOLUME:
1705 			panic("hammer2_chain_create: called with volume type");
1706 			break;
1707 		case HAMMER2_BREF_TYPE_INODE:
1708 			KKASSERT(bytes == HAMMER2_INODE_BYTES);
1709 			chain->data = kmalloc(sizeof(chain->data->ipdata),
1710 					      hmp->minode, M_WAITOK | M_ZERO);
1711 			break;
1712 		case HAMMER2_BREF_TYPE_INDIRECT:
1713 			panic("hammer2_chain_create: cannot be used to"
1714 			      "create indirect block");
1715 			break;
1716 		case HAMMER2_BREF_TYPE_FREEMAP_ROOT:
1717 		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1718 			panic("hammer2_chain_create: cannot be used to"
1719 			      "create freemap root or node");
1720 			break;
1721 		case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
1722 		case HAMMER2_BREF_TYPE_DATA:
1723 		default:
1724 			/* leave chain->data NULL */
1725 			KKASSERT(chain->data == NULL);
1726 			break;
1727 		}
1728 	} else {
1729 		/*
1730 		 * Potentially update the chain's key/keybits.
1731 		 */
1732 		chain->bref.key = key;
1733 		chain->bref.keybits = keybits;
1734 	}
1735 
1736 again:
1737 	/*
1738 	 * Locate a free blockref in the parent's array
1739 	 */
1740 	switch(parent->bref.type) {
1741 	case HAMMER2_BREF_TYPE_INODE:
1742 		KKASSERT((parent->data->ipdata.op_flags &
1743 			  HAMMER2_OPFLAG_DIRECTDATA) == 0);
1744 		KKASSERT(parent->data != NULL);
1745 		base = &parent->data->ipdata.u.blockset.blockref[0];
1746 		count = HAMMER2_SET_COUNT;
1747 		break;
1748 	case HAMMER2_BREF_TYPE_INDIRECT:
1749 	case HAMMER2_BREF_TYPE_FREEMAP_ROOT:
1750 	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1751 		if (parent->flags & HAMMER2_CHAIN_INITIAL) {
1752 			base = NULL;
1753 		} else {
1754 			KKASSERT(parent->data != NULL);
1755 			base = &parent->data->npdata.blockref[0];
1756 		}
1757 		count = parent->bytes / sizeof(hammer2_blockref_t);
1758 		break;
1759 	case HAMMER2_BREF_TYPE_VOLUME:
1760 		KKASSERT(parent->data != NULL);
1761 		base = &hmp->voldata.sroot_blockset.blockref[0];
1762 		count = HAMMER2_SET_COUNT;
1763 		break;
1764 	default:
1765 		panic("hammer2_chain_create: unrecognized blockref type: %d",
1766 		      parent->bref.type);
1767 		count = 0;
1768 		break;
1769 	}
1770 
1771 	/*
1772 	 * Scan for an unallocated bref, also skipping any slots occupied
1773 	 * by in-memory chain elements that may not yet have been updated
1774 	 * in the parent's bref array.
1775 	 */
1776 	bzero(&dummy_chain, sizeof(dummy_chain));
1777 	for (i = 0; i < count; ++i) {
1778 		if (base == NULL) {
1779 			dummy_chain.index = i;
1780 			if (RB_FIND(hammer2_chain_tree,
1781 				    &parent->rbhead, &dummy_chain) == NULL) {
1782 				break;
1783 			}
1784 		} else if (base[i].type == 0) {
1785 			dummy_chain.index = i;
1786 			if (RB_FIND(hammer2_chain_tree,
1787 				    &parent->rbhead, &dummy_chain) == NULL) {
1788 				break;
1789 			}
1790 		}
1791 	}
1792 
1793 	/*
1794 	 * If no free blockref could be found we must create an indirect
1795 	 * block and move a number of blockrefs into it.  With the parent
1796 	 * locked we can safely lock each child in order to move it without
1797 	 * causing a deadlock.
1798 	 *
1799 	 * This may return the new indirect block or the old parent depending
1800 	 * on where the key falls.  NULL is returned on error.  The most
1801 	 * typical error is EAGAIN (flush conflict during chain move).
1802 	 */
1803 	if (i == count) {
1804 		hammer2_chain_t *nparent;
1805 
1806 		nparent = hammer2_chain_create_indirect(hmp, parent,
1807 							key, keybits,
1808 							errorp);
1809 		if (nparent == NULL) {
1810 			if (allocated)
1811 				hammer2_chain_free(hmp, chain);
1812 			chain = NULL;
1813 			goto done;
1814 		}
1815 		if (parent != nparent) {
1816 			if (unlock_parent)
1817 				hammer2_chain_unlock(hmp, parent);
1818 			parent = nparent;
1819 			unlock_parent = 1;
1820 		}
1821 		goto again;
1822 	}
1823 
1824 	/*
1825 	 * Link the chain into its parent.  Later on we will have to set
1826 	 * the MOVED bit in situations where we don't mark the new chain
1827 	 * as being modified.
1828 	 */
1829 	if (chain->parent != NULL)
1830 		panic("hammer2: hammer2_chain_create: chain already connected");
1831 	KKASSERT(chain->parent == NULL);
1832 	chain->parent = parent;
1833 	chain->index = i;
1834 	if (RB_INSERT(hammer2_chain_tree, &parent->rbhead, chain))
1835 		panic("hammer2_chain_link: collision");
1836 	atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
1837 	KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0);
1838 	KKASSERT(parent->refs > 0);
1839 	atomic_add_int(&parent->refs, 1);
1840 
1841 	/*
1842 	 * (allocated) indicates that this is a newly-created chain element
1843 	 * rather than a renamed chain element.  In this situation we want
1844 	 * to place the chain element in the MODIFIED state.
1845 	 *
1846 	 * The data area will be set up as follows:
1847 	 *
1848 	 *	VOLUME		not allowed here.
1849 	 *
1850 	 *	INODE		embedded data are will be set-up.
1851 	 *
1852 	 *	INDIRECT	not allowed here.
1853 	 *
1854 	 *	DATA		no data area will be set-up (caller is expected
1855 	 *			to have logical buffers, we don't want to alias
1856 	 *			the data onto device buffers!).
1857 	 */
1858 	if (allocated) {
1859 		switch(chain->bref.type) {
1860 		case HAMMER2_BREF_TYPE_DATA:
1861 		case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
1862 			hammer2_chain_modify(hmp, chain,
1863 					     HAMMER2_MODIFY_OPTDATA);
1864 			break;
1865 		case HAMMER2_BREF_TYPE_INDIRECT:
1866 		case HAMMER2_BREF_TYPE_FREEMAP_ROOT:
1867 		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1868 			/* not supported in this function */
1869 			panic("hammer2_chain_create: bad type");
1870 			atomic_set_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
1871 			hammer2_chain_modify(hmp, chain,
1872 					     HAMMER2_MODIFY_OPTDATA);
1873 			break;
1874 		default:
1875 			hammer2_chain_modify(hmp, chain, 0);
1876 			break;
1877 		}
1878 	} else {
1879 		/*
1880 		 * When reconnecting inodes we have to call setsubmod()
1881 		 * to ensure that its state propagates up the newly
1882 		 * connected parent.
1883 		 *
1884 		 * Make sure MOVED is set but do not update bref_flush.  If
1885 		 * the chain is undergoing modification bref_flush will be
1886 		 * updated when it gets flushed.  If it is not then the
1887 		 * bref may not have been flushed yet and we do not want to
1888 		 * set MODIFIED here as this could result in unnecessary
1889 		 * reallocations.
1890 		 */
1891 		if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
1892 			hammer2_chain_ref(hmp, chain);
1893 			atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
1894 		}
1895 		hammer2_chain_parent_setsubmod(hmp, chain);
1896 	}
1897 
1898 done:
1899 	if (unlock_parent)
1900 		hammer2_chain_unlock(hmp, parent);
1901 	return (chain);
1902 }
1903 
1904 /*
1905  * Create an indirect block that covers one or more of the elements in the
1906  * current parent.  Either returns the existing parent with no locking or
1907  * ref changes or returns the new indirect block locked and referenced
1908  * and leaving the original parent lock/ref intact as well.
1909  *
1910  * If an error occurs, NULL is returned and *errorp is set to the error.
1911  * EAGAIN can be returned to indicate a flush collision which requires the
1912  * caller to retry.
1913  *
1914  * The returned chain depends on where the specified key falls.
1915  *
 * The key/keybits for the indirect mode only needs to follow four rules:
1917  *
1918  * (1) That all elements underneath it fit within its key space and
1919  *
1920  * (2) That all elements outside it are outside its key space.
1921  *
1922  * (3) When creating the new indirect block any elements in the current
1923  *     parent that fit within the new indirect block's keyspace must be
1924  *     moved into the new indirect block.
1925  *
1926  * (4) The keyspace chosen for the inserted indirect block CAN cover a wider
 *     keyspace than the current parent, but lookup/iteration rules will
1928  *     ensure (and must ensure) that rule (2) for all parents leading up
1929  *     to the nearest inode or the root volume header is adhered to.  This
1930  *     is accomplished by always recursing through matching keyspaces in
1931  *     the hammer2_chain_lookup() and hammer2_chain_next() API.
1932  *
1933  * The current implementation calculates the current worst-case keyspace by
1934  * iterating the current parent and then divides it into two halves, choosing
1935  * whichever half has the most elements (not necessarily the half containing
1936  * the requested key).
1937  *
1938  * We can also opt to use the half with the least number of elements.  This
1939  * causes lower-numbered keys (aka logical file offsets) to recurse through
1940  * fewer indirect blocks and higher-numbered keys to recurse through more.
1941  * This also has the risk of not moving enough elements to the new indirect
1942  * block and being forced to create several indirect blocks before the element
1943  * can be inserted.
1944  *
1945  * Must be called with an exclusively locked parent.
1946  */
static
hammer2_chain_t *
hammer2_chain_create_indirect(hammer2_mount_t *hmp, hammer2_chain_t *parent,
			      hammer2_key_t create_key, int create_bits,
			      int *errorp)
{
	hammer2_blockref_t *base;	/* parent's media bref array, or NULL */
	hammer2_blockref_t *bref;	/* bref being scanned (media or in-mem) */
	hammer2_chain_t *chain;
	hammer2_chain_t *ichain;	/* the new indirect block */
	hammer2_chain_t dummy;		/* stack temp for RB lookups / alloc */
	hammer2_key_t key = create_key;
	int keybits = create_bits;
	int locount = 0;		/* elements in lower half of key range */
	int hicount = 0;		/* elements in upper half of key range */
	int count;			/* number of bref slots in parent */
	int nbytes;			/* size of the new indirect block */
	int i;

	/*
	 * Calculate the base blockref pointer or NULL if the chain
	 * is known to be empty.  We need to calculate the array count
	 * for RB lookups either way.
	 */
	KKASSERT(ccms_thread_lock_owned(&parent->cst));
	*errorp = 0;

	/*
	 * Mark the parent modified (OPTDATA avoids instantiating a data
	 * buffer unnecessarily) so the base[] pointer computed below
	 * remains valid while we move entries out of it.
	 */
	hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_OPTDATA);
	if (parent->flags & HAMMER2_CHAIN_INITIAL) {
		/*
		 * INITIAL parents have no backing media data yet, so there
		 * is no bref array to scan; only in-memory chain elements
		 * (found via the RB tree) can occupy slots.
		 */
		base = NULL;

		switch(parent->bref.type) {
		case HAMMER2_BREF_TYPE_INODE:
			count = HAMMER2_SET_COUNT;
			break;
		case HAMMER2_BREF_TYPE_INDIRECT:
		case HAMMER2_BREF_TYPE_FREEMAP_ROOT:
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
			count = parent->bytes / sizeof(hammer2_blockref_t);
			break;
		case HAMMER2_BREF_TYPE_VOLUME:
			count = HAMMER2_SET_COUNT;
			break;
		default:
			panic("hammer2_chain_create_indirect: "
			      "unrecognized blockref type: %d",
			      parent->bref.type);
			count = 0;
			break;
		}
	} else {
		switch(parent->bref.type) {
		case HAMMER2_BREF_TYPE_INODE:
			base = &parent->data->ipdata.u.blockset.blockref[0];
			count = HAMMER2_SET_COUNT;
			break;
		case HAMMER2_BREF_TYPE_INDIRECT:
		case HAMMER2_BREF_TYPE_FREEMAP_ROOT:
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
			base = &parent->data->npdata.blockref[0];
			count = parent->bytes / sizeof(hammer2_blockref_t);
			break;
		case HAMMER2_BREF_TYPE_VOLUME:
			base = &hmp->voldata.sroot_blockset.blockref[0];
			count = HAMMER2_SET_COUNT;
			break;
		default:
			panic("hammer2_chain_create_indirect: "
			      "unrecognized blockref type: %d",
			      parent->bref.type);
			count = 0;
			break;
		}
	}

	/*
	 * Scan the parent's occupied slots to calculate the worst-case
	 * (key, keybits) range covering all existing elements plus the
	 * requested (create_key, create_bits).  In-memory chain elements
	 * override the media bref array since the media copy may not have
	 * been updated yet.
	 */
	bzero(&dummy, sizeof(dummy));
	for (i = 0; i < count; ++i) {
		int nkeybits;

		dummy.index = i;
		chain = RB_FIND(hammer2_chain_tree, &parent->rbhead, &dummy);
		if (chain) {
			/*
			 * NOTE! CHAIN_DELETED elements have to be adjusted
			 *	 too, they cannot be ignored.
			 */
			bref = &chain->bref;
		} else if (base && base[i].type) {
			bref = &base[i];
		} else {
			/* slot empty both in-memory and on-media */
			continue;
		}

		/*
		 * Expand our calculated key range (key, keybits) to fit
		 * the scanned key.  nkeybits represents the full range
		 * that we will later cut in half (two halves @ nkeybits - 1).
		 */
		nkeybits = keybits;
		if (nkeybits < bref->keybits)
			nkeybits = bref->keybits;
		/* widen until the high bits of key and bref->key match */
		while (nkeybits < 64 &&
		       (~(((hammer2_key_t)1 << nkeybits) - 1) &
		        (key ^ bref->key)) != 0) {
			++nkeybits;
		}

		/*
		 * If the new key range is larger we have to determine
		 * which side of the new key range the existing keys fall
		 * under by checking the high bit, then collapsing the
		 * locount into the hicount or vise-versa.
		 */
		if (keybits != nkeybits) {
			if (((hammer2_key_t)1 << (nkeybits - 1)) & key) {
				hicount += locount;
				locount = 0;
			} else {
				locount += hicount;
				hicount = 0;
			}
			keybits = nkeybits;
		}

		/*
		 * The newly scanned key will be in the lower half or the
		 * higher half of the (new) key range.
		 */
		if (((hammer2_key_t)1 << (nkeybits - 1)) & bref->key)
			++hicount;
		else
			++locount;
	}

	/*
	 * Adjust keybits to represent half of the full range calculated
	 * above (radix 63 max)
	 */
	--keybits;

	/*
	 * Select whichever half contains the most elements.  Theoretically
	 * we can select either side as long as it contains at least one
	 * element (in order to ensure that a free slot is present to hold
	 * the indirect block).
	 */
	key &= ~(((hammer2_key_t)1 << keybits) - 1);
	if (hammer2_indirect_optimize) {
		/*
		 * Insert node for least number of keys, this will arrange
		 * the first few blocks of a large file or the first few
		 * inodes in a directory with fewer indirect blocks when
		 * created linearly.
		 */
		if (hicount < locount && hicount != 0)
			key |= (hammer2_key_t)1 << keybits;
		else
			key &= ~(hammer2_key_t)1 << keybits;
	} else {
		/*
		 * Insert node for most number of keys, best for heavily
		 * fragmented files.
		 */
		if (hicount > locount)
			key |= (hammer2_key_t)1 << keybits;
		else
			key &= ~(hammer2_key_t)1 << keybits;
	}

	/*
	 * How big should our new indirect block be?  It has to be at least
	 * as large as its parent (so every element being moved is
	 * guaranteed to fit).
	 */
	if (parent->bref.type == HAMMER2_BREF_TYPE_INODE)
		nbytes = HAMMER2_IND_BYTES_MIN;
	else
		nbytes = HAMMER2_IND_BYTES_MAX;
	if (nbytes < count * sizeof(hammer2_blockref_t))
		nbytes = count * sizeof(hammer2_blockref_t);

	/*
	 * Ok, create our new indirect block.  Freemap parents get a
	 * freemap-node child, everything else gets a generic indirect
	 * block.  The dummy bref is only a template for the allocation.
	 */
	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_FREEMAP_ROOT:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		dummy.bref.type = HAMMER2_BREF_TYPE_FREEMAP_NODE;
		break;
	default:
		dummy.bref.type = HAMMER2_BREF_TYPE_INDIRECT;
		break;
	}
	dummy.bref.key = key;
	dummy.bref.keybits = keybits;
	dummy.bref.data_off = hammer2_allocsize(nbytes);
	dummy.bref.methods = parent->bref.methods;
	ichain = hammer2_chain_alloc(hmp, &dummy.bref);
	atomic_set_int(&ichain->flags, HAMMER2_CHAIN_INITIAL);

	/*
	 * Iterate the original parent and move the matching brefs into
	 * the new indirect block.
	 */
	for (i = 0; i < count; ++i) {
		/*
		 * For keying purposes access the bref from the media or
		 * from our in-memory cache.  In cases where the in-memory
		 * cache overrides the media the keyrefs will be the same
		 * anyway so we can avoid checking the cache when the media
		 * has a key.
		 */
		dummy.index = i;
		chain = RB_FIND(hammer2_chain_tree, &parent->rbhead, &dummy);
		if (chain) {
			/*
			 * NOTE! CHAIN_DELETED elements have to be adjusted
			 *	 too, they cannot be ignored.
			 */
			bref = &chain->bref;
		} else if (base && base[i].type) {
			bref = &base[i];
		} else {
			/* free slot, remember it as ichain's insert point */
			if (ichain->index < 0)
				ichain->index = i;
			continue;
		}

		/*
		 * Skip keys not in the chosen half (low or high), only bit
		 * (keybits - 1) needs to be compared but for safety we
		 * will compare all msb bits plus that bit again.
		 */
		if ((~(((hammer2_key_t)1 << keybits) - 1) &
		    (key ^ bref->key)) != 0) {
			continue;
		}

		/*
		 * This element is being moved from the parent, its slot
		 * is available for our new indirect block.
		 */
		if (ichain->index < 0)
			ichain->index = i;

		/*
		 * Load the new indirect block by acquiring or allocating
		 * the related chain entries, then simply move them to the
		 * new parent (ichain).  We cannot move chains which are
		 * undergoing flushing and will break out of the loop in
		 * that case.
		 *
		 * When adjusting the parent/child relationship we must
		 * set the MOVED bit but we do NOT update bref_flush
		 * because otherwise we might synchronize a bref that has
		 * not yet been flushed.  We depend on chain's bref_flush
		 * either being correct or the chain being in a MODIFIED
		 * state.
		 *
		 * We do not want to set MODIFIED here as this would result
		 * in unnecessary reallocations.
		 *
		 * We must still set SUBMODIFIED in the parent but we do
		 * that after the loop.
		 *
		 * WARNING! chain->cst.spin must be held when chain->parent is
		 *	    modified, even though we own the full blown lock,
		 *	    to deal with setsubmod and rename races.
		 */
		chain = hammer2_chain_get(hmp, parent, i,
					  HAMMER2_LOOKUP_NODATA);
		if (chain->flushing) {
			/* flush collision, abort and undo (see below) */
			hammer2_chain_unlock(hmp, chain);
			break;
		}

		spin_lock(&chain->cst.spin);
		RB_REMOVE(hammer2_chain_tree, &parent->rbhead, chain);
		if (RB_INSERT(hammer2_chain_tree, &ichain->rbhead, chain))
			panic("hammer2_chain_create_indirect: collision");
		chain->parent = ichain;
		spin_unlock(&chain->cst.spin);

		/* clear the media slot the element was moved out of */
		if (base)
			bzero(&base[i], sizeof(base[i]));
		/* transfer the RB-tree ref from parent to ichain */
		atomic_add_int(&parent->refs, -1);
		atomic_add_int(&ichain->refs, 1);
		if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
			hammer2_chain_ref(hmp, chain);
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
		}
		hammer2_chain_unlock(hmp, chain);
		KKASSERT(parent->refs > 0);
		chain = NULL;
	}

	/*
	 * If we hit a chain that is undergoing flushing we're screwed and
	 * we have to undo the whole mess.  Since ichain has not been linked
	 * in yet, the moved chains are not reachable and will not have been
	 * disposed of.
	 *
	 * WARNING! This code is pretty hairy because the flusher is sitting
	 *	    on the parent processing one of the children that we
	 *	    haven't yet moved, and will do a RB_NEXT loop on that
	 *	    child.  So the children we're moving back have to be
	 *	    returned to the same place in the iteration that they
	 *	    were removed from.
	 */
	if (i != count) {
		kprintf("hammer2_chain_create_indirect: EAGAIN\n");
		*errorp = EAGAIN;
		while ((chain = RB_ROOT(&ichain->rbhead)) != NULL) {
			hammer2_chain_lock(hmp, chain, HAMMER2_RESOLVE_NEVER);
			KKASSERT(chain->flushing == 0);
			RB_REMOVE(hammer2_chain_tree, &ichain->rbhead, chain);
			if (RB_INSERT(hammer2_chain_tree, &parent->rbhead, chain))
				panic("hammer2_chain_create_indirect: collision");
			chain->parent = parent;
			atomic_add_int(&parent->refs, 1);
			atomic_add_int(&ichain->refs, -1);
			/* MOVED bit might have been inherited, cannot undo */
			hammer2_chain_unlock(hmp, chain);
		}
		hammer2_chain_free(hmp, ichain);
		return(NULL);
	}

	/*
	 * Insert the new indirect block into the parent now that we've
	 * cleared out some entries in the parent.  We calculated a good
	 * insertion index in the loop above (ichain->index).
	 *
	 * We don't have to set MOVED here because we mark ichain modified
	 * down below (so the normal modified -> flush -> set-moved sequence
	 * applies).
	 */
	KKASSERT(ichain->index >= 0);
	if (RB_INSERT(hammer2_chain_tree, &parent->rbhead, ichain))
		panic("hammer2_chain_create_indirect: ichain insertion");
	atomic_set_int(&ichain->flags, HAMMER2_CHAIN_ONRBTREE);
	ichain->parent = parent;
	atomic_add_int(&parent->refs, 1);

	/*
	 * Mark the new indirect block modified after insertion, which
	 * will propagate up through parent all the way to the root and
	 * also allocate the physical block in ichain for our caller,
	 * and assign ichain->data to a pre-zero'd space (because there
	 * is not prior data to copy into it).
	 *
	 * We have to set SUBMODIFIED in ichain's flags manually so the
	 * flusher knows it has to recurse through it to get to all of
	 * our moved blocks, then call setsubmod() to set the bit
	 * recursively.
	 */
	hammer2_chain_modify(hmp, ichain, HAMMER2_MODIFY_OPTDATA);
	hammer2_chain_parent_setsubmod(hmp, ichain);
	atomic_set_int(&ichain->flags, HAMMER2_CHAIN_SUBMODIFIED);

	/*
	 * Figure out what to return.
	 */
	if (create_bits > keybits) {
		/*
		 * Key being created is way outside the key range,
		 * return the original parent.
		 */
		hammer2_chain_unlock(hmp, ichain);
	} else if (~(((hammer2_key_t)1 << keybits) - 1) &
		   (create_key ^ key)) {
		/*
		 * Key being created is outside the key range,
		 * return the original parent.
		 */
		hammer2_chain_unlock(hmp, ichain);
	} else {
		/*
		 * Otherwise its in the range, return the new parent.
		 * (leave both the new and old parent locked).
		 */
		parent = ichain;
	}

	return(parent);
}
2337 
2338 /*
2339  * Physically delete the specified chain element.  Note that inodes with
2340  * open descriptors should not be deleted (as with other filesystems) until
2341  * the last open descriptor is closed.
2342  *
2343  * This routine will remove the chain element from its parent and potentially
2344  * also recurse upward and delete indirect blocks which become empty as a
2345  * side effect.
2346  *
2347  * The caller must pass a pointer to the chain's parent, also locked and
2348  * referenced.  (*parentp) will be modified in a manner similar to a lookup
2349  * or iteration when indirect blocks are also deleted as a side effect.
2350  *
2351  * Must be called with an exclusively locked parent and chain.  parent and
2352  * chain are both left locked on return.
2353  *
2354  * XXX This currently does not adhere to the MOVED flag protocol in that
2355  *     the removal is immediately indicated in the parent's blockref[]
2356  *     array.
2357  */
void
hammer2_chain_delete(hammer2_mount_t *hmp, hammer2_chain_t *parent,
		     hammer2_chain_t *chain, int retain)
{
	hammer2_blockref_t *base;	/* parent's media bref array, or NULL */
	int count;			/* number of bref slots in parent */

	/* chain must actually be a child of the passed-in parent */
	if (chain->parent != parent)
		panic("hammer2_chain_delete: parent mismatch");
	KKASSERT(ccms_thread_lock_owned(&parent->cst));

	/*
	 * Mark the parent modified so our base[] pointer remains valid
	 * while we move entries.  For the optimized indirect block
	 * case mark the parent moved instead.
	 *
	 * Calculate the blockref reference in the parent
	 */
	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_NO_MODIFY_TID);
		base = &parent->data->ipdata.u.blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_FREEMAP_ROOT:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_OPTDATA |
						  HAMMER2_MODIFY_NO_MODIFY_TID);
		/* INITIAL parents have no media array to clear */
		if (parent->flags & HAMMER2_CHAIN_INITIAL)
			base = NULL;
		else
			base = &parent->data->npdata.blockref[0];
		count = parent->bytes / sizeof(hammer2_blockref_t);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_NO_MODIFY_TID);
		base = &hmp->voldata.sroot_blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	default:
		panic("hammer2_chain_delete: unrecognized blockref type: %d",
		      parent->bref.type);
		count = 0;
		break;
	}
	KKASSERT(chain->index >= 0 && chain->index < count);

	/*
	 * We may not be able to immediately disconnect the chain if a
	 * flush is in progress.  If retain is non-zero we MUST disconnect
	 * the chain now and callers are responsible for making sure that
	 * flushing is zero.
	 */
	spin_lock(&chain->cst.spin);
	if ((retain || chain->flushing == 0) &&
	    (chain->flags & HAMMER2_CHAIN_ONRBTREE)) {
		/* wipe the media bref slot so the removal hits the disk */
		if (base)
			bzero(&base[chain->index], sizeof(*base));
		KKASSERT(chain->flushing == 0);
		RB_REMOVE(hammer2_chain_tree, &parent->rbhead, chain);
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
		atomic_add_int(&parent->refs, -1);   /* for red-black entry */
		chain->index = -1;
		chain->parent = NULL;
	}
	spin_unlock(&chain->cst.spin);

	/*
	 * Cumulative adjustments must be propagated to the parent inode
	 * when deleting and synchronized to ip.  This occurs even if we
	 * cannot detach the chain from its parent.
	 *
	 * NOTE:  We do not propagate ip->delta_*count to the parent because
	 *	  these represent adjustments that have not yet been
	 *	  propagated upward, so we don't need to remove them from
	 *	  the parent.
	 *
	 * Clear the pointer to the parent inode.
	 */
	if ((chain->flags & HAMMER2_CHAIN_DELETED) == 0 &&
	    chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
		/* XXX - propagation not implemented in this revision */
	}

	/*
	 * If retain is 0 the deletion is permanent.  Because the chain is
	 * no longer connected to the topology a flush will have no
	 * visibility into it.  We must dispose of the references related
	 * to the MODIFIED and MOVED flags, otherwise the ref count will
	 * never transition to 0.
	 *
	 * If retain is non-zero the deleted element is likely an inode
	 * which the vnops frontend will mark DESTROYED and flush.  In that
	 * situation we must retain the flags for any open file descriptors
	 * on the (removed) inode.  The final close will destroy the
	 * disconnected chain.
	 */
	if (retain == 0) {
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
		/* drop the ref held by the MODIFIED flag, if set */
		if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
			hammer2_chain_drop(hmp, chain);
		}
		/* drop the ref held by the MOVED flag, if set */
		if (chain->flags & HAMMER2_CHAIN_MOVED) {
			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MOVED);
			hammer2_chain_drop(hmp, chain);
		}
	}

	/*
	 * The chain is still likely referenced, possibly even by a vnode
	 * (if an inode), so defer further action until the chain gets
	 * dropped.
	 */
}
2474 
/*
 * Passive stall helper: sleep for one tick on the chain's address.
 *
 * There is no paired wakeup() visible in this file; the 1-tick timeout
 * bounds the sleep, so this is effectively a short polling delay.  The
 * wmesg "chnflw" suggests callers use it to wait out an in-progress
 * flush on the chain -- NOTE(review): confirm against call sites.
 */
void
hammer2_chain_wait(hammer2_mount_t *hmp, hammer2_chain_t *chain)
{
	tsleep(chain, 0, "chnflw", 1);
}
2480