xref: /dragonfly/sys/vfs/hammer2/hammer2_chain.c (revision b575ab8a)
1 /*
2  * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in
16  *    the documentation and/or other materials provided with the
17  *    distribution.
18  * 3. Neither the name of The DragonFly Project nor the names of its
19  *    contributors may be used to endorse or promote products derived
20  *    from this software without specific, prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
26  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 /*
36  * This subsystem handles direct and indirect block searches, recursions,
37  * creation, and deletion.  Chains of blockrefs are tracked and modifications
38  * are flagged for propagation... eventually all the way back to the volume
39  * header.
40  */
41 
42 #include <sys/cdefs.h>
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/types.h>
46 #include <sys/lock.h>
47 #include <sys/uuid.h>
48 
49 #include "hammer2.h"
50 
51 static int hammer2_indirect_optimize;	/* XXX SYSCTL */
52 
53 static hammer2_chain_t *hammer2_chain_create_indirect(
54 			hammer2_mount_t *hmp, hammer2_chain_t *parent,
55 			hammer2_key_t key, int keybits);
56 
57 /*
58  * We use a red-black tree to guarantee safe lookups under shared locks.
59  */
60 RB_GENERATE(hammer2_chain_tree, hammer2_chain, rbnode, hammer2_chain_cmp);
61 
62 int
63 hammer2_chain_cmp(hammer2_chain_t *chain1, hammer2_chain_t *chain2)
64 {
65 	return(chain2->index - chain1->index);
66 }
67 
68 /*
69  * Recursively mark the parent chain elements so flushes can find
70  * modified elements.  Stop when we hit a chain already flagged
71  * SUBMODIFIED, but ignore the SUBMODIFIED bit that might be set
72  * in chain itself.
73  *
74  * SUBMODIFIED is not set on the chain passed in.
75  *
76  * XXX rename of parent can create a SMP race
77  */
78 static void
79 hammer2_chain_parent_setsubmod(hammer2_mount_t *hmp, hammer2_chain_t *chain)
80 {
81 	hammer2_chain_t *parent;
82 
83 	parent = chain->parent;
84 	while (parent && (parent->flags & HAMMER2_CHAIN_SUBMODIFIED) == 0) {
85 		atomic_set_int(&parent->flags, HAMMER2_CHAIN_SUBMODIFIED);
86 		parent = parent->parent;
87 	}
88 }
89 
90 /*
91  * Allocate a new disconnected chain element representing the specified
92  * bref.  The chain element is locked exclusively and refs is set to 1.
93  *
94  * This essentially allocates a system memory structure representing one
95  * of the media structure types, including inodes.
96  */
97 hammer2_chain_t *
98 hammer2_chain_alloc(hammer2_mount_t *hmp, hammer2_blockref_t *bref)
99 {
100 	hammer2_chain_t *chain;
101 	hammer2_inode_t *ip;
102 	hammer2_indblock_t *np;
103 	hammer2_data_t *dp;
104 	u_int bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
105 
106 	/*
107 	 * Construct the appropriate system structure.
108 	 */
109 	switch(bref->type) {
110 	case HAMMER2_BREF_TYPE_INODE:
111 		ip = kmalloc(sizeof(*ip), hmp->minode, M_WAITOK | M_ZERO);
112 		chain = &ip->chain;
113 		chain->u.ip = ip;
114 		ip->hmp = hmp;
115 		break;
116 	case HAMMER2_BREF_TYPE_INDIRECT:
117 		np = kmalloc(sizeof(*np), hmp->mchain, M_WAITOK | M_ZERO);
118 		chain = &np->chain;
119 		chain->u.np = np;
120 		break;
121 	case HAMMER2_BREF_TYPE_DATA:
122 		dp = kmalloc(sizeof(*dp), hmp->mchain, M_WAITOK | M_ZERO);
123 		chain = &dp->chain;
124 		chain->u.dp = dp;
125 		break;
126 	case HAMMER2_BREF_TYPE_VOLUME:
127 		chain = NULL;
128 		panic("hammer2_chain_alloc volume type illegal for op");
129 	default:
130 		chain = NULL;
131 		panic("hammer2_chain_alloc: unrecognized blockref type: %d",
132 		      bref->type);
133 	}
134 
135 	/*
136 	 * Only set bref_flush if the bref has a real media offset, otherwise
137 	 * the caller has to wait for the chain to be modified/block-allocated
138 	 * before a blockref can be synchronized with its (future) parent.
139 	 */
140 	chain->bref = *bref;
141 	if (bref->data_off & ~HAMMER2_OFF_MASK_RADIX)
142 		chain->bref_flush = *bref;
143 	chain->index = -1;		/* not yet assigned */
144 	chain->refs = 1;
145 	chain->bytes = bytes;
146 	ccms_cst_init(&chain->cst, chain);
147 	ccms_thread_lock(&chain->cst, CCMS_STATE_EXCLUSIVE);
148 
149 	return (chain);
150 }
151 
152 /*
153  * Deallocate a chain (the step before freeing it).  Remove the chain from
154  * its parent's tree.
155  *
156  * Caller must hold the parent and the chain exclusively locked, and
157  * chain->refs must be 0.
158  *
159  * This function unlocks, removes, and destroys chain, and will recursively
160  * destroy any sub-chains under chain (whose refs must also be 0 at this
161  * point).
162  *
163  * parent can be NULL.
164  */
static void
hammer2_chain_dealloc(hammer2_mount_t *hmp, hammer2_chain_t *chain)
{
	hammer2_inode_t *ip;
	hammer2_chain_t *parent;
	hammer2_chain_t *child;

	/*
	 * refs must already be 0, and MOVED/MODIFIED must be clear since
	 * each of those flags holds an implied reference.
	 */
	KKASSERT(chain->refs == 0);
	KKASSERT((chain->flags &
		  (HAMMER2_CHAIN_MOVED | HAMMER2_CHAIN_MODIFIED)) == 0);

	/*
	 * Disconnect from the parent and remember the inode (if any) so the
	 * related ccms state can be cleaned up at the bottom.
	 */
	parent = chain->parent;
	chain->parent = NULL;
	if (chain->bref.type == HAMMER2_BREF_TYPE_INODE)
		ip = chain->u.ip;
	else
		ip = NULL;

	/*
	 * If the sub-tree is not empty all the elements on it must have
	 * 0 refs and be deallocatable.  Each child is exclusively locked
	 * and recursively deallocated; the recursion removes the child
	 * from chain->rbhead so the loop terminates.
	 */
	while ((child = RB_ROOT(&chain->rbhead)) != NULL) {
		ccms_thread_lock(&child->cst, CCMS_STATE_EXCLUSIVE);
		hammer2_chain_dealloc(hmp, child);
	}

	/*
	 * If the DELETED flag is not set the chain must be removed from
	 * its parent's tree.
	 *
	 * NOTE(review): parent is dereferenced here without a NULL check;
	 * this assumes DELETED is always set whenever parent == NULL --
	 * confirm against all callers.
	 */
	if ((chain->flags & HAMMER2_CHAIN_DELETED) == 0) {
		RB_REMOVE(hammer2_chain_tree, &parent->rbhead, chain);
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
		if (ip)
			ip->pip = NULL;
	}

	/*
	 * When cleaning out a hammer2_inode we must
	 * also clean out the related ccms_inode.
	 */
	if (ip)
		ccms_cst_uninit(&ip->topo_cst);
	hammer2_chain_free(hmp, chain);
}
211 
212 /*
213  * Free a disconnected chain element
214  */
void
hammer2_chain_free(hammer2_mount_t *hmp, hammer2_chain_t *chain)
{
	void *mem;

	/*
	 * Inode and volume chains point data at an embedded structure
	 * rather than a device buffer; NULL it out so the assertions
	 * below hold uniformly for all types.
	 */
	if (chain->bref.type == HAMMER2_BREF_TYPE_INODE ||
	    chain->bref.type == HAMMER2_BREF_TYPE_VOLUME) {
		chain->data = NULL;
	}

	/* chain must be fully disassociated: no bp, no data, no vnode */
	KKASSERT(chain->bp == NULL);
	KKASSERT(chain->data == NULL);
	KKASSERT(chain->bref.type != HAMMER2_BREF_TYPE_INODE ||
		 chain->u.ip->vp == NULL);

	/* release our exclusive lock; no other lockers may remain */
	ccms_thread_unlock(&chain->cst);
	KKASSERT(chain->cst.count == 0);
	KKASSERT(chain->cst.upgrade == 0);

	/*
	 * u.mem aliases the type-specific container (inode, indirect,
	 * data).  Inodes come from the minode zone, everything else
	 * from the mchain zone.
	 */
	if ((mem = chain->u.mem) != NULL) {
		chain->u.mem = NULL;
		if (chain->bref.type == HAMMER2_BREF_TYPE_INODE)
			kfree(mem, hmp->minode);
		else
			kfree(mem, hmp->mchain);
	}
}
241 
242 /*
243  * Add a reference to a chain element, preventing its destruction.
244  *
245  * The parent chain must be locked shared or exclusive or otherwise be
246  * stable and already have a reference.
247  */
void
hammer2_chain_ref(hammer2_mount_t *hmp, hammer2_chain_t *chain)
{
	u_int refs;

	while (chain) {
		refs = chain->refs;
		/*
		 * NOTE(review): refs is u_int so this assertion is a
		 * tautology; it documents the no-wraparound intent only.
		 */
		KKASSERT(chain->refs >= 0);
		cpu_ccfence();	/* keep the refs snapshot ordered vs the CAS */
		if (refs == 0) {
			/*
			 * 0 -> 1 transition must bump the refs on the parent
			 * too.  The caller has stabilized the parent.
			 */
			if (atomic_cmpset_int(&chain->refs, 0, 1)) {
				chain = chain->parent;
				KKASSERT(chain == NULL || chain->refs > 0);
			}
			/* retry or continue along the parent chain */
		} else {
			/*
			 * N -> N+1: a simple CAS bump, no parent
			 * propagation needed.  Loop to retry on a
			 * concurrent ref change.
			 */
			if (atomic_cmpset_int(&chain->refs, refs, refs + 1))
				break;
			/* retry */
		}
	}
}
277 
278 /*
279  * Drop the callers reference to the chain element.  If the ref count
280  * reaches zero we attempt to recursively drop the parent.
281  *
282  * MOVED and MODIFIED elements hold additional references so it should not
283  * be possible for the count on a modified element to drop to 0.
284  *
285  * The chain element must NOT be locked by the caller on the 1->0 transition.
286  *
287  * The parent might or might not be locked by the caller.  If we are unable
288  * to lock the parent on the 1->0 transition the destruction of the chain
289  * will be deferred but we still recurse upward and drop the ref on the
290  * parent (see the lastdrop() function)
291  */
292 static hammer2_chain_t *hammer2_chain_lastdrop(hammer2_mount_t *hmp,
293 						hammer2_chain_t *chain);
294 
void
hammer2_chain_drop(hammer2_mount_t *hmp, hammer2_chain_t *chain)
{
	u_int refs;

	while (chain) {
		refs = chain->refs;
		cpu_ccfence();	/* order the snapshot against the CAS below */
		KKASSERT(refs > 0);
		if (refs == 1) {
			/*
			 * Potential last drop; hand off to lastdrop which
			 * stabilizes chain->parent under the cst spinlock.
			 *
			 * (1) lastdrop successfully drops the chain and
			 *     returns the parent, we recursively drop the
			 *     parent.
			 *
			 * (2) lastdrop fails to transition refs from 1 to 0
			 *     and returns the same chain, we retry.
			 *
			 * (3) lastdrop fails to drop the chain and returns
			 *     NULL, leaving the ref intact for a deferred
			 *     drop later on.
			 */
			chain = hammer2_chain_lastdrop(hmp, chain);
		} else {
			if (atomic_cmpset_int(&chain->refs, refs, refs - 1)) {
				/*
				 * Succeeded, count did not reach zero so
				 * cut out of the loop.
				 */
				break;
			}
			/* retry the same chain */
		}
	}
}
330 
331 /*
332  * On the last drop we have to stabilize chain->parent, which we can do
333  * by acquiring the chain->cst.spin lock.  If we get a full-blown lock
334  * it messes up the chain_unlock() code's ccms_thread_unlock_zero() call.
335  *
336  * Once the spinlock has been obtained we can drop the refs and become the
337  * owner of the implied ref on the parent, allowing us to return the parent.
338  */
static
hammer2_chain_t *
hammer2_chain_lastdrop(hammer2_mount_t *hmp, hammer2_chain_t *chain)
{
	hammer2_chain_t *parent;

	/*
	 * gain lock, drop refs, return chain to retry if we were unable
	 * to drop the refs from 1 to 0 (someone re-referenced the chain
	 * concurrently).
	 */
	spin_lock(&chain->cst.spin);
	if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) {
		spin_unlock(&chain->cst.spin);
		return (chain);
	}

	/*
	 * Refs is 0 and we own the implied ref on the parent.  The
	 * chain can still be accessed at this point but any cycling
	 * of its refs will simply build-up more implied refs on the
	 * parent.
	 *
	 * Thus the parent pointer is valid.
	 */
	parent = chain->parent;
	spin_unlock(&chain->cst.spin);

	/*
	 * Attempt to acquire an exclusive lock on the parent.  If this
	 * fails we just leave chain alone but still return the parent
	 * for the drop recursion (deferred destruction, case (3) in
	 * hammer2_chain_drop()).
	 */
	if (parent &&
	    ccms_thread_lock_nonblock(&parent->cst, CCMS_STATE_EXCLUSIVE)) {
		return (parent);
	}

	/*
	 * With an exclusive lock on the parent in-hand if chain->refs is
	 * still 0 then its impossible for anyone new to access it (or any
	 * of its children), and it can be deallocated.
	 */
	if (chain->refs == 0) {
		ccms_thread_lock(&chain->cst, CCMS_STATE_EXCLUSIVE);
		hammer2_chain_dealloc(hmp, chain);
	}

	/*
	 * drop recursion, return parent so the caller can eat the implied
	 * ref we own on it.  We have to use hammer2_chain_unlock() (which
	 * also does a drop so we also need a ref on parent).
	 */
	if (parent) {
		hammer2_chain_ref(hmp, parent);
		hammer2_chain_unlock(hmp, parent);
	}
	return (parent);
}
397 
398 /*
399  * Ref and lock a chain element, acquiring its data with I/O if necessary,
400  * and specify how you would like the data to be resolved.
401  *
402  * Returns 0 on success or an error code if the data could not be acquired.
403  * The chain element is locked either way.
404  *
405  * The lock is allowed to recurse, multiple locking ops will aggregate
406  * the requested resolve types.  Once data is assigned it will not be
407  * removed until the last unlock.
408  *
409  * HAMMER2_RESOLVE_NEVER - Do not resolve the data element.
410  *			   (typically used to avoid device/logical buffer
411  *			    aliasing for data)
412  *
413  * HAMMER2_RESOLVE_MAYBE - Do not resolve data elements for chains in
414  *			   the INITIAL-create state (indirect blocks only).
415  *
416  *			   Do not resolve data elements for DATA chains.
417  *			   (typically used to avoid device/logical buffer
418  *			    aliasing for data)
419  *
420  * HAMMER2_RESOLVE_ALWAYS- Always resolve the data element.
421  *
422  * HAMMER2_RESOLVE_SHARED- (flag) The chain is locked shared, otherwise
423  *			   it will be locked exclusive.
424  *
425  * NOTE: Embedded elements (volume header, inodes) are always resolved
426  *	 regardless.
427  *
428  * NOTE: Specifying HAMMER2_RESOLVE_ALWAYS on a newly-created non-embedded
429  *	 element will instantiate and zero its buffer, and flush it on
430  *	 release.
431  *
432  * NOTE: (data) elements are normally locked RESOLVE_NEVER or RESOLVE_MAYBE
433  *	 so as not to instantiate a device buffer, which could alias against
434  *	 a logical file buffer.  However, if ALWAYS is specified the
435  *	 device buffer will be instantiated anyway.
436  */
int
hammer2_chain_lock(hammer2_mount_t *hmp, hammer2_chain_t *chain, int how)
{
	hammer2_blockref_t *bref;
	hammer2_off_t pbase;	/* physical base of the device buffer */
	hammer2_off_t peof;	/* cluster-read end-of-range */
	ccms_state_t ostate;
	size_t boff;		/* offset of our data within the buffer */
	size_t bbytes;		/* device buffer size */
	int error;
	char *bdata;

	/*
	 * Ref and lock the element.  Recursive locks are allowed.
	 */
	hammer2_chain_ref(hmp, chain);
	if (how & HAMMER2_RESOLVE_SHARED)
		ccms_thread_lock(&chain->cst, CCMS_STATE_SHARED);
	else
		ccms_thread_lock(&chain->cst, CCMS_STATE_EXCLUSIVE);

	/*
	 * If we already have a valid data pointer no further action is
	 * necessary.
	 */
	if (chain->data)
		return (0);

	/*
	 * Do we have to resolve the data?  NEVER never does; MAYBE skips
	 * INITIAL-state chains and DATA chains (to avoid device/logical
	 * buffer aliasing), otherwise falls through to ALWAYS.
	 */
	switch(how & HAMMER2_RESOLVE_MASK) {
	case HAMMER2_RESOLVE_NEVER:
		return(0);
	case HAMMER2_RESOLVE_MAYBE:
		if (chain->flags & HAMMER2_CHAIN_INITIAL)
			return(0);
		if (chain->bref.type == HAMMER2_BREF_TYPE_DATA)
			return(0);
		/* fall through */
	case HAMMER2_RESOLVE_ALWAYS:
		break;
	}

	/*
	 * Upgrade to an exclusive lock so we can safely manipulate the
	 * buffer cache.  If another thread got to it before us we
	 * can just return.
	 */
	ostate = ccms_thread_lock_upgrade(&chain->cst);
	if (chain->data) {
		ccms_thread_lock_restore(&chain->cst, ostate);
		return (0);
	}

	/*
	 * We must resolve to a device buffer, either by issuing I/O or
	 * by creating a zero-fill element.  We do not mark the buffer
	 * dirty when creating a zero-fill element (the hammer2_chain_modify()
	 * API must still be used to do that).
	 *
	 * The device buffer is variable-sized in powers of 2 down
	 * to HAMMER2_MINALLOCSIZE (typically 1K).  A 64K physical storage
	 * chunk always contains buffers of the same size. (XXX)
	 *
	 * The minimum physical IO size may be larger than the variable
	 * block size.
	 */
	bref = &chain->bref;

	if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
		bbytes = HAMMER2_MINIOSIZE;
	/* align the media offset down to the device buffer boundary */
	pbase = bref->data_off & ~(hammer2_off_t)(bbytes - 1);
	peof = (pbase + HAMMER2_PBUFSIZE64) & ~HAMMER2_PBUFMASK64;
	boff = bref->data_off & HAMMER2_OFF_MASK & (bbytes - 1);
	KKASSERT(pbase != 0);

	/*
	 * The getblk() optimization can only be used on newly created
	 * elements if the physical block size matches the request --
	 * the buffer's contents will be fully overwritten so no read
	 * is needed.
	 */
	if ((chain->flags & HAMMER2_CHAIN_INITIAL) &&
	    chain->bytes == bbytes) {
		chain->bp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
		error = 0;
	} else if (hammer2_cluster_enable) {
		error = cluster_read(hmp->devvp, peof, pbase, bbytes,
				     HAMMER2_PBUFSIZE, HAMMER2_PBUFSIZE,
				     &chain->bp);
	} else {
		error = bread(hmp->devvp, pbase, bbytes, &chain->bp);
	}

	if (error) {
		kprintf("hammer2_chain_get: I/O error %016jx: %d\n",
			(intmax_t)pbase, error);
		bqrelse(chain->bp);
		chain->bp = NULL;
		ccms_thread_lock_restore(&chain->cst, ostate);
		return (error);
	}

	/*
	 * Zero the data area if the chain is in the INITIAL-create state.
	 * Mark the buffer for bdwrite().
	 */
	bdata = (char *)chain->bp->b_data + boff;
	if (chain->flags & HAMMER2_CHAIN_INITIAL) {
		bzero(bdata, chain->bytes);
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
	}

	/*
	 * Setup the data pointer, either pointing it to an embedded data
	 * structure and copying the data from the buffer, or pointing it
	 * into the buffer.
	 *
	 * The buffer is not retained when copying to an embedded data
	 * structure in order to avoid potential deadlocks or recursions
	 * on the same physical buffer.
	 */
	switch (bref->type) {
	case HAMMER2_BREF_TYPE_VOLUME:
		/*
		 * Copy data from bp to embedded buffer
		 */
		panic("hammer2_chain_lock: called on unresolved volume header");
#if 0
		/* NOT YET */
		KKASSERT(pbase == 0);
		KKASSERT(chain->bytes == HAMMER2_PBUFSIZE);
		bcopy(bdata, &hmp->voldata, chain->bytes);
		chain->data = (void *)&hmp->voldata;
		bqrelse(chain->bp);
		chain->bp = NULL;
#endif
		break;
	case HAMMER2_BREF_TYPE_INODE:
		/*
		 * Copy data from bp to embedded buffer, do not retain the
		 * device buffer.
		 */
		bcopy(bdata, &chain->u.ip->ip_data, chain->bytes);
		chain->data = (void *)&chain->u.ip->ip_data;
		bqrelse(chain->bp);
		chain->bp = NULL;
		break;
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_DATA:
	default:
		/*
		 * Point data at the device buffer and leave bp intact.
		 */
		chain->data = (void *)bdata;
		break;
	}

	/*
	 * Make sure the bp is not specifically owned by this thread before
	 * restoring to a possibly shared lock, so another hammer2 thread
	 * can release it.
	 */
	if (chain->bp)
		BUF_KERNPROC(chain->bp);
	ccms_thread_lock_restore(&chain->cst, ostate);
	return (0);
}
604 
605 /*
606  * Unlock and deref a chain element.
607  *
608  * On the last lock release any non-embedded data (chain->bp) will be
609  * retired.
610  */
void
hammer2_chain_unlock(hammer2_mount_t *hmp, hammer2_chain_t *chain)
{
	long *counterp;

	/*
	 * Release the CST lock but with a special 1->0 transition case.
	 *
	 * Returns non-zero if lock references remain.  When zero is
	 * returned the last lock reference is retained and any shared
	 * lock is upgraded to an exclusive lock for final disposition.
	 */
	if (ccms_thread_unlock_zero(&chain->cst)) {
		KKASSERT(chain->refs > 1);
		atomic_add_int(&chain->refs, -1);
		return;
	}

	/*
	 * Shortcut the case if the data is embedded or not resolved.
	 *
	 * Do NOT null-out pointers to embedded data (e.g. inode).
	 *
	 * The DIRTYBP flag is non-applicable in this situation and can
	 * be cleared to keep the flags state clean.
	 */
	if (chain->bp == NULL) {
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
		ccms_thread_unlock(&chain->cst);
		hammer2_chain_drop(hmp, chain);
		return;
	}

	/*
	 * Statistics: count the write by bref type; IOFLUSH buffers go
	 * into the ioa_* (async/flush) counters, otherwise iod_*.
	 */
	if ((chain->flags & HAMMER2_CHAIN_DIRTYBP) == 0) {
		;	/* clean buffer, nothing to count */
	} else if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
		switch(chain->bref.type) {
		case HAMMER2_BREF_TYPE_DATA:
			counterp = &hammer2_ioa_file_write;
			break;
		case HAMMER2_BREF_TYPE_INODE:
			counterp = &hammer2_ioa_meta_write;
			break;
		case HAMMER2_BREF_TYPE_INDIRECT:
			counterp = &hammer2_ioa_indr_write;
			break;
		default:
			counterp = &hammer2_ioa_volu_write;
			break;
		}
		++*counterp;
	} else {
		switch(chain->bref.type) {
		case HAMMER2_BREF_TYPE_DATA:
			counterp = &hammer2_iod_file_write;
			break;
		case HAMMER2_BREF_TYPE_INODE:
			counterp = &hammer2_iod_meta_write;
			break;
		case HAMMER2_BREF_TYPE_INDIRECT:
			counterp = &hammer2_iod_indr_write;
			break;
		default:
			counterp = &hammer2_iod_volu_write;
			break;
		}
		++*counterp;
	}

	/*
	 * Clean out the bp.
	 *
	 * If a device buffer was used for data be sure to destroy the
	 * buffer when we are done to avoid aliases (XXX what about the
	 * underlying VM pages?).
	 */
	if (chain->bref.type == HAMMER2_BREF_TYPE_DATA)
		chain->bp->b_flags |= B_RELBUF;

	/*
	 * The DIRTYBP flag tracks whether we have to bdwrite() the buffer
	 * or not.  The flag will get re-set when chain_modify() is called,
	 * even if MODIFIED is already set, allowing the OS to retire the
	 * buffer independent of a hammer2 flush.
	 */
	chain->data = NULL;
	if (chain->flags & HAMMER2_CHAIN_DIRTYBP) {
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
		if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
			/* write immediately and release the buffer */
			atomic_clear_int(&chain->flags,
					 HAMMER2_CHAIN_IOFLUSH);
			chain->bp->b_flags |= B_RELBUF;
			cluster_awrite(chain->bp);
		} else {
			/* delayed write, allow clustering */
			chain->bp->b_flags |= B_CLUSTEROK;
			bdwrite(chain->bp);
		}
	} else {
		if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
			atomic_clear_int(&chain->flags,
					 HAMMER2_CHAIN_IOFLUSH);
			chain->bp->b_flags |= B_RELBUF;
			brelse(chain->bp);
		} else {
			/* bp might still be dirty */
			bqrelse(chain->bp);
		}
	}
	chain->bp = NULL;
	ccms_thread_unlock(&chain->cst);
	hammer2_chain_drop(hmp, chain);
}
726 
727 /*
728  * Resize the chain's physical storage allocation.  Chains can be resized
729  * smaller without reallocating the storage.  Resizing larger will reallocate
730  * the storage.
731  *
732  * Must be passed a locked chain.
733  *
734  * If you want the resize code to copy the data to the new block then the
735  * caller should lock the chain RESOLVE_MAYBE or RESOLVE_ALWAYS.
736  *
737  * If the caller already holds a logical buffer containing the data and
738  * intends to bdwrite() that buffer resolve with RESOLVE_NEVER.  The resize
739  * operation will then not copy the data.
740  *
741  * This function is mostly used with DATA blocks locked RESOLVE_NEVER in order
742  * to avoid instantiating a device buffer that conflicts with the vnode
743  * data buffer.
744  *
745  * XXX flags currently ignored, uses chain->bp to detect data/no-data.
746  */
void
hammer2_chain_resize(hammer2_inode_t *ip, hammer2_chain_t *chain,
		     int nradix, int flags)
{
	hammer2_mount_t *hmp = ip->hmp;
	struct buf *nbp;
	hammer2_off_t pbase;
	size_t obytes;		/* old allocation size */
	size_t nbytes;		/* new allocation size */
	size_t bbytes;		/* device buffer size */
	int boff;
	char *bdata;
	int error;

	/*
	 * Only data and indirect blocks can be resized for now
	 */
	KKASSERT(chain != &hmp->vchain);
	KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA ||
		 chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT);

	/*
	 * Nothing to do if the element is already the proper size.
	 *
	 * NOTE(review): assumes nradix < 32 (1U << nradix is UB for
	 * larger shifts) -- confirm callers constrain the radix.
	 */
	obytes = chain->bytes;
	nbytes = 1U << nradix;
	if (obytes == nbytes)
		return;

	/*
	 * Set MODIFIED and add a chain ref to prevent destruction.  Both
	 * modified flags share the same ref.
	 *
	 * If the chain is already marked MODIFIED then we can safely
	 * return the previous allocation to the pool without having to
	 * worry about snapshots.
	 */
	if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0) {
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED |
					      HAMMER2_CHAIN_MODIFY_TID);
		hammer2_chain_ref(hmp, chain);
	} else {
		hammer2_freemap_free(hmp, chain->bref.data_off,
				     chain->bref.type);
	}

	/*
	 * Relocate the block, even if making it smaller (because different
	 * block sizes may be in different regions).
	 */
	chain->bref.data_off = hammer2_freemap_alloc(hmp, chain->bref.type,
						     nbytes);
	chain->bytes = nbytes;
	ip->delta_dcount += (ssize_t)(nbytes - obytes); /* XXX atomic */

	/*
	 * The device buffer may be larger than the allocation size.
	 */
	if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
		bbytes = HAMMER2_MINIOSIZE;
	pbase = chain->bref.data_off & ~(hammer2_off_t)(bbytes - 1);
	boff = chain->bref.data_off & HAMMER2_OFF_MASK & (bbytes - 1);

	/*
	 * Only copy the data if resolved, otherwise the caller is
	 * responsible (e.g. locked RESOLVE_NEVER and intends to
	 * bdwrite() a logical buffer itself).
	 */
	if (chain->bp) {
		KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
			 chain->bref.type == HAMMER2_BREF_TYPE_DATA);
		KKASSERT(chain != &hmp->vchain);	/* safety */

		/*
		 * The getblk() optimization can only be used if the
		 * physical block size matches the request (buffer will
		 * be fully overwritten, no read needed).
		 */
		if (nbytes == bbytes) {
			nbp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
			error = 0;
		} else {
			error = bread(hmp->devvp, pbase, bbytes, &nbp);
			KKASSERT(error == 0);
		}
		bdata = (char *)nbp->b_data + boff;

		/* copy old contents; zero-fill any growth */
		if (nbytes < obytes) {
			bcopy(chain->data, bdata, nbytes);
		} else {
			bcopy(chain->data, bdata, obytes);
			bzero(bdata + obytes, nbytes - obytes);
		}

		/*
		 * NOTE: The INITIAL state of the chain is left intact.
		 *	 We depend on hammer2_chain_modify() to do the
		 *	 right thing.
		 *
		 * NOTE: We set B_NOCACHE to throw away the previous bp and
		 *	 any VM backing store, even if it was dirty.
		 *	 Otherwise we run the risk of a logical/device
		 *	 conflict on reallocation.
		 */
		chain->bp->b_flags |= B_RELBUF | B_NOCACHE;
		brelse(chain->bp);
		chain->bp = nbp;
		chain->data = (void *)bdata;
		hammer2_chain_modify(hmp, chain, 0);
	}

	/*
	 * Make sure the chain is marked MOVED and SUBMOD is set in the
	 * parent(s) so the adjustments are picked up by flush.
	 * MOVED holds its own chain ref.
	 */
	if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
		hammer2_chain_ref(hmp, chain);
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
	}
	hammer2_chain_parent_setsubmod(hmp, chain);
}
866 
867 /*
868  * Convert a locked chain that was retrieved read-only to read-write.
869  *
870  * If not already marked modified a new physical block will be allocated
871  * and assigned to the bref.
872  *
873  * Non-data blocks - The chain should be locked to at least the RESOLVE_MAYBE
874  *		     level or the COW operation will not work.
875  *
876  * Data blocks	   - The chain is usually locked RESOLVE_NEVER so as not to
877  *		     run the data through the device buffers.
878  */
879 void
880 hammer2_chain_modify(hammer2_mount_t *hmp, hammer2_chain_t *chain, int flags)
881 {
882 	struct buf *nbp;
883 	int error;
884 	hammer2_off_t pbase;
885 	size_t bbytes;
886 	size_t boff;
887 	void *bdata;
888 
889 	/*
890 	 * Tells flush that modify_tid must be updated, otherwise only
891 	 * mirror_tid is updated.  This is the default.
892 	 */
893 	if ((flags & HAMMER2_MODIFY_NO_MODIFY_TID) == 0)
894 		atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFY_TID);
895 
896 	/*
897 	 * If the chain is already marked MODIFIED we can just return.
898 	 *
899 	 * However, it is possible that a prior lock/modify sequence
900 	 * retired the buffer.  During this lock/modify sequence MODIFIED
901 	 * may still be set but the buffer could wind up clean.  Since
902 	 * the caller is going to modify the buffer further we have to
903 	 * be sure that DIRTYBP is set again.
904 	 */
905 	if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
906 		if ((flags & HAMMER2_MODIFY_OPTDATA) == 0 &&
907 		    chain->bp == NULL) {
908 			goto skip1;
909 		}
910 		atomic_set_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
911 		return;
912 	}
913 
914 	/*
915 	 * Set MODIFIED and add a chain ref to prevent destruction.  Both
916 	 * modified flags share the same ref.
917 	 */
918 	atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
919 	hammer2_chain_ref(hmp, chain);
920 
921 	/*
922 	 * We must allocate the copy-on-write block.
923 	 *
924 	 * If the data is embedded no other action is required.
925 	 *
926 	 * If the data is not embedded we acquire and clear the
927 	 * new block.  If chain->data is not NULL we then do the
928 	 * copy-on-write.  chain->data will then be repointed to the new
929 	 * buffer and the old buffer will be released.
930 	 *
931 	 * For newly created elements with no prior allocation we go
932 	 * through the copy-on-write steps except without the copying part.
933 	 */
934 	if (chain != &hmp->vchain) {
935 		if ((hammer2_debug & 0x0001) &&
936 		    (chain->bref.data_off & HAMMER2_OFF_MASK)) {
937 			kprintf("Replace %d\n", chain->bytes);
938 		}
939 		chain->bref.data_off =
940 			hammer2_freemap_alloc(hmp, chain->bref.type,
941 					      chain->bytes);
942 		/* XXX failed allocation */
943 	}
944 
945 	/*
946 	 * If data instantiation is optional and the chain has no current
947 	 * data association (typical for DATA and newly-created INDIRECT
948 	 * elements), don't instantiate the buffer now.
949 	 */
950 	if ((flags & HAMMER2_MODIFY_OPTDATA) && chain->bp == NULL)
951 		goto skip2;
952 
953 skip1:
954 	/*
955 	 * Setting the DIRTYBP flag will cause the buffer to be dirtied or
956 	 * written-out on unlock.  This bit is independent of the MODIFIED
957 	 * bit because the chain may still need meta-data adjustments done
958 	 * by virtue of MODIFIED for its parent, and the buffer can be
959 	 * flushed out (possibly multiple times) by the OS before that.
960 	 *
961 	 * Clearing the INITIAL flag (for indirect blocks) indicates that
962 	 * a zero-fill buffer has been instantiated.
963 	 */
964 	atomic_set_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
965 	atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
966 
967 	/*
968 	 * We currently should never instantiate a device buffer for a
969 	 * data chain.
970 	 */
971 	KKASSERT(chain->bref.type != HAMMER2_BREF_TYPE_DATA);
972 
973 	/*
974 	 * Execute COW operation
975 	 */
976 	switch(chain->bref.type) {
977 	case HAMMER2_BREF_TYPE_VOLUME:
978 	case HAMMER2_BREF_TYPE_INODE:
979 		/*
980 		 * The data is embedded, no copy-on-write operation is
981 		 * needed.
982 		 */
983 		KKASSERT(chain->bp == NULL);
984 		break;
985 	case HAMMER2_BREF_TYPE_DATA:
986 	case HAMMER2_BREF_TYPE_INDIRECT:
987 		/*
988 		 * Perform the copy-on-write operation
989 		 */
990 		KKASSERT(chain != &hmp->vchain);	/* safety */
991 		/*
992 		 * The device buffer may be larger than the allocation size.
993 		 */
994 		if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
995 			bbytes = HAMMER2_MINIOSIZE;
996 		pbase = chain->bref.data_off & ~(hammer2_off_t)(bbytes - 1);
997 		boff = chain->bref.data_off & HAMMER2_OFF_MASK & (bbytes - 1);
998 
999 		/*
1000 		 * The getblk() optimization can only be used if the
1001 		 * physical block size matches the request.
1002 		 */
1003 		if (chain->bytes == bbytes) {
1004 			nbp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
1005 			error = 0;
1006 		} else {
1007 			error = bread(hmp->devvp, pbase, bbytes, &nbp);
1008 			KKASSERT(error == 0);
1009 		}
1010 		bdata = (char *)nbp->b_data + boff;
1011 
1012 		/*
1013 		 * Copy or zero-fill on write depending on whether
1014 		 * chain->data exists or not.
1015 		 */
1016 		if (chain->data) {
1017 			bcopy(chain->data, bdata, chain->bytes);
1018 			KKASSERT(chain->bp != NULL);
1019 		} else {
1020 			bzero(bdata, chain->bytes);
1021 		}
1022 		if (chain->bp) {
1023 			chain->bp->b_flags |= B_RELBUF;
1024 			brelse(chain->bp);
1025 		}
1026 		chain->bp = nbp;
1027 		chain->data = bdata;
1028 		break;
1029 	default:
1030 		panic("hammer2_chain_modify: illegal non-embedded type %d",
1031 		      chain->bref.type);
1032 		break;
1033 
1034 	}
1035 skip2:
1036 	if ((flags & HAMMER2_MODIFY_NOSUB) == 0)
1037 		hammer2_chain_parent_setsubmod(hmp, chain);
1038 }
1039 
1040 /*
1041  * Mark the volume as having been modified.  This short-cut version
1042  * does not have to lock the volume's chain, which allows the ioctl
1043  * code to make adjustments to connections without deadlocking.
1044  */
void
hammer2_modify_volume(hammer2_mount_t *hmp)
{
	/*
	 * Flag the volume data as modified under the voldata lock.
	 * MODIFIED_AUX (rather than MODIFIED) is used so the volume
	 * chain itself never has to be locked here, which lets the
	 * ioctl path adjust connections without deadlocking (see the
	 * function header comment).
	 */
	hammer2_voldata_lock(hmp);
	atomic_set_int(&hmp->vchain.flags, HAMMER2_CHAIN_MODIFIED_AUX);
	hammer2_voldata_unlock(hmp);
}
1052 
1053 /*
1054  * Locate an in-memory chain.  The parent must be locked.  The in-memory
1055  * chain is returned or NULL if no in-memory chain is present.
1056  *
1057  * NOTE: A chain on-media might exist for this index when NULL is returned.
1058  */
1059 hammer2_chain_t *
1060 hammer2_chain_find(hammer2_mount_t *hmp, hammer2_chain_t *parent, int index)
1061 {
1062 	hammer2_chain_t dummy;
1063 	hammer2_chain_t *chain;
1064 
1065 	dummy.index = index;
1066 	chain = RB_FIND(hammer2_chain_tree, &parent->rbhead, &dummy);
1067 	return (chain);
1068 }
1069 
1070 /*
1071  * Return a locked chain structure with all associated data acquired.
1072  *
1073  * Caller must lock the parent on call, the returned child will be locked.
1074  */
hammer2_chain_t *
hammer2_chain_get(hammer2_mount_t *hmp, hammer2_chain_t *parent,
		  int index, int flags)
{
	hammer2_blockref_t *bref;
	hammer2_inode_t *ip;
	hammer2_chain_t *chain;
	hammer2_chain_t dummy;
	int how;
	ccms_state_t ostate;

	/*
	 * Figure out how to lock.  NODATA and NOLOCK requests do not
	 * resolve the data (RESOLVE_NEVER).  MAYBE can be used to
	 * optimize the initial-create state for indirect blocks.
	 * SHARED and NOLOCK requests use shared chain locks.
	 */
	if (flags & (HAMMER2_LOOKUP_NODATA | HAMMER2_LOOKUP_NOLOCK))
		how = HAMMER2_RESOLVE_NEVER;
	else
		how = HAMMER2_RESOLVE_MAYBE;
	if (flags & (HAMMER2_LOOKUP_SHARED | HAMMER2_LOOKUP_NOLOCK))
		how |= HAMMER2_RESOLVE_SHARED;

	/*
	 * First see if we have a (possibly modified) chain element cached
	 * for this (parent, index).  Acquire the data if necessary.
	 *
	 * If chain->data is non-NULL the chain should already be marked
	 * modified.
	 */
	dummy.index = index;
	chain = RB_FIND(hammer2_chain_tree, &parent->rbhead, &dummy);
	if (chain) {
		if (flags & HAMMER2_LOOKUP_NOLOCK)
			hammer2_chain_ref(hmp, chain);
		else
			hammer2_chain_lock(hmp, chain, how);
		return(chain);
	}

	/*
	 * Upgrade our thread lock and handle any race that may have
	 * occurred.  Leave the lock upgraded for the rest of the get.
	 * We have to do this because we will be modifying the chain
	 * structure (the RB insertion below).  The re-lookup catches a
	 * chain inserted by a competing thread during the upgrade.
	 */
	ostate = ccms_thread_lock_upgrade(&parent->cst);
	chain = RB_FIND(hammer2_chain_tree, &parent->rbhead, &dummy);
	if (chain) {
		if (flags & HAMMER2_LOOKUP_NOLOCK)
			hammer2_chain_ref(hmp, chain);
		else
			hammer2_chain_lock(hmp, chain, how);
		ccms_thread_lock_restore(&parent->cst, ostate);
		return(chain);
	}

	/*
	 * The get function must always succeed, panic if there's no
	 * data to index.  An INITIAL parent has no media backing, so
	 * a miss in the RB tree means the index cannot exist.
	 */
	if (parent->flags & HAMMER2_CHAIN_INITIAL) {
		ccms_thread_lock_restore(&parent->cst, ostate);
		panic("hammer2_chain_get: Missing bref(1)");
		/* NOT REACHED */
	}

	/*
	 * Otherwise lookup the bref and issue I/O (switch on the parent)
	 */
	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		KKASSERT(index >= 0 && index < HAMMER2_SET_COUNT);
		bref = &parent->data->ipdata.u.blockset.blockref[index];
		break;
	case HAMMER2_BREF_TYPE_INDIRECT:
		KKASSERT(parent->data != NULL);
		KKASSERT(index >= 0 &&
			 index < parent->bytes / sizeof(hammer2_blockref_t));
		bref = &parent->data->npdata.blockref[index];
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		KKASSERT(index >= 0 && index < HAMMER2_SET_COUNT);
		bref = &hmp->voldata.sroot_blockset.blockref[index];
		break;
	default:
		bref = NULL;
		panic("hammer2_chain_get: unrecognized blockref type: %d",
		      parent->bref.type);
	}
	if (bref->type == 0) {
		panic("hammer2_chain_get: Missing bref(2)");
		/* NOT REACHED */
	}

	/*
	 * Allocate a chain structure representing the existing media
	 * entry.
	 *
	 * The locking operation we do later will issue I/O to read it.
	 */
	chain = hammer2_chain_alloc(hmp, bref);

	/*
	 * Link the chain into its parent.  Caller is expected to hold an
	 * exclusive lock on the parent.
	 */
	chain->parent = parent;
	chain->index = index;
	if (RB_INSERT(hammer2_chain_tree, &parent->rbhead, chain))
		panic("hammer2_chain_link: collision");
	KKASSERT(parent->refs > 0);
	atomic_add_int(&parent->refs, 1);	/* for red-black entry */
	ccms_thread_lock_restore(&parent->cst, ostate);

	/*
	 * Additional linkage for inodes.  Reuse the parent pointer to
	 * find the parent directory, skipping over any intervening
	 * indirect blocks.
	 *
	 * The ccms_inode is initialized from its parent directory.  The
	 * chain of ccms_inode's is seeded by the mount code.
	 */
	if (bref->type == HAMMER2_BREF_TYPE_INODE) {
		ip = chain->u.ip;
		while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT)
			parent = parent->parent;
		if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) {
			ip->pip = parent->u.ip;
			ip->pmp = parent->u.ip->pmp;
			ip->depth = parent->u.ip->depth + 1;
			ccms_cst_init(&ip->topo_cst, &ip->chain);
		}
	}

	/*
	 * Our new chain structure has already been referenced and locked
	 * but the lock code handles the I/O so call it to resolve the data.
	 * Then release one of our two exclusive locks.
	 *
	 * If NOLOCK is set the release will release the one-and-only lock.
	 */
	if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0) {
		hammer2_chain_lock(hmp, chain, how);	/* recursive lock */
		hammer2_chain_drop(hmp, chain);		/* excess ref */
	}
	ccms_thread_unlock(&chain->cst);			/* from alloc */

	return (chain);
}
1223 
1224 /*
1225  * Locate any key between key_beg and key_end inclusive.  (*parentp)
1226  * typically points to an inode but can also point to a related indirect
1227  * block and this function will recurse upwards and find the inode again.
1228  *
1229  * WARNING!  THIS DOES NOT RETURN KEYS IN LOGICAL KEY ORDER!  ANY KEY
1230  *	     WITHIN THE RANGE CAN BE RETURNED.  HOWEVER, AN ITERATION
1231  *	     WHICH PICKS UP WHERE WE LEFT OFF WILL CONTINUE THE SCAN.
1232  *
1233  * (*parentp) must be exclusively locked and referenced and can be an inode
1234  * or an existing indirect block within the inode.
1235  *
1236  * On return (*parentp) will be modified to point at the deepest parent chain
1237  * element encountered during the search, as a helper for an insertion or
1238  * deletion.   The new (*parentp) will be locked and referenced and the old
1239  * will be unlocked and dereferenced (no change if they are both the same).
1240  *
1241  * The matching chain will be returned exclusively locked and referenced.
1242  *
1243  * NULL is returned if no match was found, but (*parentp) will still
1244  * potentially be adjusted.
1245  *
1246  * This function will also recurse up the chain if the key is not within the
1247  * current parent's range.  (*parentp) can never be set to NULL.  An iteration
1248  * can simply allow (*parentp) to float inside the loop.
1249  */
hammer2_chain_t *
hammer2_chain_lookup(hammer2_mount_t *hmp, hammer2_chain_t **parentp,
		     hammer2_key_t key_beg, hammer2_key_t key_end,
		     int flags)
{
	hammer2_chain_t *parent;
	hammer2_chain_t *chain;
	hammer2_chain_t *tmp;
	hammer2_blockref_t *base;
	hammer2_blockref_t *bref;
	hammer2_key_t scan_beg;
	hammer2_key_t scan_end;
	int count = 0;
	int i;
	int how_always = HAMMER2_RESOLVE_ALWAYS;
	int how_maybe = HAMMER2_RESOLVE_MAYBE;

	/* SHARED/NOLOCK callers take shared chain locks throughout */
	if (flags & (HAMMER2_LOOKUP_SHARED | HAMMER2_LOOKUP_NOLOCK)) {
		how_maybe |= HAMMER2_RESOLVE_SHARED;
		how_always |= HAMMER2_RESOLVE_SHARED;
	}

	/*
	 * Recurse (*parentp) upward if necessary until the parent completely
	 * encloses the key range or we hit the inode.
	 *
	 * Lock hand-off order matters: ref the old parent so it cannot be
	 * destroyed, unlock it, lock the new parent, and only then drop
	 * the extra ref on the old parent.
	 */
	parent = *parentp;
	while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
		scan_beg = parent->bref.key;
		scan_end = scan_beg +
			   ((hammer2_key_t)1 << parent->bref.keybits) - 1;
		if (key_beg >= scan_beg && key_end <= scan_end)
			break;
		hammer2_chain_ref(hmp, parent);		/* ref old parent */
		hammer2_chain_unlock(hmp, parent);	/* unlock old parent */
		parent = parent->parent;
							/* lock new parent */
		hammer2_chain_lock(hmp, parent, how_maybe);
		hammer2_chain_drop(hmp, *parentp);	/* drop old parent */
		*parentp = parent;			/* new parent */
	}

again:
	/*
	 * Locate the blockref array.  Currently we do a fully associative
	 * search through the array.
	 */
	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		/*
		 * Special shortcut for embedded data returns the inode
		 * itself.  Callers must detect this condition and access
		 * the embedded data (the strategy code does this for us).
		 *
		 * This is only applicable to regular files and softlinks.
		 */
		if (parent->data->ipdata.op_flags & HAMMER2_OPFLAG_DIRECTDATA) {
			if (flags & HAMMER2_LOOKUP_NOLOCK)
				hammer2_chain_ref(hmp, parent);
			else
				hammer2_chain_lock(hmp, parent, how_always);
			return (parent);
		}
		base = &parent->data->ipdata.u.blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_INDIRECT:
		/*
		 * Optimize indirect blocks in the INITIAL state to avoid
		 * I/O.  An INITIAL indirect block has no media backing;
		 * only the in-memory RB tree can hold entries.
		 */
		if (parent->flags & HAMMER2_CHAIN_INITIAL) {
			base = NULL;
		} else {
			if (parent->data == NULL)
				panic("parent->data is NULL");
			base = &parent->data->npdata.blockref[0];
		}
		count = parent->bytes / sizeof(hammer2_blockref_t);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		base = &hmp->voldata.sroot_blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	default:
		panic("hammer2_chain_lookup: unrecognized blockref type: %d",
		      parent->bref.type);
		base = NULL;	/* safety */
		count = 0;	/* safety */
	}

	/*
	 * If the element and key overlap we use the element.
	 *
	 * An in-memory chain at a given index overrides the media
	 * blockref at the same index (the media copy may be stale).
	 */
	bref = NULL;
	for (i = 0; i < count; ++i) {
		tmp = hammer2_chain_find(hmp, parent, i);
		if (tmp) {
			bref = &tmp->bref;
			KKASSERT(bref->type != 0);
		} else if (base == NULL || base[i].type == 0) {
			continue;
		} else {
			bref = &base[i];
		}
		scan_beg = bref->key;
		scan_end = scan_beg + ((hammer2_key_t)1 << bref->keybits) - 1;
		if (key_beg <= scan_end && key_end >= scan_beg)
			break;
	}
	if (i == count) {
		if (key_beg == key_end)
			return (NULL);
		return (hammer2_chain_next(hmp, parentp, NULL,
					   key_beg, key_end, flags));
	}

	/*
	 * Acquire the new chain element.  If the chain element is an
	 * indirect block we must search recursively.
	 */
	chain = hammer2_chain_get(hmp, parent, i, flags);
	if (chain == NULL)
		return (NULL);

	/*
	 * If the chain element is an indirect block it becomes the new
	 * parent and we loop on it.
	 *
	 * The parent always has to be locked with at least RESOLVE_MAYBE,
	 * so it might need a fixup if the caller passed incompatible flags.
	 */
	if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
		hammer2_chain_unlock(hmp, parent);
		*parentp = parent = chain;
		if (flags & HAMMER2_LOOKUP_NOLOCK) {
			hammer2_chain_lock(hmp, chain, how_maybe);
			hammer2_chain_drop(hmp, chain);	/* excess ref */
		} else if (flags & HAMMER2_LOOKUP_NODATA) {
			hammer2_chain_lock(hmp, chain, how_maybe);
			hammer2_chain_unlock(hmp, chain);
		}
		goto again;
	}

	/*
	 * All done, return chain
	 */
	return (chain);
}
1400 
1401 /*
1402  * After having issued a lookup we can iterate all matching keys.
1403  *
1404  * If chain is non-NULL we continue the iteration from just after it's index.
1405  *
1406  * If chain is NULL we assume the parent was exhausted and continue the
1407  * iteration at the next parent.
1408  *
1409  * parent must be locked on entry and remains locked throughout.  chain's
1410  * lock status must match flags.
1411  */
hammer2_chain_t *
hammer2_chain_next(hammer2_mount_t *hmp, hammer2_chain_t **parentp,
		   hammer2_chain_t *chain,
		   hammer2_key_t key_beg, hammer2_key_t key_end,
		   int flags)
{
	hammer2_chain_t *parent;
	hammer2_chain_t *tmp;
	hammer2_blockref_t *base;
	hammer2_blockref_t *bref;
	hammer2_key_t scan_beg;
	hammer2_key_t scan_end;
	int i;
	int how_maybe = HAMMER2_RESOLVE_MAYBE;
	int count;

	/* SHARED/NOLOCK callers take shared chain locks throughout */
	if (flags & (HAMMER2_LOOKUP_SHARED | HAMMER2_LOOKUP_NOLOCK))
		how_maybe |= HAMMER2_RESOLVE_SHARED;

	parent = *parentp;

again:
	/*
	 * Calculate the next index and recalculate the parent if necessary.
	 */
	if (chain) {
		/*
		 * Continue iteration within current parent.  If not NULL
		 * the passed-in chain may or may not be locked, based on
		 * the LOOKUP_NOLOCK flag (passed in as returned from lookup
		 * or a prior next).
		 */
		i = chain->index + 1;
		if (flags & HAMMER2_LOOKUP_NOLOCK)
			hammer2_chain_drop(hmp, chain);
		else
			hammer2_chain_unlock(hmp, chain);

		/*
		 * Any scan where the lookup returned degenerate data embedded
		 * in the inode has an invalid index and must terminate.
		 */
		if (chain == parent)
			return(NULL);
		chain = NULL;
	} else if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT) {
		/*
		 * We reached the end of the iteration.
		 */
		return (NULL);
	} else {
		/*
		 * Continue iteration with next parent unless the current
		 * parent covers the range.
		 *
		 * Same lock hand-off discipline as in the lookup code:
		 * ref the new parent, unlock the old one, lock the new
		 * one, then drop the excess ref.
		 */
		hammer2_chain_t *nparent;

		scan_beg = parent->bref.key;
		scan_end = scan_beg +
			    ((hammer2_key_t)1 << parent->bref.keybits) - 1;
		if (key_beg >= scan_beg && key_end <= scan_end)
			return (NULL);

		i = parent->index + 1;
		nparent = parent->parent;
		hammer2_chain_ref(hmp, nparent);	/* ref new parent */
		hammer2_chain_unlock(hmp, parent);	/* unlock old parent */
							/* lock new parent */
		hammer2_chain_lock(hmp, nparent, how_maybe);
		hammer2_chain_drop(hmp, nparent);	/* drop excess ref */
		*parentp = parent = nparent;
	}

again2:
	/*
	 * Locate the blockref array.  Currently we do a fully associative
	 * search through the array.
	 */
	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		base = &parent->data->ipdata.u.blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_INDIRECT:
		/* INITIAL indirect blocks have no media backing */
		if (parent->flags & HAMMER2_CHAIN_INITIAL) {
			base = NULL;
		} else {
			KKASSERT(parent->data != NULL);
			base = &parent->data->npdata.blockref[0];
		}
		count = parent->bytes / sizeof(hammer2_blockref_t);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		base = &hmp->voldata.sroot_blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	default:
		panic("hammer2_chain_next: unrecognized blockref type: %d",
		      parent->bref.type);
		base = NULL;	/* safety */
		count = 0;	/* safety */
		break;
	}
	KKASSERT(i <= count);

	/*
	 * Look for the key.  If we are unable to find a match and an exact
	 * match was requested we return NULL.  If a range was requested we
	 * run hammer2_chain_next() to iterate.
	 *
	 * As in the lookup code, an in-memory chain at a given index
	 * overrides the media blockref at the same index.
	 */
	bref = NULL;
	while (i < count) {
		tmp = hammer2_chain_find(hmp, parent, i);
		if (tmp) {
			bref = &tmp->bref;
		} else if (base == NULL || base[i].type == 0) {
			++i;
			continue;
		} else {
			bref = &base[i];
		}
		scan_beg = bref->key;
		scan_end = scan_beg + ((hammer2_key_t)1 << bref->keybits) - 1;
		if (key_beg <= scan_end && key_end >= scan_beg)
			break;
		++i;
	}

	/*
	 * If we couldn't find a match recurse up a parent to continue the
	 * search.
	 */
	if (i == count)
		goto again;

	/*
	 * Acquire the new chain element.  If the chain element is an
	 * indirect block we must search recursively.
	 */
	chain = hammer2_chain_get(hmp, parent, i, flags);
	if (chain == NULL)
		return (NULL);

	/*
	 * If the chain element is an indirect block it becomes the new
	 * parent and we loop on it.
	 *
	 * The parent always has to be locked with at least RESOLVE_MAYBE,
	 * so it might need a fixup if the caller passed incompatible flags.
	 */
	if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
		hammer2_chain_unlock(hmp, parent);
		*parentp = parent = chain;
		chain = NULL;
		if (flags & HAMMER2_LOOKUP_NOLOCK) {
			hammer2_chain_lock(hmp, parent, how_maybe);
			hammer2_chain_drop(hmp, parent);	/* excess ref */
		} else if (flags & HAMMER2_LOOKUP_NODATA) {
			hammer2_chain_lock(hmp, parent, how_maybe);
			hammer2_chain_unlock(hmp, parent);
		}
		i = 0;
		goto again2;
	}

	/*
	 * All done, return chain
	 */
	return (chain);
}
1582 
1583 /*
1584  * Create and return a new hammer2 system memory structure of the specified
1585  * key, type and size and insert it RELATIVE TO (PARENT).
1586  *
1587  * (parent) is typically either an inode or an indirect block, acquired
1588  * acquired as a side effect of issuing a prior failed lookup.  parent
1589  * must be locked and held.  Do not pass the inode chain to this function
1590  * unless that is the chain returned by the failed lookup.
1591  *
1592  * Non-indirect types will automatically allocate indirect blocks as required
1593  * if the new item does not fit in the current (parent).
1594  *
1595  * Indirect types will move a portion of the existing blockref array in
1596  * (parent) into the new indirect type and then use one of the free slots
1597  * to emplace the new indirect type.
1598  *
1599  * A new locked, referenced chain element is returned of the specified type.
1600  * The element may or may not have a data area associated with it:
1601  *
1602  *	VOLUME		not allowed here
 *	INODE		embedded data area will be set-up
1604  *	INDIRECT	not allowed here
1605  *	DATA		no data area will be set-up (caller is expected
1606  *			to have logical buffers, we don't want to alias
1607  *			the data onto device buffers!).
1608  *
1609  * Requires an exclusively locked parent.
1610  */
1611 hammer2_chain_t *
1612 hammer2_chain_create(hammer2_mount_t *hmp, hammer2_chain_t *parent,
1613 		     hammer2_chain_t *chain,
1614 		     hammer2_key_t key, int keybits, int type, size_t bytes)
1615 {
1616 	hammer2_blockref_t dummy;
1617 	hammer2_blockref_t *base;
1618 	hammer2_chain_t dummy_chain;
1619 	int unlock_parent = 0;
1620 	int allocated = 0;
1621 	int count;
1622 	int i;
1623 
1624 	KKASSERT(ccms_thread_lock_owned(&parent->cst));
1625 
1626 	if (chain == NULL) {
1627 		/*
1628 		 * First allocate media space and construct the dummy bref,
1629 		 * then allocate the in-memory chain structure.
1630 		 */
1631 		bzero(&dummy, sizeof(dummy));
1632 		dummy.type = type;
1633 		dummy.key = key;
1634 		dummy.keybits = keybits;
1635 		dummy.data_off = hammer2_bytes_to_radix(bytes);
1636 		chain = hammer2_chain_alloc(hmp, &dummy);
1637 		allocated = 1;
1638 
1639 		/*
1640 		 * We do NOT set INITIAL here (yet).  INITIAL is only
1641 		 * used for indirect blocks.
1642 		 *
1643 		 * Recalculate bytes to reflect the actual media block
1644 		 * allocation.
1645 		 */
1646 		bytes = (hammer2_off_t)1 <<
1647 			(int)(chain->bref.data_off & HAMMER2_OFF_MASK_RADIX);
1648 		chain->bytes = bytes;
1649 
1650 		switch(type) {
1651 		case HAMMER2_BREF_TYPE_VOLUME:
1652 			panic("hammer2_chain_create: called with volume type");
1653 			break;
1654 		case HAMMER2_BREF_TYPE_INODE:
1655 			KKASSERT(bytes == HAMMER2_INODE_BYTES);
1656 			chain->data = (void *)&chain->u.ip->ip_data;
1657 			break;
1658 		case HAMMER2_BREF_TYPE_INDIRECT:
1659 			panic("hammer2_chain_create: cannot be used to"
1660 			      "create indirect block");
1661 			break;
1662 		case HAMMER2_BREF_TYPE_DATA:
1663 		default:
1664 			/* leave chain->data NULL */
1665 			KKASSERT(chain->data == NULL);
1666 			break;
1667 		}
1668 	} else {
1669 		/*
1670 		 * Potentially update the chain's key/keybits.
1671 		 */
1672 		chain->bref.key = key;
1673 		chain->bref.keybits = keybits;
1674 	}
1675 
1676 again:
1677 	/*
1678 	 * Locate a free blockref in the parent's array
1679 	 */
1680 	switch(parent->bref.type) {
1681 	case HAMMER2_BREF_TYPE_INODE:
1682 		KKASSERT((parent->u.ip->ip_data.op_flags &
1683 			  HAMMER2_OPFLAG_DIRECTDATA) == 0);
1684 		KKASSERT(parent->data != NULL);
1685 		base = &parent->data->ipdata.u.blockset.blockref[0];
1686 		count = HAMMER2_SET_COUNT;
1687 		break;
1688 	case HAMMER2_BREF_TYPE_INDIRECT:
1689 		if (parent->flags & HAMMER2_CHAIN_INITIAL) {
1690 			base = NULL;
1691 		} else {
1692 			KKASSERT(parent->data != NULL);
1693 			base = &parent->data->npdata.blockref[0];
1694 		}
1695 		count = parent->bytes / sizeof(hammer2_blockref_t);
1696 		break;
1697 	case HAMMER2_BREF_TYPE_VOLUME:
1698 		KKASSERT(parent->data != NULL);
1699 		base = &hmp->voldata.sroot_blockset.blockref[0];
1700 		count = HAMMER2_SET_COUNT;
1701 		break;
1702 	default:
1703 		panic("hammer2_chain_create: unrecognized blockref type: %d",
1704 		      parent->bref.type);
1705 		count = 0;
1706 		break;
1707 	}
1708 
1709 	/*
1710 	 * Scan for an unallocated bref, also skipping any slots occupied
1711 	 * by in-memory chain elements that may not yet have been updated
1712 	 * in the parent's bref array.
1713 	 */
1714 	bzero(&dummy_chain, sizeof(dummy_chain));
1715 	for (i = 0; i < count; ++i) {
1716 		if (base == NULL) {
1717 			dummy_chain.index = i;
1718 			if (RB_FIND(hammer2_chain_tree,
1719 				    &parent->rbhead, &dummy_chain) == NULL) {
1720 				break;
1721 			}
1722 		} else if (base[i].type == 0) {
1723 			dummy_chain.index = i;
1724 			if (RB_FIND(hammer2_chain_tree,
1725 				    &parent->rbhead, &dummy_chain) == NULL) {
1726 				break;
1727 			}
1728 		}
1729 	}
1730 
1731 	/*
1732 	 * If no free blockref could be found we must create an indirect
1733 	 * block and move a number of blockrefs into it.  With the parent
1734 	 * locked we can safely lock each child in order to move it without
1735 	 * causing a deadlock.
1736 	 *
1737 	 * This may return the new indirect block or the old parent depending
1738 	 * on where the key falls.
1739 	 */
1740 	if (i == count) {
1741 		hammer2_chain_t *nparent;
1742 
1743 		nparent = hammer2_chain_create_indirect(hmp, parent,
1744 							key, keybits);
1745 		if (nparent == NULL) {
1746 			if (allocated)
1747 				hammer2_chain_free(hmp, chain);
1748 			chain = NULL;
1749 			goto done;
1750 		}
1751 		if (parent != nparent) {
1752 			if (unlock_parent)
1753 				hammer2_chain_unlock(hmp, parent);
1754 			parent = nparent;
1755 			unlock_parent = 1;
1756 		}
1757 		goto again;
1758 	}
1759 
1760 	/*
1761 	 * Link the chain into its parent.  Later on we will have to set
1762 	 * the MOVED bit in situations where we don't mark the new chain
1763 	 * as being modified.
1764 	 */
1765 	if (chain->parent != NULL)
1766 		panic("hammer2: hammer2_chain_create: chain already connected");
1767 	KKASSERT(chain->parent == NULL);
1768 	chain->parent = parent;
1769 	chain->index = i;
1770 	if (RB_INSERT(hammer2_chain_tree, &parent->rbhead, chain))
1771 		panic("hammer2_chain_link: collision");
1772 	atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DELETED);
1773 	KKASSERT(parent->refs > 0);
1774 	atomic_add_int(&parent->refs, 1);
1775 
1776 	/*
1777 	 * Additional linkage for inodes.  Reuse the parent pointer to
1778 	 * find the parent directory.
1779 	 *
1780 	 * Cumulative adjustments are inherited on [re]attach and will
1781 	 * propagate up the tree on the next flush.
1782 	 *
1783 	 * The ccms_inode is initialized from its parent directory.  The
1784 	 * chain of ccms_inode's is seeded by the mount code.
1785 	 */
1786 	if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
1787 		hammer2_chain_t *scan = parent;
1788 		hammer2_inode_t *ip = chain->u.ip;
1789 
1790 		while (scan->bref.type == HAMMER2_BREF_TYPE_INDIRECT)
1791 			scan = scan->parent;
1792 		if (scan->bref.type == HAMMER2_BREF_TYPE_INODE) {
1793 			ip->pip = scan->u.ip;
1794 			ip->pmp = scan->u.ip->pmp;
1795 			ip->depth = scan->u.ip->depth + 1;
1796 			ip->pip->delta_icount += ip->ip_data.inode_count;
1797 			ip->pip->delta_dcount += ip->ip_data.data_count;
1798 			++ip->pip->delta_icount;
1799 			ccms_cst_init(&ip->topo_cst, &ip->chain);
1800 		}
1801 	}
1802 
1803 	/*
1804 	 * (allocated) indicates that this is a newly-created chain element
1805 	 * rather than a renamed chain element.  In this situation we want
1806 	 * to place the chain element in the MODIFIED state.
1807 	 *
1808 	 * The data area will be set up as follows:
1809 	 *
1810 	 *	VOLUME		not allowed here.
1811 	 *
1812 	 *	INODE		embedded data are will be set-up.
1813 	 *
1814 	 *	INDIRECT	not allowed here.
1815 	 *
1816 	 *	DATA		no data area will be set-up (caller is expected
1817 	 *			to have logical buffers, we don't want to alias
1818 	 *			the data onto device buffers!).
1819 	 */
1820 	if (allocated) {
1821 		if (chain->bref.type == HAMMER2_BREF_TYPE_DATA) {
1822 			hammer2_chain_modify(hmp, chain,
1823 					     HAMMER2_MODIFY_OPTDATA);
1824 		} else if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
1825 			/* not supported in this function */
1826 			panic("hammer2_chain_create: bad type");
1827 			atomic_set_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
1828 			hammer2_chain_modify(hmp, chain,
1829 					     HAMMER2_MODIFY_OPTDATA);
1830 		} else {
1831 			hammer2_chain_modify(hmp, chain, 0);
1832 		}
1833 	} else {
1834 		/*
1835 		 * When reconnecting inodes we have to call setsubmod()
1836 		 * to ensure that its state propagates up the newly
1837 		 * connected parent.
1838 		 *
1839 		 * Make sure MOVED is set but do not update bref_flush.  If
1840 		 * the chain is undergoing modification bref_flush will be
1841 		 * updated when it gets flushed.  If it is not then the
1842 		 * bref may not have been flushed yet and we do not want to
1843 		 * set MODIFIED here as this could result in unnecessary
1844 		 * reallocations.
1845 		 */
1846 		if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
1847 			hammer2_chain_ref(hmp, chain);
1848 			atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
1849 		}
1850 		hammer2_chain_parent_setsubmod(hmp, chain);
1851 	}
1852 
1853 done:
1854 	if (unlock_parent)
1855 		hammer2_chain_unlock(hmp, parent);
1856 	return (chain);
1857 }
1858 
1859 /*
1860  * Create an indirect block that covers one or more of the elements in the
1861  * current parent.  Either returns the existing parent with no locking or
1862  * ref changes or returns the new indirect block locked and referenced
1863  * and leaving the original parent lock/ref intact as well.
1864  *
1865  * The returned chain depends on where the specified key falls.
1866  *
 * The key/keybits for the indirect mode only needs to follow four rules:
1868  *
1869  * (1) That all elements underneath it fit within its key space and
1870  *
1871  * (2) That all elements outside it are outside its key space.
1872  *
1873  * (3) When creating the new indirect block any elements in the current
1874  *     parent that fit within the new indirect block's keyspace must be
1875  *     moved into the new indirect block.
1876  *
1877  * (4) The keyspace chosen for the inserted indirect block CAN cover a wider
 *     keyspace than the current parent, but lookup/iteration rules will
1879  *     ensure (and must ensure) that rule (2) for all parents leading up
1880  *     to the nearest inode or the root volume header is adhered to.  This
1881  *     is accomplished by always recursing through matching keyspaces in
1882  *     the hammer2_chain_lookup() and hammer2_chain_next() API.
1883  *
1884  * The current implementation calculates the current worst-case keyspace by
1885  * iterating the current parent and then divides it into two halves, choosing
1886  * whichever half has the most elements (not necessarily the half containing
1887  * the requested key).
1888  *
1889  * We can also opt to use the half with the least number of elements.  This
1890  * causes lower-numbered keys (aka logical file offsets) to recurse through
1891  * fewer indirect blocks and higher-numbered keys to recurse through more.
1892  * This also has the risk of not moving enough elements to the new indirect
1893  * block and being forced to create several indirect blocks before the element
1894  * can be inserted.
1895  *
1896  * Must be called with an exclusively locked parent
1897  */
1898 static
1899 hammer2_chain_t *
1900 hammer2_chain_create_indirect(hammer2_mount_t *hmp, hammer2_chain_t *parent,
1901 			      hammer2_key_t create_key, int create_bits)
1902 {
1903 	hammer2_blockref_t *base;
1904 	hammer2_blockref_t *bref;
1905 	hammer2_chain_t *chain;
1906 	hammer2_chain_t *ichain;
1907 	hammer2_chain_t dummy;
1908 	hammer2_key_t key = create_key;
1909 	int keybits = create_bits;
1910 	int locount = 0;
1911 	int hicount = 0;
1912 	int count;
1913 	int nbytes;
1914 	int i;
1915 
1916 	/*
1917 	 * Calculate the base blockref pointer or NULL if the chain
1918 	 * is known to be empty.  We need to calculate the array count
1919 	 * for RB lookups either way.
1920 	 */
1921 	KKASSERT(ccms_thread_lock_owned(&parent->cst));
1922 
1923 	hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_OPTDATA);
1924 	if (parent->flags & HAMMER2_CHAIN_INITIAL) {
1925 		base = NULL;
1926 
1927 		switch(parent->bref.type) {
1928 		case HAMMER2_BREF_TYPE_INODE:
1929 			count = HAMMER2_SET_COUNT;
1930 			break;
1931 		case HAMMER2_BREF_TYPE_INDIRECT:
1932 			count = parent->bytes / sizeof(hammer2_blockref_t);
1933 			break;
1934 		case HAMMER2_BREF_TYPE_VOLUME:
1935 			count = HAMMER2_SET_COUNT;
1936 			break;
1937 		default:
1938 			panic("hammer2_chain_create_indirect: "
1939 			      "unrecognized blockref type: %d",
1940 			      parent->bref.type);
1941 			count = 0;
1942 			break;
1943 		}
1944 	} else {
1945 		switch(parent->bref.type) {
1946 		case HAMMER2_BREF_TYPE_INODE:
1947 			base = &parent->data->ipdata.u.blockset.blockref[0];
1948 			count = HAMMER2_SET_COUNT;
1949 			break;
1950 		case HAMMER2_BREF_TYPE_INDIRECT:
1951 			base = &parent->data->npdata.blockref[0];
1952 			count = parent->bytes / sizeof(hammer2_blockref_t);
1953 			break;
1954 		case HAMMER2_BREF_TYPE_VOLUME:
1955 			base = &hmp->voldata.sroot_blockset.blockref[0];
1956 			count = HAMMER2_SET_COUNT;
1957 			break;
1958 		default:
1959 			panic("hammer2_chain_create_indirect: "
1960 			      "unrecognized blockref type: %d",
1961 			      parent->bref.type);
1962 			count = 0;
1963 			break;
1964 		}
1965 	}
1966 
1967 	/*
1968 	 * Scan for an unallocated bref, also skipping any slots occupied
1969 	 * by in-memory chain elements which may not yet have been updated
1970 	 * in the parent's bref array.
1971 	 */
1972 	bzero(&dummy, sizeof(dummy));
1973 	for (i = 0; i < count; ++i) {
1974 		int nkeybits;
1975 
1976 		dummy.index = i;
1977 		chain = RB_FIND(hammer2_chain_tree, &parent->rbhead, &dummy);
1978 		if (chain) {
1979 			bref = &chain->bref;
1980 		} else if (base && base[i].type) {
1981 			bref = &base[i];
1982 		} else {
1983 			continue;
1984 		}
1985 
1986 		/*
1987 		 * Expand our calculated key range (key, keybits) to fit
1988 		 * the scanned key.  nkeybits represents the full range
1989 		 * that we will later cut in half (two halves @ nkeybits - 1).
1990 		 */
1991 		nkeybits = keybits;
1992 		if (nkeybits < bref->keybits)
1993 			nkeybits = bref->keybits;
1994 		while (nkeybits < 64 &&
1995 		       (~(((hammer2_key_t)1 << nkeybits) - 1) &
1996 		        (key ^ bref->key)) != 0) {
1997 			++nkeybits;
1998 		}
1999 
2000 		/*
2001 		 * If the new key range is larger we have to determine
2002 		 * which side of the new key range the existing keys fall
2003 		 * under by checking the high bit, then collapsing the
2004 		 * locount into the hicount or vise-versa.
2005 		 */
2006 		if (keybits != nkeybits) {
2007 			if (((hammer2_key_t)1 << (nkeybits - 1)) & key) {
2008 				hicount += locount;
2009 				locount = 0;
2010 			} else {
2011 				locount += hicount;
2012 				hicount = 0;
2013 			}
2014 			keybits = nkeybits;
2015 		}
2016 
2017 		/*
2018 		 * The newly scanned key will be in the lower half or the
2019 		 * higher half of the (new) key range.
2020 		 */
2021 		if (((hammer2_key_t)1 << (nkeybits - 1)) & bref->key)
2022 			++hicount;
2023 		else
2024 			++locount;
2025 	}
2026 
2027 	/*
2028 	 * Adjust keybits to represent half of the full range calculated
2029 	 * above (radix 63 max)
2030 	 */
2031 	--keybits;
2032 
2033 	/*
2034 	 * Select whichever half contains the most elements.  Theoretically
2035 	 * we can select either side as long as it contains at least one
2036 	 * element (in order to ensure that a free slot is present to hold
2037 	 * the indirect block).
2038 	 */
2039 	key &= ~(((hammer2_key_t)1 << keybits) - 1);
2040 	if (hammer2_indirect_optimize) {
2041 		/*
2042 		 * Insert node for least number of keys, this will arrange
2043 		 * the first few blocks of a large file or the first few
2044 		 * inodes in a directory with fewer indirect blocks when
2045 		 * created linearly.
2046 		 */
2047 		if (hicount < locount && hicount != 0)
2048 			key |= (hammer2_key_t)1 << keybits;
2049 		else
2050 			key &= ~(hammer2_key_t)1 << keybits;
2051 	} else {
2052 		/*
2053 		 * Insert node for most number of keys, best for heavily
2054 		 * fragmented files.
2055 		 */
2056 		if (hicount > locount)
2057 			key |= (hammer2_key_t)1 << keybits;
2058 		else
2059 			key &= ~(hammer2_key_t)1 << keybits;
2060 	}
2061 
2062 	/*
2063 	 * How big should our new indirect block be?  It has to be at least
2064 	 * as large as its parent.
2065 	 */
2066 	if (parent->bref.type == HAMMER2_BREF_TYPE_INODE)
2067 		nbytes = HAMMER2_IND_BYTES_MIN;
2068 	else
2069 		nbytes = HAMMER2_IND_BYTES_MAX;
2070 	if (nbytes < count * sizeof(hammer2_blockref_t))
2071 		nbytes = count * sizeof(hammer2_blockref_t);
2072 
2073 	/*
2074 	 * Ok, create our new indirect block
2075 	 */
2076 	dummy.bref.type = HAMMER2_BREF_TYPE_INDIRECT;
2077 	dummy.bref.key = key;
2078 	dummy.bref.keybits = keybits;
2079 	dummy.bref.data_off = hammer2_bytes_to_radix(nbytes);
2080 	ichain = hammer2_chain_alloc(hmp, &dummy.bref);
2081 	atomic_set_int(&ichain->flags, HAMMER2_CHAIN_INITIAL);
2082 
2083 	/*
2084 	 * Iterate the original parent and move the matching brefs into
2085 	 * the new indirect block.
2086 	 */
2087 	for (i = 0; i < count; ++i) {
2088 		/*
2089 		 * For keying purposes access the bref from the media or
2090 		 * from our in-memory cache.  In cases where the in-memory
2091 		 * cache overrides the media the keyrefs will be the same
2092 		 * anyway so we can avoid checking the cache when the media
2093 		 * has a key.
2094 		 */
2095 		dummy.index = i;
2096 		chain = RB_FIND(hammer2_chain_tree, &parent->rbhead, &dummy);
2097 		if (chain) {
2098 			bref = &chain->bref;
2099 		} else if (base && base[i].type) {
2100 			bref = &base[i];
2101 		} else {
2102 			if (ichain->index < 0)
2103 				ichain->index = i;
2104 			continue;
2105 		}
2106 
2107 		/*
2108 		 * Skip keys not in the chosen half (low or high), only bit
2109 		 * (keybits - 1) needs to be compared but for safety we
2110 		 * will compare all msb bits plus that bit again.
2111 		 */
2112 		if ((~(((hammer2_key_t)1 << keybits) - 1) &
2113 		    (key ^ bref->key)) != 0) {
2114 			continue;
2115 		}
2116 
2117 		/*
2118 		 * This element is being moved from the parent, its slot
2119 		 * is available for our new indirect block.
2120 		 */
2121 		if (ichain->index < 0)
2122 			ichain->index = i;
2123 
2124 		/*
2125 		 * Load the new indirect block by acquiring or allocating
2126 		 * the related chain entries, then simply move them to the
2127 		 * new parent (ichain).
2128 		 *
2129 		 * When adjusting the parent/child relationship we must
2130 		 * set the MOVED bit but we do NOT update bref_flush
2131 		 * because otherwise we might synchronize a bref that has
2132 		 * not yet been flushed.  We depend on chain's bref_flush
2133 		 * either being correct or the chain being in a MODIFIED
2134 		 * state.
2135 		 *
2136 		 * We do not want to set MODIFIED here as this would result
2137 		 * in unnecessary reallocations.
2138 		 *
2139 		 * We must still set SUBMODIFIED in the parent but we do
2140 		 * that after the loop.
2141 		 *
2142 		 * XXX we really need a lock here but we don't need the
2143 		 *     data.  NODATA feature needed.
2144 		 */
2145 		chain = hammer2_chain_get(hmp, parent, i,
2146 					  HAMMER2_LOOKUP_NODATA);
2147 		RB_REMOVE(hammer2_chain_tree, &parent->rbhead, chain);
2148 		if (RB_INSERT(hammer2_chain_tree, &ichain->rbhead, chain))
2149 			panic("hammer2_chain_create_indirect: collision");
2150 		chain->parent = ichain;
2151 		if (base)
2152 			bzero(&base[i], sizeof(base[i]));
2153 		atomic_add_int(&parent->refs, -1);
2154 		atomic_add_int(&ichain->refs, 1);
2155 		if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
2156 			hammer2_chain_ref(hmp, chain);
2157 			atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
2158 		}
2159 		hammer2_chain_unlock(hmp, chain);
2160 		KKASSERT(parent->refs > 0);
2161 		chain = NULL;
2162 	}
2163 
2164 	/*
2165 	 * Insert the new indirect block into the parent now that we've
2166 	 * cleared out some entries in the parent.  We calculated a good
2167 	 * insertion index in the loop above (ichain->index).
2168 	 *
2169 	 * We don't have to set MOVED here because we mark ichain modified
2170 	 * down below (so the normal modified -> flush -> set-moved sequence
2171 	 * applies).
2172 	 */
2173 	KKASSERT(ichain->index >= 0);
2174 	if (RB_INSERT(hammer2_chain_tree, &parent->rbhead, ichain))
2175 		panic("hammer2_chain_create_indirect: ichain insertion");
2176 	ichain->parent = parent;
2177 	atomic_add_int(&parent->refs, 1);
2178 
2179 	/*
2180 	 * Mark the new indirect block modified after insertion, which
2181 	 * will propagate up through parent all the way to the root and
2182 	 * also allocate the physical block in ichain for our caller,
2183 	 * and assign ichain->data to a pre-zero'd space (because there
2184 	 * is not prior data to copy into it).
2185 	 *
2186 	 * We have to set SUBMODIFIED in ichain's flags manually so the
2187 	 * flusher knows it has to recurse through it to get to all of
2188 	 * our moved blocks, then call setsubmod() to set the bit
2189 	 * recursively.
2190 	 */
2191 	hammer2_chain_modify(hmp, ichain, HAMMER2_MODIFY_OPTDATA);
2192 	hammer2_chain_parent_setsubmod(hmp, ichain);
2193 	atomic_set_int(&ichain->flags, HAMMER2_CHAIN_SUBMODIFIED);
2194 
2195 	/*
2196 	 * Figure out what to return.
2197 	 */
2198 	if (create_bits > keybits) {
2199 		/*
2200 		 * Key being created is way outside the key range,
2201 		 * return the original parent.
2202 		 */
2203 		hammer2_chain_unlock(hmp, ichain);
2204 	} else if (~(((hammer2_key_t)1 << keybits) - 1) &
2205 		   (create_key ^ key)) {
2206 		/*
2207 		 * Key being created is outside the key range,
2208 		 * return the original parent.
2209 		 */
2210 		hammer2_chain_unlock(hmp, ichain);
2211 	} else {
2212 		/*
2213 		 * Otherwise its in the range, return the new parent.
2214 		 * (leave both the new and old parent locked).
2215 		 */
2216 		parent = ichain;
2217 	}
2218 
2219 	return(parent);
2220 }
2221 
2222 /*
2223  * Physically delete the specified chain element.  Note that inodes with
2224  * open descriptors should not be deleted (as with other filesystems) until
2225  * the last open descriptor is closed.
2226  *
2227  * This routine will remove the chain element from its parent and potentially
2228  * also recurse upward and delete indirect blocks which become empty as a
2229  * side effect.
2230  *
2231  * The caller must pass a pointer to the chain's parent, also locked and
2232  * referenced.  (*parentp) will be modified in a manner similar to a lookup
2233  * or iteration when indirect blocks are also deleted as a side effect.
2234  *
2235  * XXX This currently does not adhere to the MOVED flag protocol in that
2236  *     the removal is immediately indicated in the parent's blockref[]
2237  *     array.
2238  *
2239  * Must be called with an exclusively locked parent.
2240  */
2241 void
2242 hammer2_chain_delete(hammer2_mount_t *hmp, hammer2_chain_t *parent,
2243 		     hammer2_chain_t *chain, int retain)
2244 {
2245 	hammer2_blockref_t *base;
2246 	hammer2_inode_t *ip;
2247 	int count;
2248 
2249 	if (chain->parent != parent)
2250 		panic("hammer2_chain_delete: parent mismatch");
2251 	KKASSERT(ccms_thread_lock_owned(&parent->cst));
2252 
2253 	/*
2254 	 * Mark the parent modified so our base[] pointer remains valid
2255 	 * while we move entries.  For the optimized indirect block
2256 	 * case mark the parent moved instead.
2257 	 *
2258 	 * Calculate the blockref reference in the parent
2259 	 */
2260 	switch(parent->bref.type) {
2261 	case HAMMER2_BREF_TYPE_INODE:
2262 		hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_NO_MODIFY_TID);
2263 		base = &parent->data->ipdata.u.blockset.blockref[0];
2264 		count = HAMMER2_SET_COUNT;
2265 		break;
2266 	case HAMMER2_BREF_TYPE_INDIRECT:
2267 		hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_OPTDATA |
2268 						  HAMMER2_MODIFY_NO_MODIFY_TID);
		/*
		 * An INITIAL indirect block has no instantiated media
		 * data; there is no bref array to clear in that case.
		 */
2269 		if (parent->flags & HAMMER2_CHAIN_INITIAL)
2270 			base = NULL;
2271 		else
2272 			base = &parent->data->npdata.blockref[0];
2273 		count = parent->bytes / sizeof(hammer2_blockref_t);
2274 		break;
2275 	case HAMMER2_BREF_TYPE_VOLUME:
2276 		hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_NO_MODIFY_TID);
2277 		base = &hmp->voldata.sroot_blockset.blockref[0];
2278 		count = HAMMER2_SET_COUNT;
2279 		break;
2280 	default:
2281 		panic("hammer2_chain_delete: unrecognized blockref type: %d",
2282 		      parent->bref.type);
2283 		count = 0;
2284 		break;
2285 	}
2286 
2287 	/*
2288 	 * Disconnect the bref in the parent, remove the chain, and
2289 	 * disconnect in-memory fields from the parent.
2290 	 */
2291 	KKASSERT(chain->index >= 0 && chain->index < count);
2292 	if (base)
2293 		bzero(&base[chain->index], sizeof(*base));
2294 
2295 	RB_REMOVE(hammer2_chain_tree, &parent->rbhead, chain);
2296 	atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
2297 	atomic_add_int(&parent->refs, -1);	/* for red-black entry */
2298 	chain->index = -1;
2299 	chain->parent = NULL;
2300 
2301 	/*
2302 	 * Cumulative adjustments must be propagated to the parent inode
2303 	 * when deleting and synchronized to ip.
2304 	 *
2305 	 * NOTE:  We do not propagate ip->delta_*count to the parent because
2306 	 *	  these represent adjustments that have not yet been
2307 	 *	  propagated upward, so we don't need to remove them from
2308 	 *	  the parent.
2309 	 *
2310 	 * Clear the pointer to the parent inode.
2311 	 */
2312 	if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
2313 		ip = chain->u.ip;
2314 		if (ip->pip) {
			/*
			 * Back the deleted inode's synchronized counts out
			 * of the parent, then fold any pending deltas into
			 * the inode itself before disconnecting it.
			 */
2315 			ip->pip->delta_icount -= ip->ip_data.inode_count;
2316 			ip->pip->delta_dcount -= ip->ip_data.data_count;
2317 			ip->ip_data.inode_count += ip->delta_icount;
2318 			ip->ip_data.data_count += ip->delta_dcount;
2319 			ip->delta_icount = 0;
2320 			ip->delta_dcount = 0;
			/*
			 * NOTE(review): the extra decrement appears to
			 * account for the removal of this inode itself
			 * from pip's inode count -- confirm.
			 */
2321 			--ip->pip->delta_icount;
2322 			ip->pip = NULL;
2323 		}
2324 		chain->u.ip->depth = 0;
2325 	}
2326 
2327 	/*
2328 	 * If retain is 0 the deletion is permanent.  Because the chain is
2329 	 * no longer connected to the topology a flush will have no
2330 	 * visibility into it.  We must dispose of the references related
2331 	 * to the MODIFIED and MOVED flags, otherwise the ref count will
2332 	 * never transition to 0.
2333 	 *
2334 	 * If retain is non-zero the deleted element is likely an inode
2335 	 * which the vnops frontend will mark DESTROYED and flush.  In that
2336 	 * situation we must retain the flags for any open file descriptors
2337 	 * on the (removed) inode.  The final close will destroy the
2338 	 * disconnected chain.
2339 	 */
2340 	if (retain == 0) {
2341 		if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
2342 			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
2343 			hammer2_chain_drop(hmp, chain);
2344 		}
2345 		if (chain->flags & HAMMER2_CHAIN_MOVED) {
2346 			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MOVED);
2347 			hammer2_chain_drop(hmp, chain);
2348 		}
2349 	}
2350 
2351 	/*
2352 	 * The chain is still likely referenced, possibly even by a vnode
2353 	 * (if an inode), so defer further action until the chain gets
2354 	 * dropped.
2355 	 */
2356 }
2357 
2358 /*
2359  * Recursively flush the specified chain.  The chain is locked and
2360  * referenced by the caller and will remain so on return.  The chain
2361  * will remain referenced throughout but can temporarily lose its
2362  * lock during the recursion to avoid unnecessarily stalling user
2363  * processes.
2364  */
2365 struct hammer2_flush_info {
2366 	struct flush_deferral_list flush_list;	/* chains deferred at depth limit */
2367 	int		depth;			/* current recursion depth */
2368 	hammer2_tid_t	modify_tid;		/* tid applied to flushed brefs */
2369 };
2370 
2371 typedef struct hammer2_flush_info hammer2_flush_info_t;
2372 
2373 static void
2374 hammer2_chain_flush_pass1(hammer2_mount_t *hmp, hammer2_chain_t *chain,
2375 			  hammer2_flush_info_t *info)
2376 {
2377 	hammer2_blockref_t *bref;
2378 	hammer2_off_t pbase;
2379 	size_t bbytes;
2380 	size_t boff;
2381 	char *bdata;
2382 	struct buf *bp;
2383 	int error;
2384 	int wasmodified;
2385 
2386 	/*
2387 	 * If we hit the stack recursion depth limit defer the operation.
2388 	 * The controller of the info structure will execute the deferral
2389 	 * list and then retry.
2390 	 *
2391 	 * This is only applicable if SUBMODIFIED is set.  After a reflush
2392 	 * SUBMODIFIED will probably be cleared and we want to drop through
2393 	 * to finish processing the current element so our direct parent
2394 	 * can process the results.
2395 	 */
2396 	if (info->depth == HAMMER2_FLUSH_DEPTH_LIMIT &&
2397 	    (chain->flags & HAMMER2_CHAIN_SUBMODIFIED)) {
2398 		if ((chain->flags & HAMMER2_CHAIN_DEFERRED) == 0) {
			/*
			 * The ref taken here is owned by the chain's
			 * presence on the deferral list.
			 */
2399 			hammer2_chain_ref(hmp, chain);
2400 			TAILQ_INSERT_TAIL(&info->flush_list,
2401 					  chain, flush_node);
2402 			atomic_set_int(&chain->flags, HAMMER2_CHAIN_DEFERRED);
2403 		}
2404 		return;
2405 	}
2406 
2407 	if (hammer2_debug & 0x0008)
2408 		kprintf("%*.*sCHAIN type=%d@%08jx %p/%d %04x {\n",
2409 			info->depth, info->depth, "",
2410 			chain->bref.type, chain->bref.data_off,
2411 			chain, chain->refs, chain->flags);
2412 
2413 	/*
2414 	 * If SUBMODIFIED is set we recurse the flush and adjust the
2415 	 * blockrefs accordingly.
2416 	 *
2417 	 * NOTE: Looping on SUBMODIFIED can prevent a flush from ever
2418 	 *	 finishing in the face of filesystem activity.
2419 	 */
2420 	if (chain->flags & HAMMER2_CHAIN_SUBMODIFIED) {
2421 		hammer2_chain_t *child;
2422 		hammer2_chain_t *next;
2423 		hammer2_blockref_t *base;
2424 		int count;
2425 
2426 		/*
2427 		 * Clear SUBMODIFIED to catch races.  Note that if any
2428 		 * child has to be flushed SUBMODIFIED will wind up being
2429 		 * set again (for next time), but this does not stop us from
2430 		 * synchronizing block updates which occurred.
2431 		 *
2432 		 * We don't want to set our chain to MODIFIED gratuitously.
2433 		 */
2434 		/* XXX SUBMODIFIED not interlocked, can race */
2435 		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_SUBMODIFIED);
2436 
2437 		/*
2438 		 * Flush the children and update the blockrefs in the chain.
2439 		 * Be careful of ripouts during the loop.
2440 		 */
		/*
		 * The ref-next-before-visiting-current pattern keeps the
		 * iteration stable if the current child is ripped out
		 * while we recurse.
		 */
2441 		next = RB_MIN(hammer2_chain_tree, &chain->rbhead);
2442 		if (next)
2443 			hammer2_chain_ref(hmp, next);
2444 		while ((child = next) != NULL) {
2445 			next = RB_NEXT(hammer2_chain_tree,
2446 				       &chain->rbhead, child);
2447 			if (next)
2448 				hammer2_chain_ref(hmp, next);
2449 			/*
2450 			 * We only recurse if SUBMODIFIED (internal node)
2451 			 * or MODIFIED (internal node or leaf) is set.
2452 			 * However, we must still track whether any MOVED
2453 			 * entries are present to determine if the chain's
2454 			 * blockref's need updating or not.
2455 			 */
2456 			if ((child->flags & (HAMMER2_CHAIN_SUBMODIFIED |
2457 					     HAMMER2_CHAIN_MODIFIED |
2458 					    HAMMER2_CHAIN_MODIFIED_AUX)) == 0) {
2459 				hammer2_chain_drop(hmp, child);
2460 				continue;
2461 			}
			/*
			 * Re-test the flags and parent linkage after
			 * acquiring the lock; the child may have been
			 * flushed or reparented while we blocked.
			 */
2462 			hammer2_chain_lock(hmp, child, HAMMER2_RESOLVE_MAYBE);
2463 			hammer2_chain_drop(hmp, child);
2464 			if (child->parent != chain ||
2465 			    (child->flags & (HAMMER2_CHAIN_SUBMODIFIED |
2466 					     HAMMER2_CHAIN_MODIFIED |
2467 					    HAMMER2_CHAIN_MODIFIED_AUX)) == 0) {
2468 				hammer2_chain_unlock(hmp, child);
2469 				continue;
2470 			}
2471 
2472 			/*
2473 			 * Propagate the DESTROYED flag if found set, then
2474 			 * recurse the flush.
2475 			 */
2476 			if ((chain->flags & HAMMER2_CHAIN_DESTROYED) &&
2477 			    (child->flags & HAMMER2_CHAIN_DESTROYED) == 0) {
2478 				atomic_set_int(&child->flags,
2479 					       HAMMER2_CHAIN_DESTROYED |
2480 					       HAMMER2_CHAIN_SUBMODIFIED);
2481 			}
2482 			++info->depth;
2483 			hammer2_chain_flush_pass1(hmp, child, info);
2484 			--info->depth;
2485 			hammer2_chain_unlock(hmp, child);
2486 		}
2487 
2488 		/*
2489 		 * Now synchronize any block updates.
2490 		 */
2491 		next = RB_MIN(hammer2_chain_tree, &chain->rbhead);
2492 		if (next)
2493 			hammer2_chain_ref(hmp, next);
2494 		while ((child = next) != NULL) {
2495 			next = RB_NEXT(hammer2_chain_tree,
2496 				       &chain->rbhead, child);
2497 			if (next)
2498 				hammer2_chain_ref(hmp, next);
2499 			if ((child->flags & HAMMER2_CHAIN_MOVED) == 0) {
2500 				hammer2_chain_drop(hmp, child);
2501 				continue;
2502 			}
2503 			hammer2_chain_lock(hmp, child, HAMMER2_RESOLVE_NEVER);
2504 			hammer2_chain_drop(hmp, child);
2505 			if (child->parent != chain ||
2506 			    (child->flags & HAMMER2_CHAIN_MOVED) == 0) {
2507 				hammer2_chain_unlock(hmp, child);
2508 				continue;
2509 			}
2510 
			/*
			 * We must modify the parent to update its bref
			 * array slot for this child.
			 */
2511 			hammer2_chain_modify(hmp, chain,
2512 					     HAMMER2_MODIFY_NO_MODIFY_TID);
2513 
2514 			switch(chain->bref.type) {
2515 			case HAMMER2_BREF_TYPE_INODE:
2516 				KKASSERT((chain->data->ipdata.op_flags &
2517 					  HAMMER2_OPFLAG_DIRECTDATA) == 0);
2518 				base = &chain->data->ipdata.u.blockset.
2519 					blockref[0];
2520 				count = HAMMER2_SET_COUNT;
2521 				break;
2522 			case HAMMER2_BREF_TYPE_INDIRECT:
2523 				base = &chain->data->npdata.blockref[0];
2524 				count = chain->bytes /
2525 					sizeof(hammer2_blockref_t);
2526 				break;
2527 			case HAMMER2_BREF_TYPE_VOLUME:
2528 				base = &hmp->voldata.sroot_blockset.blockref[0];
2529 				count = HAMMER2_SET_COUNT;
2530 				break;
2531 			default:
2532 				base = NULL;
2533 				panic("hammer2_chain_flush_pass1: "
2534 				      "unrecognized blockref type: %d",
2535 				      chain->bref.type);
2536 			}
2537 
2538 			KKASSERT(child->index >= 0);
2539 			base[child->index] = child->bref_flush;
2540 
			/*
			 * Propagate the highest child mirror_tid into the
			 * chain (and into the volume header when the chain
			 * is the volume root).
			 */
2541 			if (chain->bref.mirror_tid <
2542 			    child->bref_flush.mirror_tid) {
2543 				chain->bref.mirror_tid =
2544 					child->bref_flush.mirror_tid;
2545 			}
2546 
2547 			if (chain->bref.type == HAMMER2_BREF_TYPE_VOLUME &&
2548 			    hmp->voldata.mirror_tid <
2549 			    child->bref_flush.mirror_tid) {
2550 				hmp->voldata.mirror_tid =
2551 					child->bref_flush.mirror_tid;
2552 			}
2553 			atomic_clear_int(&child->flags, HAMMER2_CHAIN_MOVED);
2554 			hammer2_chain_drop(hmp, child); /* MOVED flag */
2555 			hammer2_chain_unlock(hmp, child);
2556 		}
2557 	}
2558 
2559 	/*
2560 	 * If destroying the object we unconditionally clear the MODIFIED
2561 	 * and MOVED bits, and we destroy the buffer without writing it
2562 	 * out.
2563 	 *
2564 	 * We don't bother updating the hash/crc or the chain bref.
2565 	 *
2566 	 * NOTE: The destroy'd object's bref has already been updated.
2567 	 *	 so we can clear MOVED without propagating mirror_tid
2568 	 *	 or modify_tid upward.
2569 	 *
2570 	 * XXX allocations for unflushed data can be returned to the
2571 	 *     free pool.
2572 	 */
2573 	if (chain->flags & HAMMER2_CHAIN_DESTROYED) {
2574 		if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
2575 			if (chain->bp) {
2576 				chain->bp->b_flags |= B_INVAL|B_RELBUF;
2577 			}
2578 			atomic_clear_int(&chain->flags,
2579 					 HAMMER2_CHAIN_MODIFIED |
2580 					 HAMMER2_CHAIN_MODIFY_TID);
2581 			hammer2_chain_drop(hmp, chain);
2582 		}
2583 		if (chain->flags & HAMMER2_CHAIN_MODIFIED_AUX) {
2584 			atomic_clear_int(&chain->flags,
2585 					 HAMMER2_CHAIN_MODIFIED_AUX);
2586 		}
2587 		if (chain->flags & HAMMER2_CHAIN_MOVED) {
2588 			atomic_clear_int(&chain->flags,
2589 					 HAMMER2_CHAIN_MOVED);
2590 			hammer2_chain_drop(hmp, chain);
2591 		}
2592 		return;
2593 	}
2594 
2595 	/*
2596 	 * Flush this chain entry only if it is marked modified.
2597 	 */
2598 	if ((chain->flags & (HAMMER2_CHAIN_MODIFIED |
2599 			     HAMMER2_CHAIN_MODIFIED_AUX)) == 0) {
2600 		goto done;
2601 	}
2602 
2603 	/*
2604 	 * Synchronize cumulative data and inode count adjustments to
2605 	 * the inode and propagate the deltas upward to the parent.
2606 	 */
2607 	if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
2608 		hammer2_inode_t *ip;
2609 
2610 		ip = chain->u.ip;
2611 		ip->ip_data.inode_count += ip->delta_icount;
2612 		ip->ip_data.data_count += ip->delta_dcount;
2613 		if (ip->pip) {
2614 			ip->pip->delta_icount += ip->delta_icount;
2615 			ip->pip->delta_dcount += ip->delta_dcount;
2616 		}
2617 		ip->delta_icount = 0;
2618 		ip->delta_dcount = 0;
2619 	}
2620 
2621 	/*
2622 	 * Flush if MODIFIED or MODIFIED_AUX is set.  MODIFIED_AUX is only
2623 	 * used by the volume header (&hmp->vchain).
2624 	 */
2625 	if ((chain->flags & (HAMMER2_CHAIN_MODIFIED |
2626 			     HAMMER2_CHAIN_MODIFIED_AUX)) == 0) {
2627 		goto done;
2628 	}
2629 	atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED_AUX);
2630 
2631 	/*
2632 	 * Clear MODIFIED and set HAMMER2_CHAIN_MOVED.  The caller
2633 	 * will re-test the MOVED bit.  We must also update the mirror_tid
2634 	 * and modify_tid fields as appropriate.
2635 	 *
2636 	 * The MODIFIED bit owns a single chain ref and the MOVED bit owns
2637 	 * its own chain ref.
2638 	 */
2639 	chain->bref.mirror_tid = info->modify_tid;
2640 	if (chain->flags & HAMMER2_CHAIN_MODIFY_TID)
2641 		chain->bref.modify_tid = info->modify_tid;
2642 	wasmodified = (chain->flags & HAMMER2_CHAIN_MODIFIED) != 0;
2643 	atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED |
2644 					HAMMER2_CHAIN_MODIFY_TID);
2645 
2646 	if (chain->flags & HAMMER2_CHAIN_MOVED) {
2647 		/*
2648 		 * Drop the ref from the MODIFIED bit we cleared.
2649 		 */
2650 		if (wasmodified)
2651 			hammer2_chain_drop(hmp, chain);
2652 	} else {
2653 		/*
2654 		 * If we were MODIFIED we inherit the ref from clearing
2655 		 * that bit, otherwise we need another ref.
2656 		 */
2657 		if (wasmodified == 0)
2658 			hammer2_chain_ref(hmp, chain);
2659 		atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
2660 	}
	/*
	 * Snapshot the bref for the parent's later block synchronization
	 * (see the MOVED loop above).
	 */
2661 	chain->bref_flush = chain->bref;
2662 
2663 	/*
2664 	 * If this is part of a recursive flush we can go ahead and write
2665 	 * out the buffer cache buffer and pass a new bref back up the chain.
2666 	 *
2667 	 * This will never be a volume header.
2668 	 */
2669 	switch(chain->bref.type) {
2670 	case HAMMER2_BREF_TYPE_VOLUME:
2671 		/*
2672 		 * The volume header is flushed manually by the syncer, not
2673 		 * here.
2674 		 */
2675 		break;
2676 	case HAMMER2_BREF_TYPE_DATA:
2677 		/*
2678 		 * Data elements have already been flushed via the logical
2679 		 * file buffer cache.  Their hash was set in the bref by
2680 		 * the vop_write code.
2681 		 *
2682 		 * Make sure the buffer(s) have been flushed out here.
2683 		 */
2684 		bbytes = chain->bytes;
2685 		pbase = chain->bref.data_off & ~(hammer2_off_t)(bbytes - 1);
2686 		boff = chain->bref.data_off & HAMMER2_OFF_MASK & (bbytes - 1);
2687 
2688 		bp = getblk(hmp->devvp, pbase, bbytes, GETBLK_NOWAIT, 0);
2689 		if (bp) {
2690 			if ((bp->b_flags & (B_CACHE | B_DIRTY)) ==
2691 			    (B_CACHE | B_DIRTY)) {
2692 				kprintf("x");
2693 				cluster_awrite(bp);
2694 			} else {
2695 				bp->b_flags |= B_RELBUF;
2696 				brelse(bp);
2697 			}
2698 		}
2699 		break;
2700 	case HAMMER2_BREF_TYPE_INDIRECT:
2701 		/*
2702 		 * Indirect blocks may be in an INITIAL state.  Use the
2703 		 * chain_lock() call to ensure that the buffer has been
2704 		 * instantiated (even though it is already locked the buffer
2705 		 * might not have been instantiated).
2706 		 *
2707 		 * Only write the buffer out if it is dirty, it is possible
2708 		 * the operating system had already written out the buffer.
2709 		 */
2710 		hammer2_chain_lock(hmp, chain, HAMMER2_RESOLVE_ALWAYS);
2711 		KKASSERT(chain->bp != NULL);
2712 
2713 		bp = chain->bp;
2714 		if ((chain->flags & HAMMER2_CHAIN_DIRTYBP) ||
2715 		    (bp->b_flags & B_DIRTY)) {
2716 			bawrite(chain->bp);
2717 		} else {
2718 			brelse(chain->bp);
2719 		}
2720 		chain->bp = NULL;
2721 		chain->data = NULL;
2722 		hammer2_chain_unlock(hmp, chain);
2723 		break;
2724 	default:
2725 		/*
2726 		 * Embedded elements have to be flushed out.
2727 		 */
2728 		KKASSERT(chain->data != NULL);
2729 		KKASSERT(chain->bp == NULL);
2730 		bref = &chain->bref;
2731 
2732 		KKASSERT((bref->data_off & HAMMER2_OFF_MASK) != 0);
2733 
		/*
		 * NOTE(review): chain->bp was just asserted NULL above, so
		 * the else branch below appears unreachable -- confirm
		 * before removing it.
		 */
2734 		if (chain->bp == NULL) {
2735 			/*
2736 			 * The data is embedded, we have to acquire the
2737 			 * buffer cache buffer and copy the data into it.
2738 			 */
2739 			if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
2740 				bbytes = HAMMER2_MINIOSIZE;
2741 			pbase = bref->data_off & ~(hammer2_off_t)(bbytes - 1);
2742 			boff = bref->data_off & HAMMER2_OFF_MASK & (bbytes - 1);
2743 
2744 			/*
2745 			 * The getblk() optimization can only be used if the
2746 			 * physical block size matches the request.
2747 			 */
2748 			if (chain->bytes == bbytes) {
2749 				bp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
2750 				error = 0;
2751 			} else {
2752 				error = bread(hmp->devvp, pbase, bbytes, &bp);
2753 				KKASSERT(error == 0);
2754 			}
2755 			bdata = (char *)bp->b_data + boff;
2756 
2757 			/*
2758 			 * Copy the data to the buffer, mark the buffer
2759 			 * dirty, and convert the chain to unmodified.
2760 			 *
2761 			 * We expect we might have to make adjustments to
2762 			 * non-data delayed-write buffers when doing an
2763 			 * actual flush so use bawrite() instead of
2764 			 * cluster_awrite() here.
2765 			 */
2766 			bcopy(chain->data, bdata, chain->bytes);
2767 			bp->b_flags |= B_CLUSTEROK;
2768 			bawrite(bp);
2769 			bp = NULL;
2770 			chain->bref.check.iscsi32.value =
2771 				hammer2_icrc32(chain->data, chain->bytes);
2772 			if (chain->bref.type == HAMMER2_BREF_TYPE_INODE)
2773 				++hammer2_iod_meta_write;
2774 			else
2775 				++hammer2_iod_indr_write;
2776 		} else {
2777 			chain->bref.check.iscsi32.value =
2778 				hammer2_icrc32(chain->data, chain->bytes);
2779 		}
2780 	}
2781 
2782 	/*
2783 	 * Adjustments to the bref.  The caller will use this to adjust
2784 	 * our chain's pointer to this chain element.
2785 	 */
2786 	bref = &chain->bref;
2787 
2788 	switch(bref->type) {
2789 	case HAMMER2_BREF_TYPE_VOLUME:
2790 		KKASSERT(chain->data != NULL);
2791 		KKASSERT(chain->bp == NULL);
2792 
		/*
		 * Recompute the volume header check codes.  SECT1 covers
		 * its region, SECT0 covers its region, and the whole-header
		 * crc is computed last so it covers the updated sect crcs.
		 */
2793 		hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT1]=
2794 			hammer2_icrc32(
2795 				(char *)&hmp->voldata +
2796 				 HAMMER2_VOLUME_ICRC1_OFF,
2797 				HAMMER2_VOLUME_ICRC1_SIZE);
2798 		hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT0]=
2799 			hammer2_icrc32(
2800 				(char *)&hmp->voldata +
2801 				 HAMMER2_VOLUME_ICRC0_OFF,
2802 				HAMMER2_VOLUME_ICRC0_SIZE);
2803 		hmp->voldata.icrc_volheader =
2804 			hammer2_icrc32(
2805 				(char *)&hmp->voldata +
2806 				 HAMMER2_VOLUME_ICRCVH_OFF,
2807 				HAMMER2_VOLUME_ICRCVH_SIZE);
2808 		break;
2809 	default:
2810 		break;
2811 
2812 	}
2813 done:
2814 	if (hammer2_debug & 0x0008) {
2815 		kprintf("%*.*s} %p/%d %04x ",
2816 			info->depth, info->depth, "",
2817 			chain, chain->refs, chain->flags);
2818 	}
2819 }
2820 
2821 #if 0
2822 /*
2823  * PASS2 - not yet implemented (should be called only with the root chain?)
2824  */
2825 static void
2826 hammer2_chain_flush_pass2(hammer2_mount_t *hmp, hammer2_chain_t *chain)
2827 {
	/* Intentionally empty placeholder; compiled out via #if 0 above. */
2828 }
2829 #endif
2830 
2831 /*
2832  * Stand-alone flush.  If the chain is unable to completely flush we have
2833  * to be sure that SUBMODIFIED propagates up the parent chain.  We must not
2834  * clear the MOVED bit after flushing in this situation or our desynchronized
2835  * bref will not properly update in the parent.
2836  *
2837  * This routine can be called from several places but the most important
2838  * is from the hammer2_vop_reclaim() function.  We want to try to completely
2839  * clean out the inode structure to prevent disconnected inodes from
2840  * building up and blowing out the kmalloc pool.
2841  *
2842  * If modify_tid is 0 (usual case), a new modify_tid is allocated and
2843  * applied to the flush.  The depth-limit handling code is the only
2844  * code which passes a non-zero modify_tid to hammer2_chain_flush().
2845  */
2846 void
2847 hammer2_chain_flush(hammer2_mount_t *hmp, hammer2_chain_t *chain,
2848 		    hammer2_tid_t modify_tid)
2849 {
2850 	hammer2_chain_t *parent;
2851 	hammer2_chain_t *scan;
2852 	hammer2_blockref_t *base;
2853 	hammer2_flush_info_t info;
2854 	int count;
2855 	int reflush;
2856 
2857 	/*
2858 	 * Execute the recursive flush and handle deferrals.
2859 	 *
2860 	 * Chains can be ridiculously long (thousands deep), so to
2861 	 * avoid blowing out the kernel stack the recursive flush has a
2862 	 * depth limit.  Elements at the limit are placed on a list
2863 	 * for re-execution after the stack has been popped.
2864 	 */
2865 	bzero(&info, sizeof(info));
2866 	TAILQ_INIT(&info.flush_list);
2867 
2868 	if (modify_tid == 0) {
2869 		hammer2_voldata_lock(hmp);
2870 		info.modify_tid = hmp->voldata.alloc_tid++;
2871 		atomic_set_int(&hmp->vchain.flags, HAMMER2_CHAIN_MODIFIED_AUX);
2872 		hammer2_voldata_unlock(hmp);
2873 	} else {
2874 		info.modify_tid = modify_tid;
2875 	}
2876 	reflush = 1;
2877 
2878 	while (reflush) {
2879 		/*
2880 		 * Primary recursion
2881 		 */
2882 		hammer2_chain_flush_pass1(hmp, chain, &info);
2883 		reflush = 0;
2884 
2885 		while ((scan = TAILQ_FIRST(&info.flush_list)) != NULL) {
2886 			/*
2887 			 * Secondary recursion.  Note that a reference is
2888 			 * retained from the element's presence on the
2889 			 * deferral list.
2890 			 */
2891 			KKASSERT(scan->flags & HAMMER2_CHAIN_DEFERRED);
2892 			TAILQ_REMOVE(&info.flush_list, scan, flush_node);
2893 			atomic_clear_int(&scan->flags, HAMMER2_CHAIN_DEFERRED);
2894 
2895 			/*
2896 			 * Now that we've popped back up we can do a secondary
2897 			 * recursion on the deferred elements.
2898 			 */
2899 			if (hammer2_debug & 0x0040)
2900 				kprintf("defered flush %p\n", scan);
2901 			hammer2_chain_lock(hmp, scan, HAMMER2_RESOLVE_MAYBE);
2902 			hammer2_chain_flush(hmp, scan, info.modify_tid);
2903 			hammer2_chain_unlock(hmp, scan);
2904 
2905 			/*
2906 			 * Only flag a reflush if SUBMODIFIED is no longer
2907 			 * set.  If SUBMODIFIED is set the element will just
2908 			 * wind up on our flush_list again.
2909 			 */
2910 			if ((scan->flags & (HAMMER2_CHAIN_SUBMODIFIED |
2911 					    HAMMER2_CHAIN_MODIFIED |
2912 					    HAMMER2_CHAIN_MODIFIED_AUX)) == 0) {
2913 				reflush = 1;
2914 			}
2915 			hammer2_chain_drop(hmp, scan);
2916 		}
2917 		if ((hammer2_debug & 0x0040) && reflush)
2918 			kprintf("reflush %p\n", chain);
2919 	}
2920 
2921 	/*
2922 	 * The SUBMODIFIED bit must propagate upward if the chain could not
2923 	 * be completely flushed.
2924 	 */
2925 	if (chain->flags & (HAMMER2_CHAIN_SUBMODIFIED |
2926 			    HAMMER2_CHAIN_MODIFIED |
2927 			    HAMMER2_CHAIN_MODIFIED_AUX |
2928 			    HAMMER2_CHAIN_MOVED)) {
2929 		hammer2_chain_parent_setsubmod(hmp, chain);
2930 	}
2931 
2932 	/*
2933 	 * If the only thing left is a simple bref update try to
2934 	 * pro-actively update the parent, otherwise return early.
2935 	 */
2936 	parent = chain->parent;
2937 	if (parent == NULL) {
2938 		return;
2939 	}
2940 	if (chain->bref.type != HAMMER2_BREF_TYPE_INODE ||
2941 	    (chain->flags & (HAMMER2_CHAIN_SUBMODIFIED |
2942 			     HAMMER2_CHAIN_MODIFIED |
2943 			     HAMMER2_CHAIN_MODIFIED_AUX |
2944 			     HAMMER2_CHAIN_MOVED)) != HAMMER2_CHAIN_MOVED) {
2945 		return;
2946 	}
2947 
2948 	/*
2949 	 * We are locking backwards so allow the lock to fail.
2950 	 */
2951 	if (ccms_thread_lock_nonblock(&parent->cst, CCMS_STATE_EXCLUSIVE))
2952 		return;
2953 
2954 	/*
2955 	 * We are updating brefs but we have to call chain_modify()
2956 	 * because our caller is not being run from a recursive flush.
2957 	 *
2958 	 * This will also chain up the parent list and set the SUBMODIFIED
2959 	 * flag.
2960 	 *
2961 	 * We do not want to set HAMMER2_CHAIN_MODIFY_TID here because the
2962 	 * modification is only related to updating a bref in the parent.
2963 	 *
2964 	 * When updating the blockset embedded in the volume header we must
2965 	 * also update voldata.mirror_tid.
2966 	 */
2967 	hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_MAYBE);
2968 	hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_NO_MODIFY_TID);
2969 
2970 	switch(parent->bref.type) {
2971 	case HAMMER2_BREF_TYPE_INODE:
2972 		base = &parent->data->ipdata.u.blockset.
2973 			blockref[0];
2974 		count = HAMMER2_SET_COUNT;
2975 		break;
2976 	case HAMMER2_BREF_TYPE_INDIRECT:
2977 		base = &parent->data->npdata.blockref[0];
2978 		count = parent->bytes /
2979 			sizeof(hammer2_blockref_t);
2980 		break;
2981 	case HAMMER2_BREF_TYPE_VOLUME:
2982 		base = &hmp->voldata.sroot_blockset.blockref[0];
2983 		count = HAMMER2_SET_COUNT;
2984 		if (chain->flags & HAMMER2_CHAIN_MOVED) {
2985 			if (hmp->voldata.mirror_tid < chain->bref.mirror_tid) {
2986 				hmp->voldata.mirror_tid =
2987 					chain->bref.mirror_tid;
2988 			}
2989 		}
2990 		break;
2991 	default:
2992 		base = NULL;
2993 		panic("hammer2_chain_flush: "
2994 		      "unrecognized blockref type: %d",
2995 		      parent->bref.type);
2996 	}
2997 
2998 	/*
2999 	 * Update the blockref in the parent.  We do not have to set
3000 	 * MOVED in the parent because the parent has been marked modified,
3001 	 * so the flush sequence will pick up the bref change.
3002 	 *
3003 	 * We do have to propagate mirror_tid upward.
3004 	 */
3005 	KKASSERT(chain->index >= 0 &&
3006 		 chain->index < count);
3007 	KKASSERT(chain->parent == parent);
3008 	if (chain->flags & HAMMER2_CHAIN_MOVED) {
3009 		base[chain->index] = chain->bref_flush;
3010 		if (parent->bref.mirror_tid < chain->bref_flush.mirror_tid)
3011 			parent->bref.mirror_tid = chain->bref_flush.mirror_tid;
3012 		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MOVED);
3013 		hammer2_chain_drop(hmp, chain);
3014 	} else if (bcmp(&base[chain->index], &chain->bref_flush,
3015 		   sizeof(chain->bref)) != 0) {
3016 		panic("hammer2: unflagged bref update(2)");
3017 	}
3018 	ccms_thread_unlock(&parent->cst);		/* release manual op */
3019 	hammer2_chain_unlock(hmp, parent);
3020 }
3021