1 /*
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 2022 Tomohiro Kusumi <tkusumi@netbsd.org>
5  * Copyright (c) 2011-2022 The DragonFly Project.  All rights reserved.
6  *
7  * This code is derived from software contributed to The DragonFly Project
8  * by Matthew Dillon <dillon@dragonflybsd.org>
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  *
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in
18  *    the documentation and/or other materials provided with the
19  *    distribution.
20  * 3. Neither the name of The DragonFly Project nor the names of its
21  *    contributors may be used to endorse or promote products derived
22  *    from this software without specific, prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
25  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
26  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
27  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
28  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
29  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
30  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
31  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
32  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
33  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
34  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35  * SUCH DAMAGE.
36  */
37 /*
38  * This subsystem implements most of the core support functions for
39  * the hammer2_chain structure.
40  *
 * Chains are the in-memory version of media objects (volume header, inodes,
42  * indirect blocks, data blocks, etc).  Chains represent a portion of the
43  * HAMMER2 topology.
44  *
45  * Chains are no-longer delete-duplicated.  Instead, the original in-memory
46  * chain will be moved along with its block reference (e.g. for things like
47  * renames, hardlink operations, modifications, etc), and will be indexed
48  * on a secondary list for flush handling instead of propagating a flag
49  * upward to the root.
50  *
51  * Concurrent front-end operations can still run against backend flushes
52  * as long as they do not cross the current flush boundary.  An operation
53  * running above the current flush (in areas not yet flushed) can become
 * part of the current flush while an operation running below the current
55  * flush can become part of the next flush.
56  */
57 /*
58 #include <sys/cdefs.h>
59 #include <sys/param.h>
60 #include <sys/systm.h>
61 #include <sys/types.h>
62 #include <sys/lock.h>
63 #include <sys/buf.h>
64 
65 #include <crypto/sha2/sha2.h>
66 */
67 
68 #include "hammer2.h"
69 
70 static hammer2_chain_t *hammer2_chain_create_indirect(
71 		hammer2_chain_t *parent,
72 		hammer2_key_t key, int keybits,
73 		hammer2_tid_t mtid, int for_type, int *errorp);
74 static int hammer2_chain_delete_obref(hammer2_chain_t *parent,
75 		hammer2_chain_t *chain,
76 		hammer2_tid_t mtid, int flags,
77 		hammer2_blockref_t *obref);
78 static hammer2_chain_t *hammer2_combined_find(
79 		hammer2_chain_t *parent,
80 		hammer2_blockref_t *base, int count,
81 		hammer2_key_t *key_nextp,
82 		hammer2_key_t key_beg, hammer2_key_t key_end,
83 		hammer2_blockref_t **brefp);
84 static hammer2_chain_t *hammer2_chain_lastdrop(hammer2_chain_t *chain,
85 				int depth);
86 static void hammer2_chain_lru_flush(hammer2_pfs_t *pmp);
87 
88 /*
89  * There are many degenerate situations where an extreme rate of console
90  * output can occur from warnings and errors.  Make sure this output does
91  * not impede operations.
92  */
93 /*
94 static struct krate krate_h2chk = { .freq = 5 };
95 static struct krate krate_h2me = { .freq = 1 };
96 static struct krate krate_h2em = { .freq = 1 };
97 */
98 
99 /*
100  * Basic RBTree for chains (core.rbtree).
101  */
102 RB_GENERATE(hammer2_chain_tree, hammer2_chain, rbnode, hammer2_chain_cmp);
103 
104 int
105 hammer2_chain_cmp(hammer2_chain_t *chain1, hammer2_chain_t *chain2)
106 {
107 	hammer2_key_t c1_beg;
108 	hammer2_key_t c1_end;
109 	hammer2_key_t c2_beg;
110 	hammer2_key_t c2_end;
111 
112 	/*
113 	 * Compare chains.  Overlaps are not supposed to happen and catch
114 	 * any software issues early we count overlaps as a match.
115 	 */
116 	c1_beg = chain1->bref.key;
117 	c1_end = c1_beg + ((hammer2_key_t)1 << chain1->bref.keybits) - 1;
118 	c2_beg = chain2->bref.key;
119 	c2_end = c2_beg + ((hammer2_key_t)1 << chain2->bref.keybits) - 1;
120 
121 	if (c1_end < c2_beg)	/* fully to the left */
122 		return(-1);
123 	if (c1_beg > c2_end)	/* fully to the right */
124 		return(1);
125 	return(0);		/* overlap (must not cross edge boundary) */
126 }
127 
128 /*
129  * Assert that a chain has no media data associated with it.
130  */
131 static __inline void
132 hammer2_chain_assert_no_data(hammer2_chain_t *chain)
133 {
134 	KKASSERT(chain->dio == NULL);
135 	if (chain->bref.type != HAMMER2_BREF_TYPE_VOLUME &&
136 	    chain->bref.type != HAMMER2_BREF_TYPE_FREEMAP &&
137 	    chain->data) {
138 		panic("hammer2_chain_assert_no_data: chain %p still has data",
139 		    chain);
140 	}
141 }
142 
143 /*
144  * Make a chain visible to the flusher.  The flusher operates using a top-down
145  * recursion based on the ONFLUSH flag.  It locates MODIFIED and UPDATE chains,
146  * flushes them, and updates blocks back to the volume root.
147  *
148  * This routine sets the ONFLUSH flag upward from the triggering chain until
149  * it hits an inode root or the volume root.  Inode chains serve as inflection
150  * points, requiring the flusher to bridge across trees.  Inodes include
151  * regular inodes, PFS roots (pmp->iroot), and the media super root
152  * (spmp->iroot).
153  */
void
hammer2_chain_setflush(hammer2_chain_t *chain)
{
	hammer2_chain_t *parent;

	/* Fast path: flag already set, nothing to propagate. */
	if ((chain->flags & HAMMER2_CHAIN_ONFLUSH) == 0) {
		/*
		 * Walk upward hand-over-hand: hold the current chain's
		 * shared spinlock while acquiring the parent's, so the
		 * chain->parent linkage cannot change underneath us.
		 */
		hammer2_spin_sh(&chain->core.spin);
		while ((chain->flags & HAMMER2_CHAIN_ONFLUSH) == 0) {
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONFLUSH);
			/* Inodes are inflection points; stop there. */
			if (chain->bref.type == HAMMER2_BREF_TYPE_INODE)
				break;
			if ((parent = chain->parent) == NULL)
				break;
			hammer2_spin_sh(&parent->core.spin);
			hammer2_spin_unsh(&chain->core.spin);
			chain = parent;
		}
		hammer2_spin_unsh(&chain->core.spin);
	}
}
174 
175 /*
176  * Allocate a new disconnected chain element representing the specified
177  * bref.  chain->refs is set to 1 and the passed bref is copied to
178  * chain->bref.  chain->bytes is derived from the bref.
179  *
180  * chain->pmp inherits pmp unless the chain is an inode (other than the
181  * super-root inode).
182  *
183  * NOTE: Returns a referenced but unlocked (because there is no core) chain.
184  */
hammer2_chain_t *
hammer2_chain_alloc(hammer2_dev_t *hmp, hammer2_pfs_t *pmp,
		    hammer2_blockref_t *bref)
{
	hammer2_chain_t *chain;
	u_int bytes;

	/*
	 * Special case - radix of 0 indicates a chain that does not
	 * need a data reference (context is completely embedded in the
	 * bref).  Otherwise the data size is 2^radix bytes.
	 */
	if ((int)(bref->data_off & HAMMER2_OFF_MASK_RADIX))
		bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
	else
		bytes = 0;

	/* Only recognized bref types get a chain; anything else is fatal. */
	switch(bref->type) {
	case HAMMER2_BREF_TYPE_INODE:
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_DATA:
	case HAMMER2_BREF_TYPE_DIRENT:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
	case HAMMER2_BREF_TYPE_FREEMAP:
	case HAMMER2_BREF_TYPE_VOLUME:
		chain = kmalloc_obj(sizeof(*chain), hmp->mchain,
				    M_WAITOK | M_ZERO);
		atomic_add_long(&hammer2_chain_allocs, 1);
		break;
	case HAMMER2_BREF_TYPE_EMPTY:
	default:
		panic("hammer2_chain_alloc: unrecognized blockref type: %d",
		      bref->type);
		break;
	}

	/*
	 * Initialize the new chain structure.  pmp must be set to NULL for
	 * chains belonging to the super-root topology of a device mount.
	 */
	if (pmp == hmp->spmp)
		chain->pmp = NULL;
	else
		chain->pmp = pmp;

	chain->hmp = hmp;
	chain->bref = *bref;		/* copy of the caller's blockref */
	chain->bytes = bytes;
	chain->refs = 1;		/* returned referenced, unlocked */
	chain->flags = HAMMER2_CHAIN_ALLOCATED;
	lockinit(&chain->diolk, "chdio", 0, 0);

	/*
	 * Set the PFS boundary flag if this chain represents a PFS root.
	 */
	if (bref->flags & HAMMER2_BREF_FLAG_PFSROOT)
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_PFSBOUNDARY);
	hammer2_chain_init(chain);

	return (chain);
}
247 
248 /*
249  * A common function to initialize chains including fchain and vchain.
250  */
251 void
252 hammer2_chain_init(hammer2_chain_t *chain)
253 {
254 	RB_INIT(&chain->core.rbtree);	/* live chains */
255 	hammer2_mtx_init(&chain->lock, "h2chain");
256 	hammer2_spin_init(&chain->core.spin, "h2chain");
257 }
258 
259 /*
260  * Add a reference to a chain element, preventing its destruction.
261  *
262  * (can be called with spinlock held)
263  */
void
hammer2_chain_ref(hammer2_chain_t *chain)
{
	/*
	 * On a 0->1 transition the chain may still be sitting on the
	 * LRU list (refs can legally be 0 for cached chains).
	 */
	if (atomic_fetchadd_int(&chain->refs, 1) == 0) {
		/*
		 * Just flag that the chain was used and should be recycled
		 * on the LRU if it encounters it later.  The lru_flush
		 * code moves LRUHINT'd chains to the tail instead of
		 * reclaiming them.
		 */
		if (chain->flags & HAMMER2_CHAIN_ONLRU)
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_LRUHINT);

#if 0
		/*
		 * REMOVED - reduces contention, lru_list is more heuristical
		 * now.
		 *
		 * 0->non-zero transition must ensure that chain is removed
		 * from the LRU list.
		 *
		 * NOTE: Already holding lru_spin here so we cannot call
		 *	 hammer2_chain_ref() to get it off lru_list, do
		 *	 it manually.
		 */
		if (chain->flags & HAMMER2_CHAIN_ONLRU) {
			hammer2_pfs_t *pmp = chain->pmp;
			hammer2_spin_ex(&pmp->lru_spin);
			if (chain->flags & HAMMER2_CHAIN_ONLRU) {
				atomic_add_int(&pmp->lru_count, -1);
				atomic_clear_int(&chain->flags,
						 HAMMER2_CHAIN_ONLRU);
				TAILQ_REMOVE(&pmp->lru_list, chain, lru_node);
			}
			hammer2_spin_unex(&pmp->lru_spin);
		}
#endif
	}
}
301 
302 /*
303  * Ref a locked chain and force the data to be held across an unlock.
304  * Chain must be currently locked.  The user of the chain who desires
305  * to release the hold must call hammer2_chain_lock_unhold() to relock
306  * and unhold the chain, then unlock normally, or may simply call
307  * hammer2_chain_drop_unhold() (which is safer against deadlocks).
308  */
void
hammer2_chain_ref_hold(hammer2_chain_t *chain)
{
	/*
	 * Bump lockcnt first so the data is held across a later unlock,
	 * then take the reference.  Caller must currently hold the chain
	 * locked (see function comment above for the release protocol).
	 */
	atomic_add_int(&chain->lockcnt, 1);
	hammer2_chain_ref(chain);
}
315 
316 /*
317  * Insert the chain in the core rbtree.
318  *
319  * Normal insertions are placed in the live rbtree.  Insertion of a deleted
320  * chain is a special case used by the flush code that is placed on the
321  * unstaged deleted list to avoid confusing the live view.
322  */
#define HAMMER2_CHAIN_INSERT_SPIN	0x0001	/* acquire parent core spinlock */
#define HAMMER2_CHAIN_INSERT_LIVE	0x0002	/* adjust live_count */
#define HAMMER2_CHAIN_INSERT_RACE	0x0004	/* fail on generation mismatch */

static
int
hammer2_chain_insert(hammer2_chain_t *parent, hammer2_chain_t *chain,
		     int flags, int generation)
{
	hammer2_chain_t *xchain __debugvar;
	int error = 0;

	/* Caller may already hold the parent's core spinlock. */
	if (flags & HAMMER2_CHAIN_INSERT_SPIN)
		hammer2_spin_ex(&parent->core.spin);

	/*
	 * Interlocked by spinlock, check for race.  A generation bump
	 * means another thread modified the parent's core since the
	 * caller sampled it.
	 */
	if ((flags & HAMMER2_CHAIN_INSERT_RACE) &&
	    parent->core.generation != generation) {
		error = HAMMER2_ERROR_EAGAIN;
		goto failed;
	}

	/*
	 * Insert chain.  A non-NULL return from RB_INSERT means a key
	 * collision, which must not happen.
	 */
	xchain = RB_INSERT(hammer2_chain_tree, &parent->core.rbtree, chain);
	KASSERT(xchain == NULL,
		("hammer2_chain_insert: collision %p %p (key=%016jx)",
		chain, xchain, chain->bref.key));
	atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
	chain->parent = parent;
	++parent->core.chain_count;
	++parent->core.generation;	/* XXX incs for _get() too, XXX */

	/*
	 * We have to keep track of the effective live-view blockref count
	 * so the create code knows when to push an indirect block.
	 */
	if (flags & HAMMER2_CHAIN_INSERT_LIVE)
		atomic_add_int(&parent->core.live_count, 1);
failed:
	if (flags & HAMMER2_CHAIN_INSERT_SPIN)
		hammer2_spin_unex(&parent->core.spin);
	return error;
}
370 
371 /*
372  * Drop the caller's reference to the chain.  When the ref count drops to
373  * zero this function will try to disassociate the chain from its parent and
374  * deallocate it, then recursely drop the parent using the implied ref
375  * from the chain's chain->parent.
376  *
377  * Nobody should own chain's mutex on the 1->0 transition, unless this drop
378  * races an acquisition by another cpu.  Therefore we can loop if we are
379  * unable to acquire the mutex, and refs is unlikely to be 1 unless we again
380  * race against another drop.
381  */
382 void
383 hammer2_chain_drop(hammer2_chain_t *chain)
384 {
385 	u_int refs;
386 
387 	KKASSERT(chain->refs > 0);
388 
389 	while (chain) {
390 		refs = chain->refs;
391 		cpu_ccfence();
392 		KKASSERT(refs > 0);
393 
394 		if (refs == 1) {
395 			if (hammer2_mtx_ex_try(&chain->lock) == 0)
396 				chain = hammer2_chain_lastdrop(chain, 0);
397 			/* retry the same chain, or chain from lastdrop */
398 		} else {
399 			if (atomic_cmpset_int(&chain->refs, refs, refs - 1))
400 				break;
401 			/* retry the same chain */
402 		}
403 		cpu_pause();
404 	}
405 }
406 
407 /*
408  * Unhold a held and probably not-locked chain, ensure that the data is
409  * dropped on the 1->0 transition of lockcnt by obtaining an exclusive
410  * lock and then simply unlocking the chain.
411  */
void
hammer2_chain_unhold(hammer2_chain_t *chain)
{
	u_int lockcnt;
	int iter = 0;

	for (;;) {
		lockcnt = chain->lockcnt;
		cpu_ccfence();
		if (lockcnt > 1) {
			/*
			 * Not the last hold; just decrement.  Retry the
			 * loop if the CAS races another cpu.
			 */
			if (atomic_cmpset_int(&chain->lockcnt,
					      lockcnt, lockcnt - 1)) {
				break;
			}
		} else if (hammer2_mtx_ex_try(&chain->lock) == 0) {
			/*
			 * Last hold (lockcnt == 1).  Acquire the mutex so
			 * hammer2_chain_unlock() can drop the data on the
			 * 1->0 transition.
			 */
			hammer2_chain_unlock(chain);
			break;
		} else {
			/*
			 * This situation can easily occur on SMP due to
			 * the gap inbetween the 1->0 transition and the
			 * final unlock.  We cannot safely block on the
			 * mutex because lockcnt might go above 1.
			 *
			 * XXX Sleep for one tick if it takes too long.
			 */
			if (++iter > 1000) {
				if (iter > 1000 + hz) {
					kprintf("hammer2: h2race1 %p\n", chain);
					iter = 1000;
				}
				tsleep(&iter, 0, "h2race1", 1);
			}
			cpu_pause();
		}
	}
}
449 
/*
 * Convenience wrapper: release the data hold first, then drop the
 * caller's reference.  The order matters; the hold must be released
 * while the reference still pins the chain.
 */
void
hammer2_chain_drop_unhold(hammer2_chain_t *chain)
{
	hammer2_chain_unhold(chain);
	hammer2_chain_drop(chain);
}
456 
/*
 * Re-establish a data hold on a referenced chain.  A transient shared
 * lock forces the data to be resolved, the extra lockcnt then keeps it
 * held after the unlock.
 */
void
hammer2_chain_rehold(hammer2_chain_t *chain)
{
	hammer2_chain_lock(chain, HAMMER2_RESOLVE_SHARED);
	atomic_add_int(&chain->lockcnt, 1);
	hammer2_chain_unlock(chain);
}
464 
465 /*
466  * Handles the (potential) last drop of chain->refs from 1->0.  Called with
467  * the mutex exclusively locked, refs == 1, and lockcnt 0.  SMP races are
468  * possible against refs and lockcnt.  We must dispose of the mutex on chain.
469  *
470  * This function returns an unlocked chain for recursive drop or NULL.  It
471  * can return the same chain if it determines it has raced another ref.
472  *
473  * --
474  *
475  * When two chains need to be recursively dropped we use the chain we
476  * would otherwise free to placehold the additional chain.  It's a bit
477  * convoluted but we can't just recurse without potentially blowing out
478  * the kernel stack.
479  *
480  * The chain cannot be freed if it has any children.
481  * The chain cannot be freed if flagged MODIFIED unless we can dispose of it.
482  * The chain cannot be freed if flagged UPDATE unless we can dispose of it.
483  * Any dedup registration can remain intact.
484  *
485  * The core spinlock is allowed to nest child-to-parent (not parent-to-child).
486  */
487 static
488 hammer2_chain_t *
489 hammer2_chain_lastdrop(hammer2_chain_t *chain, int depth)
490 {
491 	hammer2_pfs_t *pmp;
492 	hammer2_dev_t *hmp;
493 	hammer2_chain_t *parent;
494 	hammer2_chain_t *rdrop;
495 
496 	/*
497 	 * We need chain's spinlock to interlock the sub-tree test.
498 	 * We already have chain's mutex, protecting chain->parent.
499 	 *
500 	 * Remember that chain->refs can be in flux.
501 	 */
502 	hammer2_spin_ex(&chain->core.spin);
503 
504 	if (chain->parent != NULL) {
505 		/*
506 		 * If the chain has a parent the UPDATE bit prevents scrapping
507 		 * as the chain is needed to properly flush the parent.  Try
508 		 * to complete the 1->0 transition and return NULL.  Retry
509 		 * (return chain) if we are unable to complete the 1->0
510 		 * transition, else return NULL (nothing more to do).
511 		 *
512 		 * If the chain has a parent the MODIFIED bit prevents
513 		 * scrapping.
514 		 *
515 		 * Chains with UPDATE/MODIFIED are *not* put on the LRU list!
516 		 */
517 		if (chain->flags & (HAMMER2_CHAIN_UPDATE |
518 				    HAMMER2_CHAIN_MODIFIED)) {
519 			if (atomic_cmpset_int(&chain->refs, 1, 0)) {
520 				hammer2_spin_unex(&chain->core.spin);
521 				hammer2_chain_assert_no_data(chain);
522 				hammer2_mtx_unlock(&chain->lock);
523 				chain = NULL;
524 			} else {
525 				hammer2_spin_unex(&chain->core.spin);
526 				hammer2_mtx_unlock(&chain->lock);
527 			}
528 			return (chain);
529 		}
530 		/* spinlock still held */
531 	} else if (chain->bref.type == HAMMER2_BREF_TYPE_VOLUME ||
532 		   chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP) {
533 		/*
534 		 * Retain the static vchain and fchain.  Clear bits that
535 		 * are not relevant.  Do not clear the MODIFIED bit,
536 		 * and certainly do not put it on the delayed-flush queue.
537 		 */
538 		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
539 	} else {
540 		/*
541 		 * The chain has no parent and can be flagged for destruction.
542 		 * Since it has no parent, UPDATE can also be cleared.
543 		 */
544 		atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY);
545 		if (chain->flags & HAMMER2_CHAIN_UPDATE)
546 			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
547 
548 		/*
549 		 * If the chain has children we must propagate the DESTROY
550 		 * flag downward and rip the disconnected topology apart.
551 		 * This is accomplished by calling hammer2_flush() on the
552 		 * chain.
553 		 *
554 		 * Any dedup is already handled by the underlying DIO, so
555 		 * we do not have to specifically flush it here.
556 		 */
557 		if (chain->core.chain_count) {
558 			hammer2_spin_unex(&chain->core.spin);
559 			hammer2_flush(chain, HAMMER2_FLUSH_TOP |
560 					     HAMMER2_FLUSH_ALL);
561 			hammer2_mtx_unlock(&chain->lock);
562 
563 			return(chain);	/* retry drop */
564 		}
565 
566 		/*
567 		 * Otherwise we can scrap the MODIFIED bit if it is set,
568 		 * and continue along the freeing path.
569 		 *
570 		 * Be sure to clean-out any dedup bits.  Without a parent
571 		 * this chain will no longer be visible to the flush code.
572 		 * Easy check data_off to avoid the volume root.
573 		 */
574 		if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
575 			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
576 			atomic_add_long(&hammer2_count_modified_chains, -1);
577 			if (chain->pmp)
578 				hammer2_pfs_memory_wakeup(chain->pmp, -1);
579 		}
580 		/* spinlock still held */
581 	}
582 
583 	/* spinlock still held */
584 
585 	/*
586 	 * If any children exist we must leave the chain intact with refs == 0.
587 	 * They exist because chains are retained below us which have refs or
588 	 * may require flushing.
589 	 *
590 	 * Retry (return chain) if we fail to transition the refs to 0, else
591 	 * return NULL indication nothing more to do.
592 	 *
593 	 * Chains with children are NOT put on the LRU list.
594 	 */
595 	if (chain->core.chain_count) {
596 		if (atomic_cmpset_int(&chain->refs, 1, 0)) {
597 			hammer2_spin_unex(&chain->core.spin);
598 			hammer2_chain_assert_no_data(chain);
599 			hammer2_mtx_unlock(&chain->lock);
600 			chain = NULL;
601 		} else {
602 			hammer2_spin_unex(&chain->core.spin);
603 			hammer2_mtx_unlock(&chain->lock);
604 		}
605 		return (chain);
606 	}
607 	/* spinlock still held */
608 	/* no chains left under us */
609 
610 	/*
611 	 * chain->core has no children left so no accessors can get to our
612 	 * chain from there.  Now we have to lock the parent core to interlock
613 	 * remaining possible accessors that might bump chain's refs before
614 	 * we can safely drop chain's refs with intent to free the chain.
615 	 */
616 	hmp = chain->hmp;
617 	pmp = chain->pmp;	/* can be NULL */
618 	rdrop = NULL;
619 
620 	parent = chain->parent;
621 
622 	/*
623 	 * WARNING! chain's spin lock is still held here, and other spinlocks
624 	 *	    will be acquired and released in the code below.  We
625 	 *	    cannot be making fancy procedure calls!
626 	 */
627 
628 	/*
629 	 * We can cache the chain if it is associated with a pmp
630 	 * and not flagged as being destroyed or requesting a full
631 	 * release.  In this situation the chain is not removed
632 	 * from its parent, i.e. it can still be looked up.
633 	 *
634 	 * We intentionally do not cache DATA chains because these
635 	 * were likely used to load data into the logical buffer cache
636 	 * and will not be accessed again for some time.
637 	 */
638 	if ((chain->flags &
639 	     (HAMMER2_CHAIN_DESTROY | HAMMER2_CHAIN_RELEASE)) == 0 &&
640 	    chain->pmp &&
641 	    chain->bref.type != HAMMER2_BREF_TYPE_DATA) {
642 		if (parent)
643 			hammer2_spin_ex(&parent->core.spin);
644 		if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) {
645 			/*
646 			 * 1->0 transition failed, retry.  Do not drop
647 			 * the chain's data yet!
648 			 */
649 			if (parent)
650 				hammer2_spin_unex(&parent->core.spin);
651 			hammer2_spin_unex(&chain->core.spin);
652 			hammer2_mtx_unlock(&chain->lock);
653 
654 			return(chain);
655 		}
656 
657 		/*
658 		 * Success
659 		 */
660 		hammer2_chain_assert_no_data(chain);
661 
662 		/*
663 		 * Make sure we are on the LRU list, clean up excessive
664 		 * LRU entries.  We can only really drop one but there might
665 		 * be other entries that we can remove from the lru_list
666 		 * without dropping.
667 		 *
668 		 * NOTE: HAMMER2_CHAIN_ONLRU may only be safely set when
669 		 *	 chain->core.spin AND pmp->lru_spin are held, but
670 		 *	 can be safely cleared only holding pmp->lru_spin.
671 		 */
672 		if ((chain->flags & HAMMER2_CHAIN_ONLRU) == 0) {
673 			hammer2_spin_ex(&pmp->lru_spin);
674 			if ((chain->flags & HAMMER2_CHAIN_ONLRU) == 0) {
675 				atomic_set_int(&chain->flags,
676 					       HAMMER2_CHAIN_ONLRU);
677 				TAILQ_INSERT_TAIL(&pmp->lru_list,
678 						  chain, lru_node);
679 				atomic_add_int(&pmp->lru_count, 1);
680 			}
681 			if (pmp->lru_count < HAMMER2_LRU_LIMIT)
682 				depth = 1;	/* disable lru_list flush */
683 			hammer2_spin_unex(&pmp->lru_spin);
684 		} else {
685 			/* disable lru flush */
686 			depth = 1;
687 		}
688 
689 		if (parent) {
690 			hammer2_spin_unex(&parent->core.spin);
691 			parent = NULL;	/* safety */
692 		}
693 		hammer2_spin_unex(&chain->core.spin);
694 		hammer2_mtx_unlock(&chain->lock);
695 
696 		/*
697 		 * lru_list hysteresis (see above for depth overrides).
698 		 * Note that depth also prevents excessive lastdrop recursion.
699 		 */
700 		if (depth == 0)
701 			hammer2_chain_lru_flush(pmp);
702 
703 		return NULL;
704 		/* NOT REACHED */
705 	}
706 
707 	/*
708 	 * Make sure we are not on the LRU list.
709 	 */
710 	if (chain->flags & HAMMER2_CHAIN_ONLRU) {
711 		hammer2_spin_ex(&pmp->lru_spin);
712 		if (chain->flags & HAMMER2_CHAIN_ONLRU) {
713 			atomic_add_int(&pmp->lru_count, -1);
714 			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONLRU);
715 			TAILQ_REMOVE(&pmp->lru_list, chain, lru_node);
716 		}
717 		hammer2_spin_unex(&pmp->lru_spin);
718 	}
719 
720 	/*
721 	 * Spinlock the parent and try to drop the last ref on chain.
722 	 * On success determine if we should dispose of the chain
723 	 * (remove the chain from its parent, etc).
724 	 *
725 	 * (normal core locks are top-down recursive but we define
726 	 * core spinlocks as bottom-up recursive, so this is safe).
727 	 */
728 	if (parent) {
729 		hammer2_spin_ex(&parent->core.spin);
730 		if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) {
731 			/*
732 			 * 1->0 transition failed, retry.
733 			 */
734 			hammer2_spin_unex(&parent->core.spin);
735 			hammer2_spin_unex(&chain->core.spin);
736 			hammer2_mtx_unlock(&chain->lock);
737 
738 			return(chain);
739 		}
740 
741 		/*
742 		 * 1->0 transition successful, parent spin held to prevent
743 		 * new lookups, chain spinlock held to protect parent field.
744 		 * Remove chain from the parent.
745 		 *
746 		 * If the chain is being removed from the parent's rbtree but
747 		 * is not blkmapped, we have to adjust live_count downward.  If
748 		 * it is blkmapped then the blockref is retained in the parent
749 		 * as is its associated live_count.  This case can occur when
750 		 * a chain added to the topology is unable to flush and is
751 		 * then later deleted.
752 		 */
753 		if (chain->flags & HAMMER2_CHAIN_ONRBTREE) {
754 			if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) &&
755 			    (chain->flags & HAMMER2_CHAIN_BLKMAPPED) == 0) {
756 				atomic_add_int(&parent->core.live_count, -1);
757 			}
758 			RB_REMOVE(hammer2_chain_tree,
759 				  &parent->core.rbtree, chain);
760 			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
761 			--parent->core.chain_count;
762 			chain->parent = NULL;
763 		}
764 
765 		/*
766 		 * If our chain was the last chain in the parent's core the
767 		 * core is now empty and its parent might have to be
768 		 * re-dropped if it has 0 refs.
769 		 */
770 		if (parent->core.chain_count == 0) {
771 			rdrop = parent;
772 			atomic_add_int(&rdrop->refs, 1);
773 			/*
774 			if (atomic_cmpset_int(&rdrop->refs, 0, 1) == 0)
775 				rdrop = NULL;
776 			*/
777 		}
778 		hammer2_spin_unex(&parent->core.spin);
779 		parent = NULL;	/* safety */
780 		/* FALL THROUGH */
781 	} else {
782 		/*
783 		 * No-parent case.
784 		 */
785 		if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) {
786 			/*
787 			 * 1->0 transition failed, retry.
788 			 */
789 			hammer2_spin_unex(&parent->core.spin);
790 			hammer2_spin_unex(&chain->core.spin);
791 			hammer2_mtx_unlock(&chain->lock);
792 
793 			return(chain);
794 		}
795 	}
796 
797 	/*
798 	 * Successful 1->0 transition, no parent, no children... no way for
799 	 * anyone to ref this chain any more.  We can clean-up and free it.
800 	 *
801 	 * We still have the core spinlock, and core's chain_count is 0.
802 	 * Any parent spinlock is gone.
803 	 */
804 	hammer2_spin_unex(&chain->core.spin);
805 	hammer2_chain_assert_no_data(chain);
806 	hammer2_mtx_unlock(&chain->lock);
807 	KKASSERT(RB_EMPTY(&chain->core.rbtree) &&
808 		 chain->core.chain_count == 0);
809 
810 	/*
811 	 * All locks are gone, no pointers remain to the chain, finish
812 	 * freeing it.
813 	 */
814 	KKASSERT((chain->flags & (HAMMER2_CHAIN_UPDATE |
815 				  HAMMER2_CHAIN_MODIFIED)) == 0);
816 
817 	/*
818 	 * Once chain resources are gone we can use the now dead chain
819 	 * structure to placehold what might otherwise require a recursive
820 	 * drop, because we have potentially two things to drop and can only
821 	 * return one directly.
822 	 */
823 	if (chain->flags & HAMMER2_CHAIN_ALLOCATED) {
824 		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ALLOCATED);
825 		chain->hmp = NULL;
826 		kfree_obj(chain, hmp->mchain);
827 		atomic_add_long(&hammer2_chain_allocs, -1);
828 	}
829 
830 	/*
831 	 * Possible chaining loop when parent re-drop needed.
832 	 */
833 	return(rdrop);
834 }
835 
836 /*
837  * Heuristical flush of the LRU, try to reduce the number of entries
838  * on the LRU to (HAMMER2_LRU_LIMIT * 2 / 3).  This procedure is called
839  * only when lru_count exceeds HAMMER2_LRU_LIMIT.
840  */
static
void
hammer2_chain_lru_flush(hammer2_pfs_t *pmp)
{
	hammer2_chain_t *chain;

again:
	chain = NULL;
	hammer2_spin_ex(&pmp->lru_spin);
	while (pmp->lru_count > HAMMER2_LRU_LIMIT * 2 / 3) {
		/*
		 * Pick a chain off the lru_list, just recycle it quickly
		 * if LRUHINT is set (the chain was ref'd but left on
		 * the lru_list, so cycle to the end).
		 */
		chain = TAILQ_FIRST(&pmp->lru_list);
		TAILQ_REMOVE(&pmp->lru_list, chain, lru_node);

		if (chain->flags & HAMMER2_CHAIN_LRUHINT) {
			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_LRUHINT);
			TAILQ_INSERT_TAIL(&pmp->lru_list, chain, lru_node);
			chain = NULL;
			continue;
		}

		/*
		 * Ok, we are off the LRU.  We must adjust refs before we
		 * can safely clear the ONLRU flag.  A successful 0->1
		 * CAS gives us ownership; a failure means someone else
		 * holds a ref and will handle the chain themselves.
		 */
		atomic_add_int(&pmp->lru_count, -1);
		if (atomic_cmpset_int(&chain->refs, 0, 1)) {
			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONLRU);
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_RELEASE);
			break;
		}
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONLRU);
		chain = NULL;
	}
	hammer2_spin_unex(&pmp->lru_spin);
	if (chain == NULL)
		return;

	/*
	 * If we picked a chain off the lru list we may be able to lastdrop
	 * it.  Use a depth of 1 to prevent excessive lastdrop recursion.
	 * This mirrors the drop loop in hammer2_chain_drop().
	 */
	while (chain) {
		u_int refs;

		refs = chain->refs;
		cpu_ccfence();
		KKASSERT(refs > 0);

		if (refs == 1) {
			if (hammer2_mtx_ex_try(&chain->lock) == 0)
				chain = hammer2_chain_lastdrop(chain, 1);
			/* retry the same chain, or chain from lastdrop */
		} else {
			if (atomic_cmpset_int(&chain->refs, refs, refs - 1))
				break;
			/* retry the same chain */
		}
		cpu_pause();
	}
	/* keep flushing until the hysteresis target is reached */
	goto again;
}
907 
908 /*
909  * On last lock release.
910  */
911 static hammer2_io_t *
912 hammer2_chain_drop_data(hammer2_chain_t *chain)
913 {
914 	hammer2_io_t *dio;
915 
916 	if ((dio = chain->dio) != NULL) {
917 		chain->dio = NULL;
918 		chain->data = NULL;
919 	} else {
920 		switch(chain->bref.type) {
921 		case HAMMER2_BREF_TYPE_VOLUME:
922 		case HAMMER2_BREF_TYPE_FREEMAP:
923 			break;
924 		default:
925 			if (chain->data != NULL) {
926 				hammer2_spin_unex(&chain->core.spin);
927 				panic("chain data not null: "
928 				      "chain %p bref %016jx.%02x "
929 				      "refs %d parent %p dio %p data %p",
930 				      chain, chain->bref.data_off,
931 				      chain->bref.type, chain->refs,
932 				      chain->parent,
933 				      chain->dio, chain->data);
934 			}
935 			KKASSERT(chain->data == NULL);
936 			break;
937 		}
938 	}
939 	return dio;
940 }
941 
942 /*
943  * Lock a referenced chain element, acquiring its data with I/O if necessary,
944  * and specify how you would like the data to be resolved.
945  *
946  * If an I/O or other fatal error occurs, chain->error will be set to non-zero.
947  *
948  * The lock is allowed to recurse, multiple locking ops will aggregate
949  * the requested resolve types.  Once data is assigned it will not be
950  * removed until the last unlock.
951  *
952  * HAMMER2_RESOLVE_NEVER - Do not resolve the data element.
953  *			   (typically used to avoid device/logical buffer
954  *			    aliasing for data)
955  *
956  * HAMMER2_RESOLVE_MAYBE - Do not resolve data elements for chains in
957  *			   the INITIAL-create state (indirect blocks only).
958  *
959  *			   Do not resolve data elements for DATA chains.
960  *			   (typically used to avoid device/logical buffer
961  *			    aliasing for data)
962  *
963  * HAMMER2_RESOLVE_ALWAYS- Always resolve the data element.
964  *
965  * HAMMER2_RESOLVE_SHARED- (flag) The chain is locked shared, otherwise
966  *			   it will be locked exclusive.
967  *
968  * HAMMER2_RESOLVE_NONBLOCK- (flag) The chain is locked non-blocking.  If
969  *			   the lock fails, EAGAIN is returned.
970  *
971  * NOTE: Embedded elements (volume header, inodes) are always resolved
972  *	 regardless.
973  *
974  * NOTE: Specifying HAMMER2_RESOLVE_ALWAYS on a newly-created non-embedded
975  *	 element will instantiate and zero its buffer, and flush it on
976  *	 release.
977  *
978  * NOTE: (data) elements are normally locked RESOLVE_NEVER or RESOLVE_MAYBE
979  *	 so as not to instantiate a device buffer, which could alias against
980  *	 a logical file buffer.  However, if ALWAYS is specified the
981  *	 device buffer will be instantiated anyway.
982  *
983  * NOTE: The return value is always 0 unless NONBLOCK is specified, in which
984  *	 case it can be either 0 or EAGAIN.
985  *
986  * WARNING! This function blocks on I/O if data needs to be fetched.  This
987  *	    blocking can run concurrent with other compatible lock holders
988  *	    who do not need data returning.  The lock is not upgraded to
989  *	    exclusive during a data fetch, a separate bit is used to
990  *	    interlock I/O.  However, an exclusive lock holder can still count
991  *	    on being interlocked against an I/O fetch managed by a shared
992  *	    lock holder.
993  */
994 int
995 hammer2_chain_lock(hammer2_chain_t *chain, int how)
996 {
997 	KKASSERT(chain->refs > 0);
998 
999 	if (how & HAMMER2_RESOLVE_NONBLOCK) {
1000 		/*
1001 		 * We still have to bump lockcnt before acquiring the lock,
1002 		 * even for non-blocking operation, because the unlock code
1003 		 * live-loops on lockcnt == 1 when dropping the last lock.
1004 		 *
1005 		 * If the non-blocking operation fails we have to use an
1006 		 * unhold sequence to undo the mess.
1007 		 *
1008 		 * NOTE: LOCKAGAIN must always succeed without blocking,
1009 		 *	 even if NONBLOCK is specified.
1010 		 */
1011 		atomic_add_int(&chain->lockcnt, 1);
1012 		if (how & HAMMER2_RESOLVE_SHARED) {
1013 			if (how & HAMMER2_RESOLVE_LOCKAGAIN) {
1014 				hammer2_mtx_sh_again(&chain->lock);
1015 			} else {
1016 				if (hammer2_mtx_sh_try(&chain->lock) != 0) {
1017 					hammer2_chain_unhold(chain);
1018 					return EAGAIN;
1019 				}
1020 			}
1021 		} else {
1022 			if (hammer2_mtx_ex_try(&chain->lock) != 0) {
1023 				hammer2_chain_unhold(chain);
1024 				return EAGAIN;
1025 			}
1026 		}
1027 	} else {
1028 		/*
1029 		 * Get the appropriate lock.  If LOCKAGAIN is flagged with
1030 		 * SHARED the caller expects a shared lock to already be
1031 		 * present and we are giving it another ref.  This case must
1032 		 * importantly not block if there is a pending exclusive lock
1033 		 * request.
1034 		 */
1035 		atomic_add_int(&chain->lockcnt, 1);
1036 		if (how & HAMMER2_RESOLVE_SHARED) {
1037 			if (how & HAMMER2_RESOLVE_LOCKAGAIN) {
1038 				hammer2_mtx_sh_again(&chain->lock);
1039 			} else {
1040 				hammer2_mtx_sh(&chain->lock);
1041 			}
1042 		} else {
1043 			hammer2_mtx_ex(&chain->lock);
1044 		}
1045 	}
1046 
1047 	/*
1048 	 * If we already have a valid data pointer make sure the data is
1049 	 * synchronized to the current cpu, and then no further action is
1050 	 * necessary.
1051 	 */
1052 	if (chain->data) {
1053 		if (chain->dio)
1054 			hammer2_io_bkvasync(chain->dio);
1055 		return 0;
1056 	}
1057 
1058 	/*
1059 	 * Do we have to resolve the data?  This is generally only
1060 	 * applicable to HAMMER2_BREF_TYPE_DATA which is special-cased.
1061 	 * Other BREF types expects the data to be there.
1062 	 */
1063 	switch(how & HAMMER2_RESOLVE_MASK) {
1064 	case HAMMER2_RESOLVE_NEVER:
1065 		return 0;
1066 	case HAMMER2_RESOLVE_MAYBE:
1067 		if (chain->flags & HAMMER2_CHAIN_INITIAL)
1068 			return 0;
1069 		if (chain->bref.type == HAMMER2_BREF_TYPE_DATA)
1070 			return 0;
1071 #if 0
1072 		if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE)
1073 			return 0;
1074 		if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF)
1075 			return 0;
1076 #endif
1077 		/* fall through */
1078 	case HAMMER2_RESOLVE_ALWAYS:
1079 	default:
1080 		break;
1081 	}
1082 
1083 	/*
1084 	 * Caller requires data
1085 	 */
1086 	hammer2_chain_load_data(chain);
1087 
1088 	return 0;
1089 }
1090 
#if 0
/*
 * NOTE: The two functions below are compiled out (#if 0) and retained
 *	 for reference only.
 */
/*
 * Lock the chain, retain the hold, and drop the data persistence count.
 * The data should remain valid because we never transitioned lockcnt
 * through 0.
 */
void
hammer2_chain_lock_unhold(hammer2_chain_t *chain, int how)
{
	hammer2_chain_lock(chain, how);
	atomic_add_int(&chain->lockcnt, -1);
}

/*
 * Downgrade an exclusive chain lock to a shared chain lock.
 *
 * NOTE: There is no upgrade equivalent due to the ease of
 *	 deadlocks in that direction.
 */
void
hammer2_chain_lock_downgrade(hammer2_chain_t *chain)
{
	hammer2_mtx_downgrade(&chain->lock);
}
#endif
1116 
1117 /*
1118  * Issue I/O and install chain->data.  Caller must hold a chain lock, lock
1119  * may be of any type.
1120  *
1121  * Once chain->data is set it cannot be disposed of until all locks are
1122  * released.
1123  *
1124  * Make sure the data is synchronized to the current cpu.
1125  */
void
hammer2_chain_load_data(hammer2_chain_t *chain)
{
	hammer2_blockref_t *bref;
	hammer2_dev_t *hmp;
	hammer2_io_t *dio;
	char *bdata;
	int error;

	/*
	 * Degenerate case, data already present, or chain has no media
	 * reference to load.
	 *
	 * The KKASSERT verifies that the caller holds the chain lock in
	 * some form (shared or exclusive).
	 */
	KKASSERT(chain->lock.mtx_lock & MTX_MASK);
	if (chain->data) {
		if (chain->dio)
			hammer2_io_bkvasync(chain->dio);
		return;
	}
	if ((chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX) == 0)
		return;

	hmp = chain->hmp;
	KKASSERT(hmp != NULL);

	/*
	 * Gain the IOINPROG bit, interlocked block.
	 */
	for (;;) {
		u_int oflags;
		u_int nflags;

		oflags = chain->flags;
		cpu_ccfence();
		if (oflags & HAMMER2_CHAIN_IOINPROG) {
			/*
			 * Another thread owns the I/O interlock.  Request
			 * a wakeup (IOSIGNAL) and sleep until IOINPROG is
			 * released, then retry from the top.
			 */
			nflags = oflags | HAMMER2_CHAIN_IOSIGNAL;
			tsleep_interlock(&chain->flags, 0);
			if (atomic_cmpset_int(&chain->flags, oflags, nflags)) {
				tsleep(&chain->flags, PINTERLOCKED,
					"h2iocw", 0);
			}
			/* retry */
		} else {
			nflags = oflags | HAMMER2_CHAIN_IOINPROG;
			if (atomic_cmpset_int(&chain->flags, oflags, nflags)) {
				break;
			}
			/* retry */
		}
	}

	/*
	 * We own CHAIN_IOINPROG
	 *
	 * Degenerate case if we raced another load.
	 */
	if (chain->data) {
		if (chain->dio)
			hammer2_io_bkvasync(chain->dio);
		goto done;
	}

	/*
	 * We must resolve to a device buffer, either by issuing I/O or
	 * by creating a zero-fill element.  We do not mark the buffer
	 * dirty when creating a zero-fill element (the hammer2_chain_modify()
	 * API must still be used to do that).
	 *
	 * The device buffer is variable-sized in powers of 2 down
	 * to HAMMER2_MIN_ALLOC (typically 1K).  A 64K physical storage
	 * chunk always contains buffers of the same size. (XXX)
	 *
	 * The minimum physical IO size may be larger than the variable
	 * block size.
	 */
	bref = &chain->bref;

	/*
	 * The getblk() optimization can only be used on newly created
	 * elements if the physical block size matches the request.
	 */
	if (chain->flags & HAMMER2_CHAIN_INITIAL) {
		/* new chain: instantiate a zero'd buffer, no read I/O */
		error = hammer2_io_new(hmp, bref->type,
				       bref->data_off, chain->bytes,
				       &chain->dio);
	} else {
		/* existing chain: buffered read from the media */
		error = hammer2_io_bread(hmp, bref->type,
					 bref->data_off, chain->bytes,
					 &chain->dio);
		hammer2_adjreadcounter(chain->bref.type, chain->bytes);
	}
	if (error) {
		/* I/O failed; flag the chain and release the buffer */
		chain->error = HAMMER2_ERROR_EIO;
		kprintf("hammer2_chain_load_data: I/O error %016jx: %d\n",
			(intmax_t)bref->data_off, error);
		hammer2_io_bqrelse(&chain->dio);
		goto done;
	}
	chain->error = 0;

	/*
	 * This isn't perfect and can be ignored on OSs which do not have
	 * an indication as to whether a buffer is coming from cache or
	 * if I/O was actually issued for the read.  TESTEDGOOD will work
	 * pretty well without the B_IOISSUED logic because chains are
	 * cached, but in that situation (without B_IOISSUED) it will not
	 * detect whether a re-read via I/O is corrupted verses the original
	 * read.
	 *
	 * We can't re-run the CRC on every fresh lock.  That would be
	 * insanely expensive.
	 *
	 * If the underlying kernel buffer covers the entire chain we can
	 * use the B_IOISSUED indication to determine if we have to re-run
	 * the CRC on chain data for chains that managed to stay cached
	 * across the kernel disposal of the original buffer.
	 */
	if ((dio = chain->dio) != NULL && dio->bp) {
		//struct m_buf *bp = dio->bp;

		if (dio->psize == chain->bytes //&&
		    /*(bp->b_flags & B_IOISSUED)*/) {
			/* force the check code to be re-validated below */
			atomic_clear_int(&chain->flags,
					 HAMMER2_CHAIN_TESTEDGOOD);
			//bp->b_flags &= ~B_IOISSUED;
		}
	}

	/*
	 * NOTE: A locked chain's data cannot be modified without first
	 *	 calling hammer2_chain_modify().
	 */

	/*
	 * NOTE: hammer2_io_data() call issues bkvasync()
	 */
	bdata = hammer2_io_data(chain->dio, chain->bref.data_off);

	if (chain->flags & HAMMER2_CHAIN_INITIAL) {
		/*
		 * Clear INITIAL.  In this case we used io_new() and the
		 * buffer has been zero'd and marked dirty.
		 *
		 * CHAIN_MODIFIED has not been set yet, and we leave it
		 * that way for now.  Set a temporary CHAIN_NOTTESTED flag
		 * to prevent hammer2_chain_testcheck() from trying to match
		 * a check code that has not yet been generated.  This bit
		 * should NOT end up on the actual media.
		 */
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_NOTTESTED);
	} else if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
		/*
		 * check data not currently synchronized due to
		 * modification.  XXX assumes data stays in the buffer
		 * cache, which might not be true (need biodep on flush
		 * to calculate crc?  or simple crc?).
		 */
	} else if ((chain->flags & HAMMER2_CHAIN_TESTEDGOOD) == 0) {
		/* validate the check code once, then cache the result */
		if (hammer2_chain_testcheck(chain, bdata) == 0) {
			chain->error = HAMMER2_ERROR_CHECK;
		} else {
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_TESTEDGOOD);
		}
	}

	/*
	 * Setup the data pointer by pointing it into the buffer.
	 * WARNING! Other threads can start using the data the instant we
	 *	    set chain->data non-NULL.
	 */
	switch (bref->type) {
	case HAMMER2_BREF_TYPE_VOLUME:
	case HAMMER2_BREF_TYPE_FREEMAP:
		/* embedded types must never reach the media-load path */
		panic("hammer2_chain_load_data: unresolved volume header");
		break;
	case HAMMER2_BREF_TYPE_DIRENT:
		KKASSERT(chain->bytes != 0);
		/* fall through */
	case HAMMER2_BREF_TYPE_INODE:
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_DATA:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	default:
		/*
		 * Point data at the device buffer and leave dio intact.
		 */
		chain->data = (void *)bdata;
		break;
	}

	/*
	 * Release HAMMER2_CHAIN_IOINPROG and signal waiters if requested.
	 */
done:
	for (;;) {
		u_int oflags;
		u_int nflags;

		oflags = chain->flags;
		nflags = oflags & ~(HAMMER2_CHAIN_IOINPROG |
				    HAMMER2_CHAIN_IOSIGNAL);
		KKASSERT(oflags & HAMMER2_CHAIN_IOINPROG);
		if (atomic_cmpset_int(&chain->flags, oflags, nflags)) {
			if (oflags & HAMMER2_CHAIN_IOSIGNAL)
				wakeup(&chain->flags);
			break;
		}
	}
}
1337 
1338 /*
1339  * Unlock and deref a chain element.
1340  *
1341  * Remember that the presence of children under chain prevent the chain's
1342  * destruction but do not add additional references, so the dio will still
1343  * be dropped.
1344  */
void
hammer2_chain_unlock(hammer2_chain_t *chain)
{
	hammer2_io_t *dio;
	u_int lockcnt;
	int iter = 0;

	/*
	 * If multiple locks are present (or being attempted) on this
	 * particular chain we can just unlock, drop refs, and return.
	 *
	 * Otherwise fall-through on the 1->0 transition.
	 */
	for (;;) {
		lockcnt = chain->lockcnt;
		KKASSERT(lockcnt > 0);
		cpu_ccfence();
		if (lockcnt > 1) {
			/*
			 * Not the last lock reference; just decrement
			 * lockcnt and release our mutex hold.
			 */
			if (atomic_cmpset_int(&chain->lockcnt,
					      lockcnt, lockcnt - 1)) {
				hammer2_mtx_unlock(&chain->lock);
				return;
			}
		} else if (hammer2_mtx_upgrade_try(&chain->lock) == 0) {
			/* while holding the mutex exclusively */
			if (atomic_cmpset_int(&chain->lockcnt, 1, 0))
				break;
		} else {
			/*
			 * This situation can easily occur on SMP due to
			 * the gap inbetween the 1->0 transition and the
			 * final unlock.  We cannot safely block on the
			 * mutex because lockcnt might go above 1.
			 *
			 * XXX Sleep for one tick if it takes too long.
			 */
			if (++iter > 1000) {
				if (iter > 1000 + hz) {
					kprintf("hammer2: h2race2 %p\n", chain);
					iter = 1000;
				}
				tsleep(&iter, 0, "h2race2", 1);
			}
			cpu_pause();
		}
		/* retry */
	}

	/*
	 * Last unlock / mutex upgraded to exclusive.  Drop the data
	 * reference.  The dio (if any) is released outside the spinlock
	 * by hammer2_io_bqrelse().
	 */
	dio = hammer2_chain_drop_data(chain);
	if (dio)
		hammer2_io_bqrelse(&dio);
	hammer2_mtx_unlock(&chain->lock);
}
1402 
#if 0
/*
 * Unlock the chain while keeping its data persistence count bumped, so
 * chain->data remains valid across the unlock.
 *
 * NOTE: Compiled out (#if 0); retained for reference only.
 */
void
hammer2_chain_unlock_hold(hammer2_chain_t *chain)
{
	atomic_add_int(&chain->lockcnt, 1);
	hammer2_chain_unlock(chain);
}
#endif
1414 
1415 /*
1416  * Helper to obtain the blockref[] array base and count for a chain.
1417  *
1418  * XXX Not widely used yet, various use cases need to be validated and
1419  *     converted to use this function.
1420  */
1421 static
1422 hammer2_blockref_t *
1423 hammer2_chain_base_and_count(hammer2_chain_t *parent, int *countp)
1424 {
1425 	hammer2_blockref_t *base;
1426 	int count;
1427 
1428 	if (parent->flags & HAMMER2_CHAIN_INITIAL) {
1429 		base = NULL;
1430 
1431 		switch(parent->bref.type) {
1432 		case HAMMER2_BREF_TYPE_INODE:
1433 			count = HAMMER2_SET_COUNT;
1434 			break;
1435 		case HAMMER2_BREF_TYPE_INDIRECT:
1436 		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1437 			count = parent->bytes / sizeof(hammer2_blockref_t);
1438 			break;
1439 		case HAMMER2_BREF_TYPE_VOLUME:
1440 			count = HAMMER2_SET_COUNT;
1441 			break;
1442 		case HAMMER2_BREF_TYPE_FREEMAP:
1443 			count = HAMMER2_SET_COUNT;
1444 			break;
1445 		default:
1446 			panic("hammer2_chain_base_and_count: "
1447 			      "unrecognized blockref type: %d",
1448 			      parent->bref.type);
1449 			count = 0;
1450 			break;
1451 		}
1452 	} else {
1453 		switch(parent->bref.type) {
1454 		case HAMMER2_BREF_TYPE_INODE:
1455 			base = &parent->data->ipdata.u.blockset.blockref[0];
1456 			count = HAMMER2_SET_COUNT;
1457 			break;
1458 		case HAMMER2_BREF_TYPE_INDIRECT:
1459 		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1460 			base = &parent->data->npdata[0];
1461 			count = parent->bytes / sizeof(hammer2_blockref_t);
1462 			break;
1463 		case HAMMER2_BREF_TYPE_VOLUME:
1464 			base = &parent->data->voldata.
1465 					sroot_blockset.blockref[0];
1466 			count = HAMMER2_SET_COUNT;
1467 			break;
1468 		case HAMMER2_BREF_TYPE_FREEMAP:
1469 			base = &parent->data->blkset.blockref[0];
1470 			count = HAMMER2_SET_COUNT;
1471 			break;
1472 		default:
1473 			panic("hammer2_chain_base_and_count: "
1474 			      "unrecognized blockref type: %d",
1475 			      parent->bref.type);
1476 			base = NULL;
1477 			count = 0;
1478 			break;
1479 		}
1480 	}
1481 	*countp = count;
1482 
1483 	return base;
1484 }
1485 
1486 /*
1487  * This counts the number of live blockrefs in a block array and
1488  * also calculates the point at which all remaining blockrefs are empty.
1489  * This routine can only be called on a live chain.
1490  *
1491  * Caller holds the chain locked, but possibly with a shared lock.  We
1492  * must use an exclusive spinlock to prevent corruption.
1493  *
1494  * NOTE: Flag is not set until after the count is complete, allowing
1495  *	 callers to test the flag without holding the spinlock.
1496  *
1497  * NOTE: If base is NULL the related chain is still in the INITIAL
1498  *	 state and there are no blockrefs to count.
1499  *
1500  * NOTE: live_count may already have some counts accumulated due to
1501  *	 creation and deletion and could even be initially negative.
1502  */
1503 void
1504 hammer2_chain_countbrefs(hammer2_chain_t *chain,
1505 			 hammer2_blockref_t *base, int count)
1506 {
1507 	hammer2_spin_ex(&chain->core.spin);
1508 	if ((chain->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0) {
1509 		if (base) {
1510 			while (--count >= 0) {
1511 				if (base[count].type != HAMMER2_BREF_TYPE_EMPTY)
1512 					break;
1513 			}
1514 			chain->core.live_zero = count + 1;
1515 			while (count >= 0) {
1516 				if (base[count].type != HAMMER2_BREF_TYPE_EMPTY)
1517 					atomic_add_int(&chain->core.live_count,
1518 						       1);
1519 				--count;
1520 			}
1521 		} else {
1522 			chain->core.live_zero = 0;
1523 		}
1524 		/* else do not modify live_count */
1525 		atomic_set_int(&chain->flags, HAMMER2_CHAIN_COUNTEDBREFS);
1526 	}
1527 	hammer2_spin_unex(&chain->core.spin);
1528 }
1529 
1530 /*
1531  * Resize the chain's physical storage allocation in-place.  This function does
1532  * not usually adjust the data pointer and must be followed by (typically) a
1533  * hammer2_chain_modify() call to copy any old data over and adjust the
1534  * data pointer.
1535  *
1536  * Chains can be resized smaller without reallocating the storage.  Resizing
1537  * larger will reallocate the storage.  Excess or prior storage is reclaimed
1538  * asynchronously at a later time.
1539  *
1540  * An nradix value of 0 is special-cased to mean that the storage should
1541  * be disassociated, that is the chain is being resized to 0 bytes (not 1
1542  * byte).
1543  *
1544  * Must be passed an exclusively locked parent and chain.
1545  *
1546  * This function is mostly used with DATA blocks locked RESOLVE_NEVER in order
1547  * to avoid instantiating a device buffer that conflicts with the vnode data
1548  * buffer.  However, because H2 can compress or encrypt data, the chain may
1549  * have a dio assigned to it in those situations, and they do not conflict.
1550  *
1551  * XXX return error if cannot resize.
1552  */
1553 int
1554 hammer2_chain_resize(hammer2_chain_t *chain,
1555 		     hammer2_tid_t mtid, hammer2_off_t dedup_off,
1556 		     int nradix, int flags)
1557 {
1558 	hammer2_dev_t *hmp;
1559 	size_t obytes;
1560 	size_t nbytes;
1561 	int error;
1562 
1563 	hmp = chain->hmp;
1564 
1565 	/*
1566 	 * Only data and indirect blocks can be resized for now.
1567 	 * (The volu root, inodes, and freemap elements use a fixed size).
1568 	 */
1569 	KKASSERT(chain != &hmp->vchain);
1570 	KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA ||
1571 		 chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
1572 		 chain->bref.type == HAMMER2_BREF_TYPE_DIRENT);
1573 
1574 	/*
1575 	 * Nothing to do if the element is already the proper size
1576 	 */
1577 	obytes = chain->bytes;
1578 	nbytes = (nradix) ? (1U << nradix) : 0;
1579 	if (obytes == nbytes)
1580 		return (chain->error);
1581 
1582 	/*
1583 	 * Make sure the old data is instantiated so we can copy it.  If this
1584 	 * is a data block, the device data may be superfluous since the data
1585 	 * might be in a logical block, but compressed or encrypted data is
1586 	 * another matter.
1587 	 *
1588 	 * NOTE: The modify will set BLKMAPUPD for us if BLKMAPPED is set.
1589 	 */
1590 	error = hammer2_chain_modify(chain, mtid, dedup_off, 0);
1591 	if (error)
1592 		return error;
1593 
1594 	/*
1595 	 * Reallocate the block, even if making it smaller (because different
1596 	 * block sizes may be in different regions).
1597 	 *
1598 	 * NOTE: Operation does not copy the data and may only be used
1599 	 *	 to resize data blocks in-place, or directory entry blocks
1600 	 *	 which are about to be modified in some manner.
1601 	 */
1602 	error = hammer2_freemap_alloc(chain, nbytes);
1603 	if (error)
1604 		return error;
1605 
1606 	chain->bytes = nbytes;
1607 
1608 	/*
1609 	 * We don't want the followup chain_modify() to try to copy data
1610 	 * from the old (wrong-sized) buffer.  It won't know how much to
1611 	 * copy.  This case should only occur during writes when the
1612 	 * originator already has the data to write in-hand.
1613 	 */
1614 	if (chain->dio) {
1615 		KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA ||
1616 			 chain->bref.type == HAMMER2_BREF_TYPE_DIRENT);
1617 		hammer2_io_brelse(&chain->dio);
1618 		chain->data = NULL;
1619 	}
1620 	return (chain->error);
1621 }
1622 
1623 /*
1624  * Set the chain modified so its data can be changed by the caller, or
1625  * install deduplicated data.  The caller must call this routine for each
1626  * set of modifications it makes, even if the chain is already flagged
1627  * MODIFIED.
1628  *
1629  * Sets bref.modify_tid to mtid only if mtid != 0.  Note that bref.modify_tid
1630  * is a CLC (cluster level change) field and is not updated by parent
1631  * propagation during a flush.
1632  *
1633  * Returns an appropriate HAMMER2_ERROR_* code, which will generally reflect
1634  * chain->error except for HAMMER2_ERROR_ENOSPC.  If the allocation fails
1635  * due to no space available, HAMMER2_ERROR_ENOSPC is returned and the chain
1636  * remains unmodified with its old data ref intact and chain->error
1637  * unchanged.
1638  *
1639  *				 Dedup Handling
1640  *
1641  * If the DEDUPABLE flag is set in the chain the storage must be reallocated
1642  * even if the chain is still flagged MODIFIED.  In this case the chain's
1643  * DEDUPABLE flag will be cleared once the new storage has been assigned.
1644  *
1645  * If the caller passes a non-zero dedup_off we will use it to assign the
1646  * new storage.  The MODIFIED flag will be *CLEARED* in this case, and
1647  * DEDUPABLE will be set (NOTE: the UPDATE flag is always set).  The caller
1648  * must not modify the data content upon return.
1649  */
1650 int
1651 hammer2_chain_modify(hammer2_chain_t *chain, hammer2_tid_t mtid,
1652 		     hammer2_off_t dedup_off, int flags)
1653 {
1654 	hammer2_dev_t *hmp;
1655 	hammer2_io_t *dio;
1656 	int error;
1657 	int wasinitial;
1658 	int setmodified;
1659 	int setupdate;
1660 	int newmod;
1661 	char *bdata;
1662 
1663 	hmp = chain->hmp;
1664 	KKASSERT(chain->lock.mtx_lock & MTX_EXCLUSIVE);
1665 
1666 	/*
1667 	 * Data is not optional for freemap chains (we must always be sure
1668 	 * to copy the data on COW storage allocations).
1669 	 */
1670 	if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
1671 	    chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
1672 		KKASSERT((chain->flags & HAMMER2_CHAIN_INITIAL) ||
1673 			 (flags & HAMMER2_MODIFY_OPTDATA) == 0);
1674 	}
1675 
1676 	/*
1677 	 * Data must be resolved if already assigned, unless explicitly
1678 	 * flagged otherwise.  If we cannot safety load the data the
1679 	 * modification fails and we return early.
1680 	 */
1681 	if (chain->data == NULL && chain->bytes != 0 &&
1682 	    (flags & HAMMER2_MODIFY_OPTDATA) == 0 &&
1683 	    (chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX)) {
1684 		hammer2_chain_load_data(chain);
1685 		if (chain->error)
1686 			return (chain->error);
1687 	}
1688 	error = 0;
1689 
1690 	/*
1691 	 * Set MODIFIED to indicate that the chain has been modified.  A new
1692 	 * allocation is required when modifying a chain.
1693 	 *
1694 	 * Set UPDATE to ensure that the blockref is updated in the parent.
1695 	 *
1696 	 * If MODIFIED is already set determine if we can reuse the assigned
1697 	 * data block or if we need a new data block.
1698 	 */
1699 	if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0) {
1700 		/*
1701 		 * Must set modified bit.
1702 		 */
1703 		atomic_add_long(&hammer2_count_modified_chains, 1);
1704 		atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
1705 		hammer2_pfs_memory_inc(chain->pmp);  /* can be NULL */
1706 		setmodified = 1;
1707 
1708 		/*
1709 		 * We may be able to avoid a copy-on-write if the chain's
1710 		 * check mode is set to NONE and the chain's current
1711 		 * modify_tid is beyond the last explicit snapshot tid.
1712 		 *
1713 		 * This implements HAMMER2's overwrite-in-place feature.
1714 		 *
1715 		 * NOTE! This data-block cannot be used as a de-duplication
1716 		 *	 source when the check mode is set to NONE.
1717 		 */
1718 		if ((chain->bref.type == HAMMER2_BREF_TYPE_DATA ||
1719 		     chain->bref.type == HAMMER2_BREF_TYPE_DIRENT) &&
1720 		    (chain->flags & HAMMER2_CHAIN_INITIAL) == 0 &&
1721 		    (chain->flags & HAMMER2_CHAIN_DEDUPABLE) == 0 &&
1722 		    HAMMER2_DEC_CHECK(chain->bref.methods) ==
1723 		     HAMMER2_CHECK_NONE &&
1724 		    chain->pmp &&
1725 		    chain->bref.modify_tid >
1726 		     chain->pmp->iroot->meta.pfs_lsnap_tid) {
1727 			/*
1728 			 * Sector overwrite allowed.
1729 			 */
1730 			newmod = 0;
1731 		} else if ((hmp->hflags & HMNT2_EMERG) &&
1732 			   chain->pmp &&
1733 			   chain->bref.modify_tid >
1734 			    chain->pmp->iroot->meta.pfs_lsnap_tid) {
1735 			/*
1736 			 * If in emergency delete mode then do a modify-in-
1737 			 * place on any chain type belonging to the PFS as
1738 			 * long as it doesn't mess up a snapshot.  We might
1739 			 * be forced to do this anyway a little further down
1740 			 * in the code if the allocation fails.
1741 			 *
1742 			 * Also note that in emergency mode, these modify-in-
1743 			 * place operations are NOT SAFE.  A storage failure,
1744 			 * power failure, or panic can corrupt the filesystem.
1745 			 */
1746 			newmod = 0;
1747 		} else {
1748 			/*
1749 			 * Sector overwrite not allowed, must copy-on-write.
1750 			 */
1751 			newmod = 1;
1752 		}
1753 	} else if (chain->flags & HAMMER2_CHAIN_DEDUPABLE) {
1754 		/*
1755 		 * If the modified chain was registered for dedup we need
1756 		 * a new allocation.  This only happens for delayed-flush
1757 		 * chains (i.e. which run through the front-end buffer
1758 		 * cache).
1759 		 */
1760 		newmod = 1;
1761 		setmodified = 0;
1762 	} else {
1763 		/*
1764 		 * Already flagged modified, no new allocation is needed.
1765 		 */
1766 		newmod = 0;
1767 		setmodified = 0;
1768 	}
1769 
1770 	/*
1771 	 * Flag parent update required.
1772 	 */
1773 	if ((chain->flags & HAMMER2_CHAIN_UPDATE) == 0) {
1774 		atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
1775 		setupdate = 1;
1776 	} else {
1777 		setupdate = 0;
1778 	}
1779 
1780 	/*
1781 	 * The XOP code returns held but unlocked focus chains.  This
1782 	 * prevents the chain from being destroyed but does not prevent
1783 	 * it from being modified.  diolk is used to interlock modifications
1784 	 * against XOP frontend accesses to the focus.
1785 	 *
1786 	 * This allows us to theoretically avoid deadlocking the frontend
1787 	 * if one of the backends lock up by not formally locking the
1788 	 * focused chain in the frontend.  In addition, the synchronization
1789 	 * code relies on this mechanism to avoid deadlocking concurrent
1790 	 * synchronization threads.
1791 	 */
1792 	lockmgr(&chain->diolk, LK_EXCLUSIVE);
1793 
1794 	/*
1795 	 * The modification or re-modification requires an allocation and
1796 	 * possible COW.  If an error occurs, the previous content and data
1797 	 * reference is retained and the modification fails.
1798 	 *
1799 	 * If dedup_off is non-zero, the caller is requesting a deduplication
1800 	 * rather than a modification.  The MODIFIED bit is not set and the
1801 	 * data offset is set to the deduplication offset.  The data cannot
1802 	 * be modified.
1803 	 *
1804 	 * NOTE: The dedup offset is allowed to be in a partially free state
1805 	 *	 and we must be sure to reset it to a fully allocated state
1806 	 *	 to force two bulkfree passes to free it again.
1807 	 *
1808 	 * NOTE: Only applicable when chain->bytes != 0.
1809 	 *
1810 	 * XXX can a chain already be marked MODIFIED without a data
1811 	 * assignment?  If not, assert here instead of testing the case.
1812 	 */
1813 	if (chain != &hmp->vchain && chain != &hmp->fchain &&
1814 	    chain->bytes) {
1815 		if ((chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX) == 0 ||
1816 		    newmod
1817 		) {
1818 			/*
1819 			 * NOTE: We do not have to remove the dedup
1820 			 *	 registration because the area is still
1821 			 *	 allocated and the underlying DIO will
1822 			 *	 still be flushed.
1823 			 */
1824 			if (dedup_off) {
1825 				chain->bref.data_off = dedup_off;
1826 				if ((int)(dedup_off & HAMMER2_OFF_MASK_RADIX))
1827 					chain->bytes = 1 <<
1828 						(int)(dedup_off &
1829 						HAMMER2_OFF_MASK_RADIX);
1830 				else
1831 					chain->bytes = 0;
1832 				chain->error = 0;
1833 				atomic_clear_int(&chain->flags,
1834 						 HAMMER2_CHAIN_MODIFIED);
1835 				atomic_add_long(&hammer2_count_modified_chains,
1836 						-1);
1837 				if (chain->pmp) {
1838 					hammer2_pfs_memory_wakeup(
1839 						chain->pmp, -1);
1840 				}
1841 				hammer2_freemap_adjust(hmp, &chain->bref,
1842 						HAMMER2_FREEMAP_DORECOVER);
1843 				atomic_set_int(&chain->flags,
1844 						HAMMER2_CHAIN_DEDUPABLE);
1845 			} else {
1846 				error = hammer2_freemap_alloc(chain,
1847 							      chain->bytes);
1848 				atomic_clear_int(&chain->flags,
1849 						HAMMER2_CHAIN_DEDUPABLE);
1850 
1851 				/*
1852 				 * If we are unable to allocate a new block
1853 				 * but we are in emergency mode, issue a
1854 				 * warning to the console and reuse the same
1855 				 * block.
1856 				 *
1857 				 * We behave as if the allocation were
1858 				 * successful.
1859 				 *
1860 				 * THIS IS IMPORTANT: These modifications
1861 				 * are virtually guaranteed to corrupt any
1862 				 * snapshots related to this filesystem.
1863 				 */
1864 				if (error && (hmp->hflags & HMNT2_EMERG)) {
1865 					error = 0;
1866 					chain->bref.flags |=
1867 						HAMMER2_BREF_FLAG_EMERG_MIP;
1868 
1869 					krateprintf(&krate_h2em,
1870 					    "hammer2: Emergency Mode WARNING: "
1871 					    "Operation will likely corrupt "
1872 					    "related snapshot: "
1873 					    "%016jx.%02x key=%016jx\n",
1874 					    chain->bref.data_off,
1875 					    chain->bref.type,
1876 					    chain->bref.key);
1877 				} else if (error == 0) {
1878 					chain->bref.flags &=
1879 						~HAMMER2_BREF_FLAG_EMERG_MIP;
1880 				}
1881 			}
1882 		}
1883 	}
1884 
1885 	/*
1886 	 * Stop here if error.  We have to undo any flag bits we might
1887 	 * have set above.
1888 	 */
1889 	if (error) {
1890 		if (setmodified) {
1891 			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
1892 			atomic_add_long(&hammer2_count_modified_chains, -1);
1893 			if (chain->pmp)
1894 				hammer2_pfs_memory_wakeup(chain->pmp, -1);
1895 		}
1896 		if (setupdate) {
1897 			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
1898 		}
1899 		lockmgr(&chain->diolk, LK_RELEASE);
1900 
1901 		return error;
1902 	}
1903 
1904 	/*
1905 	 * Update mirror_tid and modify_tid.  modify_tid is only updated
1906 	 * if not passed as zero (during flushes, parent propagation passes
1907 	 * the value 0).
1908 	 *
1909 	 * NOTE: chain->pmp could be the device spmp.
1910 	 */
1911 	chain->bref.mirror_tid = hmp->voldata.mirror_tid + 1;
1912 	if (mtid)
1913 		chain->bref.modify_tid = mtid;
1914 
1915 	/*
1916 	 * Set BLKMAPUPD to tell the flush code that an existing blockmap entry
1917 	 * requires updating as well as to tell the delete code that the
1918 	 * chain's blockref might not exactly match (in terms of physical size
1919 	 * or block offset) the one in the parent's blocktable.  The base key
1920 	 * of course will still match.
1921 	 */
1922 	if (chain->flags & HAMMER2_CHAIN_BLKMAPPED)
1923 		atomic_set_int(&chain->flags, HAMMER2_CHAIN_BLKMAPUPD);
1924 
1925 	/*
1926 	 * Short-cut data block handling when the caller does not need an
1927 	 * actual data reference to (aka OPTDATA), as long as the chain does
1928 	 * not already have a data pointer to the data and no de-duplication
1929 	 * occurred.
1930 	 *
1931 	 * This generally means that the modifications are being done via the
1932 	 * logical buffer cache.
1933 	 *
1934 	 * NOTE: If deduplication occurred we have to run through the data
1935 	 *	 stuff to clear INITIAL, and the caller will likely want to
1936 	 *	 assign the check code anyway.  Leaving INITIAL set on a
1937 	 *	 dedup can be deadly (it can cause the block to be zero'd!).
1938 	 *
1939 	 * This code also handles bytes == 0 (most dirents).
1940 	 */
1941 	if (chain->bref.type == HAMMER2_BREF_TYPE_DATA &&
1942 	    (flags & HAMMER2_MODIFY_OPTDATA) &&
1943 	    chain->data == NULL) {
1944 		if (dedup_off == 0) {
1945 			KKASSERT(chain->dio == NULL);
1946 			goto skip2;
1947 		}
1948 	}
1949 
1950 	/*
1951 	 * Clearing the INITIAL flag (for indirect blocks) indicates that
1952 	 * we've processed the uninitialized storage allocation.
1953 	 *
1954 	 * If this flag is already clear we are likely in a copy-on-write
1955 	 * situation but we have to be sure NOT to bzero the storage if
1956 	 * no data is present.
1957 	 *
	 * Clearing of NOTTESTED is allowed if the MODIFIED bit is set.
1959 	 */
1960 	if (chain->flags & HAMMER2_CHAIN_INITIAL) {
1961 		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
1962 		wasinitial = 1;
1963 	} else {
1964 		wasinitial = 0;
1965 	}
1966 
1967 	/*
1968 	 * Instantiate data buffer and possibly execute COW operation
1969 	 */
1970 	switch(chain->bref.type) {
1971 	case HAMMER2_BREF_TYPE_VOLUME:
1972 	case HAMMER2_BREF_TYPE_FREEMAP:
1973 		/*
1974 		 * The data is embedded, no copy-on-write operation is
1975 		 * needed.
1976 		 */
1977 		KKASSERT(chain->dio == NULL);
1978 		break;
1979 	case HAMMER2_BREF_TYPE_DIRENT:
1980 		/*
1981 		 * The data might be fully embedded.
1982 		 */
1983 		if (chain->bytes == 0) {
1984 			KKASSERT(chain->dio == NULL);
1985 			break;
1986 		}
1987 		/* fall through */
1988 	case HAMMER2_BREF_TYPE_INODE:
1989 	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
1990 	case HAMMER2_BREF_TYPE_DATA:
1991 	case HAMMER2_BREF_TYPE_INDIRECT:
1992 	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1993 		/*
1994 		 * Perform the copy-on-write operation
1995 		 *
1996 		 * zero-fill or copy-on-write depending on whether
1997 		 * chain->data exists or not and set the dirty state for
1998 		 * the new buffer.  hammer2_io_new() will handle the
1999 		 * zero-fill.
2000 		 *
2001 		 * If a dedup_off was supplied this is an existing block
2002 		 * and no COW, copy, or further modification is required.
2003 		 */
2004 		KKASSERT(chain != &hmp->vchain && chain != &hmp->fchain);
2005 
2006 		if (wasinitial && dedup_off == 0) {
2007 			error = hammer2_io_new(hmp, chain->bref.type,
2008 					       chain->bref.data_off,
2009 					       chain->bytes, &dio);
2010 		} else {
2011 			error = hammer2_io_bread(hmp, chain->bref.type,
2012 						 chain->bref.data_off,
2013 						 chain->bytes, &dio);
2014 		}
2015 		hammer2_adjreadcounter(chain->bref.type, chain->bytes);
2016 
2017 		/*
		 * If an I/O error occurs make sure callers cannot accidentally
2019 		 * modify the old buffer's contents and corrupt the filesystem.
2020 		 *
2021 		 * NOTE: hammer2_io_data() call issues bkvasync()
2022 		 */
2023 		if (error) {
2024 			kprintf("hammer2_chain_modify: hmp=%p I/O error\n",
2025 				hmp);
2026 			chain->error = HAMMER2_ERROR_EIO;
2027 			hammer2_io_brelse(&dio);
2028 			hammer2_io_brelse(&chain->dio);
2029 			chain->data = NULL;
2030 			break;
2031 		}
2032 		chain->error = 0;
2033 		bdata = hammer2_io_data(dio, chain->bref.data_off);
2034 
2035 		if (chain->data) {
2036 			/*
2037 			 * COW (unless a dedup).
2038 			 */
2039 			KKASSERT(chain->dio != NULL);
2040 			if (chain->data != (void *)bdata && dedup_off == 0) {
2041 				bcopy(chain->data, bdata, chain->bytes);
2042 			}
2043 		} else if (wasinitial == 0 && dedup_off == 0) {
2044 			/*
2045 			 * We have a problem.  We were asked to COW but
2046 			 * we don't have any data to COW with!
2047 			 */
2048 			panic("hammer2_chain_modify: having a COW %p\n",
2049 			      chain);
2050 		}
2051 
2052 		/*
2053 		 * Retire the old buffer, replace with the new.  Dirty or
2054 		 * redirty the new buffer.
2055 		 *
2056 		 * WARNING! The system buffer cache may have already flushed
2057 		 *	    the buffer, so we must be sure to [re]dirty it
2058 		 *	    for further modification.
2059 		 *
2060 		 *	    If dedup_off was supplied, the caller is not
2061 		 *	    expected to make any further modification to the
2062 		 *	    buffer.
2063 		 *
2064 		 * WARNING! hammer2_get_gdata() assumes dio never transitions
2065 		 *	    through NULL in order to optimize away unnecessary
2066 		 *	    diolk operations.
2067 		 */
2068 		{
2069 			hammer2_io_t *tio;
2070 
2071 			if ((tio = chain->dio) != NULL)
2072 				hammer2_io_bqrelse(&tio);
2073 			chain->data = (void *)bdata;
2074 			chain->dio = dio;
2075 			if (dedup_off == 0)
2076 				hammer2_io_setdirty(dio);
2077 		}
2078 		break;
2079 	default:
2080 		panic("hammer2_chain_modify: illegal non-embedded type %d",
2081 		      chain->bref.type);
2082 		break;
2083 
2084 	}
2085 skip2:
2086 	/*
2087 	 * setflush on parent indicating that the parent must recurse down
2088 	 * to us.  Do not call on chain itself which might already have it
2089 	 * set.
2090 	 */
2091 	if (chain->parent)
2092 		hammer2_chain_setflush(chain->parent);
2093 	lockmgr(&chain->diolk, LK_RELEASE);
2094 
2095 	return (chain->error);
2096 }
2097 
2098 /*
2099  * Modify the chain associated with an inode.
2100  */
2101 int
2102 hammer2_chain_modify_ip(hammer2_inode_t *ip, hammer2_chain_t *chain,
2103 			hammer2_tid_t mtid, int flags)
2104 {
2105 	int error;
2106 
2107 	hammer2_inode_modify(ip);
2108 	error = hammer2_chain_modify(chain, mtid, 0, flags);
2109 
2110 	return error;
2111 }
2112 
2113 /*
2114  * This function returns the chain at the nearest key within the specified
2115  * range.  The returned chain will be referenced but not locked.
2116  *
2117  * This function will recurse through chain->rbtree as necessary and will
2118  * return a *key_nextp suitable for iteration.  *key_nextp is only set if
2119  * the iteration value is less than the current value of *key_nextp.
2120  *
2121  * The caller should use (*key_nextp) to calculate the actual range of
2122  * the returned element, which will be (key_beg to *key_nextp - 1), because
2123  * there might be another element which is superior to the returned element
2124  * and overlaps it.
2125  *
2126  * (*key_nextp) can be passed as key_beg in an iteration only while non-NULL
2127  * chains continue to be returned.  On EOF (*key_nextp) may overflow since
2128  * it will wind up being (key_end + 1).
2129  *
2130  * WARNING!  Must be called with child's spinlock held.  Spinlock remains
2131  *	     held through the operation.
2132  */
struct hammer2_chain_find_info {
	hammer2_chain_t		*best;		/* best (lowest-key) candidate so far */
	hammer2_key_t		key_beg;	/* inclusive start of search range */
	hammer2_key_t		key_end;	/* inclusive end of search range */
	hammer2_key_t		key_next;	/* iteration limit, truncated by callback */
};

/* RB_SCAN helpers for hammer2_chain_find() */
static int hammer2_chain_find_cmp(hammer2_chain_t *child, void *data);
static int hammer2_chain_find_callback(hammer2_chain_t *child, void *data);
2142 
2143 static
2144 hammer2_chain_t *
2145 hammer2_chain_find(hammer2_chain_t *parent, hammer2_key_t *key_nextp,
2146 			  hammer2_key_t key_beg, hammer2_key_t key_end)
2147 {
2148 	struct hammer2_chain_find_info info;
2149 
2150 	info.best = NULL;
2151 	info.key_beg = key_beg;
2152 	info.key_end = key_end;
2153 	info.key_next = *key_nextp;
2154 
2155 	RB_SCAN(hammer2_chain_tree, &parent->core.rbtree,
2156 		hammer2_chain_find_cmp, hammer2_chain_find_callback,
2157 		&info);
2158 	*key_nextp = info.key_next;
2159 #if 0
2160 	kprintf("chain_find %p %016jx:%016jx next=%016jx\n",
2161 		parent, key_beg, key_end, *key_nextp);
2162 #endif
2163 
2164 	return (info.best);
2165 }
2166 
2167 static
2168 int
2169 hammer2_chain_find_cmp(hammer2_chain_t *child, void *data)
2170 {
2171 	struct hammer2_chain_find_info *info = data;
2172 	hammer2_key_t child_beg;
2173 	hammer2_key_t child_end;
2174 
2175 	child_beg = child->bref.key;
2176 	child_end = child_beg + ((hammer2_key_t)1 << child->bref.keybits) - 1;
2177 
2178 	if (child_end < info->key_beg)
2179 		return(-1);
2180 	if (child_beg > info->key_end)
2181 		return(1);
2182 	return(0);
2183 }
2184 
/*
 * RB_SCAN callback for hammer2_chain_find().  Called for each in-memory
 * chain overlapping the search range.  Tracks the chain with the lowest
 * base key in info->best and progressively truncates info->key_next so
 * the caller can iterate the remainder of the range.
 */
static
int
hammer2_chain_find_callback(hammer2_chain_t *child, void *data)
{
	struct hammer2_chain_find_info *info = data;
	hammer2_chain_t *best;
	hammer2_key_t child_end;

	if ((best = info->best) == NULL) {
		/*
		 * No previous best.  Assign best
		 */
		info->best = child;
	} else if (best->bref.key <= info->key_beg &&
		   child->bref.key <= info->key_beg) {
		/*
		 * Illegal overlap.
		 */
		KKASSERT(0);
		/*info->best = child;*/
	} else if (child->bref.key < best->bref.key) {
		/*
		 * Child has a nearer key and best is not flush with key_beg.
		 * Set best to child.  Truncate key_next to the old best key.
		 * (best still holds the previous candidate here, so this uses
		 * the pre-replacement key on purpose.)
		 */
		info->best = child;
		if (info->key_next > best->bref.key || info->key_next == 0)
			info->key_next = best->bref.key;
	} else if (child->bref.key == best->bref.key) {
		/*
		 * If our current best is flush with the child then this
		 * is an illegal overlap.
		 *
		 * key_next will automatically be limited to the smaller of
		 * the two end-points.
		 */
		KKASSERT(0);
		info->best = child;
	} else {
		/*
		 * Keep the current best but truncate key_next to the child's
		 * base.
		 *
		 * key_next will also automatically be limited to the smaller
		 * of the two end-points (probably not necessary for this case
		 * but we do it anyway).
		 */
		if (info->key_next > child->bref.key || info->key_next == 0)
			info->key_next = child->bref.key;
	}

	/*
	 * Always truncate key_next based on child's end-of-range.
	 *
	 * NOTE: child_end wraps to 0 when the child covers the top of the
	 *	 key space; the child_end != 0 test skips truncation then.
	 */
	child_end = child->bref.key + ((hammer2_key_t)1 << child->bref.keybits);
	if (child_end && (info->key_next > child_end || info->key_next == 0))
		info->key_next = child_end;

	return(0);
}
2245 
2246 /*
2247  * Retrieve the specified chain from a media blockref, creating the
2248  * in-memory chain structure which reflects it.  The returned chain is
2249  * held and locked according to (how) (HAMMER2_RESOLVE_*).  The caller must
2250  * handle crc-checks and so forth, and should check chain->error before
2251  * assuming that the data is good.
2252  *
2253  * To handle insertion races pass the INSERT_RACE flag along with the
2254  * generation number of the core.  NULL will be returned if the generation
2255  * number changes before we have a chance to insert the chain.  Insert
2256  * races can occur because the parent might be held shared.
2257  *
2258  * Caller must hold the parent locked shared or exclusive since we may
2259  * need the parent's bref array to find our block.
2260  *
2261  * WARNING! chain->pmp is always set to NULL for any chain representing
2262  *	    part of the super-root topology.
2263  */
2264 hammer2_chain_t *
2265 hammer2_chain_get(hammer2_chain_t *parent, int generation,
2266 		  hammer2_blockref_t *bref, int how)
2267 {
2268 	hammer2_dev_t *hmp = parent->hmp;
2269 	hammer2_chain_t *chain;
2270 	int error;
2271 
2272 	/*
2273 	 * Allocate a chain structure representing the existing media
2274 	 * entry.  Resulting chain has one ref and is not locked.
2275 	 */
2276 	if (bref->flags & HAMMER2_BREF_FLAG_PFSROOT)
2277 		chain = hammer2_chain_alloc(hmp, NULL, bref);
2278 	else
2279 		chain = hammer2_chain_alloc(hmp, parent->pmp, bref);
2280 	/* ref'd chain returned */
2281 
2282 	/*
2283 	 * Flag that the chain is in the parent's blockmap so delete/flush
2284 	 * knows what to do with it.
2285 	 */
2286 	atomic_set_int(&chain->flags, HAMMER2_CHAIN_BLKMAPPED);
2287 
2288 	/*
2289 	 * chain must be locked to avoid unexpected ripouts
2290 	 */
2291 	hammer2_chain_lock(chain, how);
2292 
2293 	/*
2294 	 * Link the chain into its parent.  A spinlock is required to safely
2295 	 * access the RBTREE, and it is possible to collide with another
2296 	 * hammer2_chain_get() operation because the caller might only hold
2297 	 * a shared lock on the parent.
2298 	 *
2299 	 * NOTE: Get races can occur quite often when we distribute
2300 	 *	 asynchronous read-aheads across multiple threads.
2301 	 */
2302 	KKASSERT(parent->refs > 0);
2303 	error = hammer2_chain_insert(parent, chain,
2304 				     HAMMER2_CHAIN_INSERT_SPIN |
2305 				     HAMMER2_CHAIN_INSERT_RACE,
2306 				     generation);
2307 	if (error) {
2308 		KKASSERT((chain->flags & HAMMER2_CHAIN_ONRBTREE) == 0);
2309 		/*kprintf("chain %p get race\n", chain);*/
2310 		hammer2_chain_unlock(chain);
2311 		hammer2_chain_drop(chain);
2312 		chain = NULL;
2313 	} else {
2314 		KKASSERT(chain->flags & HAMMER2_CHAIN_ONRBTREE);
2315 	}
2316 
2317 	/*
2318 	 * Return our new chain referenced but not locked, or NULL if
2319 	 * a race occurred.
2320 	 */
2321 	return (chain);
2322 }
2323 
2324 /*
2325  * Lookup initialization/completion API
2326  */
2327 hammer2_chain_t *
2328 hammer2_chain_lookup_init(hammer2_chain_t *parent, int flags)
2329 {
2330 	hammer2_chain_ref(parent);
2331 	if (flags & HAMMER2_LOOKUP_SHARED) {
2332 		hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS |
2333 					   HAMMER2_RESOLVE_SHARED);
2334 	} else {
2335 		hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
2336 	}
2337 	return (parent);
2338 }
2339 
2340 void
2341 hammer2_chain_lookup_done(hammer2_chain_t *parent)
2342 {
2343 	if (parent) {
2344 		hammer2_chain_unlock(parent);
2345 		hammer2_chain_drop(parent);
2346 	}
2347 }
2348 
2349 /*
2350  * Take the locked chain and return a locked parent.  The chain remains
2351  * locked on return, but may have to be temporarily unlocked to acquire
2352  * the parent.  Because of this, (chain) must be stable and cannot be
2353  * deleted while it was temporarily unlocked (typically means that (chain)
2354  * is an inode).
2355  *
2356  * Pass HAMMER2_RESOLVE_* flags in flags.
2357  *
2358  * This will work even if the chain is errored, and the caller can check
2359  * parent->error on return if desired since the parent will be locked.
2360  *
2361  * This function handles the lock order reversal.
2362  */
hammer2_chain_t *
hammer2_chain_getparent(hammer2_chain_t *chain, int flags)
{
	hammer2_chain_t *parent;

	/*
	 * Be careful of order, chain must be unlocked before parent
	 * is locked below to avoid a deadlock.  Try it trivially first
	 * (the NONBLOCK attempt avoids dropping chain's lock at all when
	 * the parent lock is uncontested).
	 */
	parent = chain->parent;
	if (parent == NULL)
		panic("hammer2_chain_getparent: no parent");
	hammer2_chain_ref(parent);
	if (hammer2_chain_lock(parent, flags|HAMMER2_RESOLVE_NONBLOCK) == 0)
		return parent;

	for (;;) {
		/*
		 * Lock-order reversal: release chain, lock parent, then
		 * re-lock chain.
		 */
		hammer2_chain_unlock(chain);
		hammer2_chain_lock(parent, flags);
		hammer2_chain_lock(chain, flags);

		/*
		 * Parent relinking races are quite common.  We have to get
		 * it right or we will blow up the block table.
		 */
		if (chain->parent == parent)
			break;
		/* raced; drop the stale parent and retry with the new one */
		hammer2_chain_unlock(parent);
		hammer2_chain_drop(parent);
		cpu_ccfence();
		parent = chain->parent;
		if (parent == NULL)
			panic("hammer2_chain_getparent: no parent");
		hammer2_chain_ref(parent);
	}
	return parent;
}
2400 
2401 /*
2402  * Take the locked chain and return a locked parent.  The chain is unlocked
2403  * and dropped.  *chainp is set to the returned parent as a convenience.
2404  * Pass HAMMER2_RESOLVE_* flags in flags.
2405  *
2406  * This will work even if the chain is errored, and the caller can check
2407  * parent->error on return if desired since the parent will be locked.
2408  *
2409  * The chain does NOT need to be stable.  We use a tracking structure
2410  * to track the expected parent if the chain is deleted out from under us.
2411  *
2412  * This function handles the lock order reversal.
2413  */
hammer2_chain_t *
hammer2_chain_repparent(hammer2_chain_t **chainp, int flags)
{
	hammer2_chain_t *chain;
	hammer2_chain_t *parent;
	struct hammer2_reptrack reptrack;
	struct hammer2_reptrack **repp;

	/*
	 * Be careful of order, chain must be unlocked before parent
	 * is locked below to avoid a deadlock.  Try it trivially first.
	 */
	chain = *chainp;
	parent = chain->parent;
	if (parent == NULL) {
		/*
		 * NOTE(review): no matching spin_ex is visible on this
		 * path before the unex below -- confirm against the
		 * caller's locking contract (we panic anyway).
		 */
		hammer2_spin_unex(&chain->core.spin);
		panic("hammer2_chain_repparent: no parent");
	}
	hammer2_chain_ref(parent);
	if (hammer2_chain_lock(parent, flags|HAMMER2_RESOLVE_NONBLOCK) == 0) {
		/* fast path: got the parent without dropping chain's lock */
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		*chainp = parent;

		return parent;
	}

	/*
	 * Ok, now it gets a bit nasty.  There are multiple situations where
	 * the parent might be in the middle of a deletion, or where the child
	 * (chain) might be deleted the instant we let go of its lock.
	 * We can potentially end up in a no-win situation!
	 *
	 * In particular, the indirect_maintenance() case can cause these
	 * situations.
	 *
	 * To deal with this we install a reptrack structure in the parent
	 * This reptrack structure 'owns' the parent ref and will automatically
	 * migrate to the parent's parent if the parent is deleted permanently.
	 */
	hammer2_spin_init(&reptrack.spin, "h2reptrk");
	reptrack.chain = parent;
	hammer2_chain_ref(parent);		/* for the reptrack */

	/* publish the reptrack at the head of the parent's list */
	hammer2_spin_ex(&parent->core.spin);
	reptrack.next = parent->core.reptrack;
	parent->core.reptrack = &reptrack;
	hammer2_spin_unex(&parent->core.spin);

	hammer2_chain_unlock(chain);
	hammer2_chain_drop(chain);
	chain = NULL;	/* gone */

	/*
	 * At the top of this loop, chain is gone and parent is refd both
	 * by us explicitly AND via our reptrack.  We are attempting to
	 * lock parent.
	 */
	for (;;) {
		hammer2_chain_lock(parent, flags);

		if (reptrack.chain == parent)
			break;
		/* reptrack migrated while we blocked; chase the new parent */
		hammer2_chain_unlock(parent);
		hammer2_chain_drop(parent);

		kprintf("hammer2: debug REPTRACK %p->%p\n",
			parent, reptrack.chain);
		hammer2_spin_ex(&reptrack.spin);
		parent = reptrack.chain;
		hammer2_chain_ref(parent);
		hammer2_spin_unex(&reptrack.spin);
	}

	/*
	 * Once parent is locked and matches our reptrack, our reptrack
	 * will be stable and we have our parent.  We can unlink our
	 * reptrack.
	 *
	 * WARNING!  Remember that the chain lock might be shared.  Chains
	 *	     locked shared have stable parent linkages.
	 */
	hammer2_spin_ex(&parent->core.spin);
	repp = &parent->core.reptrack;
	while (*repp != &reptrack)
		repp = &(*repp)->next;
	*repp = reptrack.next;
	hammer2_spin_unex(&parent->core.spin);

	hammer2_chain_drop(parent);	/* reptrack ref */
	*chainp = parent;		/* return parent lock+ref */

	return parent;
}
2508 
2509 /*
2510  * Dispose of any linked reptrack structures in (chain) by shifting them to
2511  * (parent).  Both (chain) and (parent) must be exclusively locked.
2512  *
2513  * This is interlocked against any children of (chain) on the other side.
 * No children may remain as of when this is called, so we can test
 * core.reptrack without holding the spin-lock.
2516  *
2517  * Used whenever the caller intends to permanently delete chains related
2518  * to topological recursions (BREF_TYPE_INDIRECT, BREF_TYPE_FREEMAP_NODE),
2519  * where the chains underneath the node being deleted are given a new parent
2520  * above the node being deleted.
2521  */
2522 static
2523 void
2524 hammer2_chain_repchange(hammer2_chain_t *parent, hammer2_chain_t *chain)
2525 {
2526 	struct hammer2_reptrack *reptrack;
2527 
2528 	KKASSERT(chain->core.live_count == 0 && RB_EMPTY(&chain->core.rbtree));
2529 	while (chain->core.reptrack) {
2530 		hammer2_spin_ex(&parent->core.spin);
2531 		hammer2_spin_ex(&chain->core.spin);
2532 		reptrack = chain->core.reptrack;
2533 		if (reptrack == NULL) {
2534 			hammer2_spin_unex(&chain->core.spin);
2535 			hammer2_spin_unex(&parent->core.spin);
2536 			break;
2537 		}
2538 		hammer2_spin_ex(&reptrack->spin);
2539 		chain->core.reptrack = reptrack->next;
2540 		reptrack->chain = parent;
2541 		reptrack->next = parent->core.reptrack;
2542 		parent->core.reptrack = reptrack;
2543 		hammer2_chain_ref(parent);		/* reptrack */
2544 
2545 		hammer2_spin_unex(&chain->core.spin);
2546 		hammer2_spin_unex(&parent->core.spin);
2547 		kprintf("hammer2: debug repchange %p %p->%p\n",
2548 			reptrack, chain, parent);
2549 		hammer2_chain_drop(chain);		/* reptrack */
2550 	}
2551 }
2552 
2553 /*
 * Locate the first chain whose key range overlaps (key_beg, key_end) inclusive.
2555  * (*parentp) typically points to an inode but can also point to a related
2556  * indirect block and this function will recurse upwards and find the inode
2557  * or the nearest undeleted indirect block covering the key range.
2558  *
2559  * This function unconditionally sets *errorp, replacing any previous value.
2560  *
2561  * (*parentp) must be exclusive or shared locked (depending on flags) and
2562  * referenced and can be an inode or an existing indirect block within the
2563  * inode.
2564  *
2565  * If (*parent) is errored out, this function will not attempt to recurse
2566  * the radix tree and will return NULL along with an appropriate *errorp.
2567  * If NULL is returned and *errorp is 0, the requested lookup could not be
2568  * located.
2569  *
2570  * On return (*parentp) will be modified to point at the deepest parent chain
2571  * element encountered during the search, as a helper for an insertion or
2572  * deletion.
2573  *
2574  * The new (*parentp) will be locked shared or exclusive (depending on flags),
2575  * and referenced, and the old will be unlocked and dereferenced (no change
2576  * if they are both the same).  This is particularly important if the caller
2577  * wishes to insert a new chain, (*parentp) will be set properly even if NULL
2578  * is returned, as long as no error occurred.
2579  *
2580  * The matching chain will be returned locked according to flags.
2581  *
2582  * --
2583  *
2584  * NULL is returned if no match was found, but (*parentp) will still
2585  * potentially be adjusted.
2586  *
2587  * On return (*key_nextp) will point to an iterative value for key_beg.
2588  * (If NULL is returned (*key_nextp) is set to (key_end + 1)).
2589  *
2590  * This function will also recurse up the chain if the key is not within the
2591  * current parent's range.  (*parentp) can never be set to NULL.  An iteration
2592  * can simply allow (*parentp) to float inside the loop.
2593  *
2594  * NOTE!  chain->data is not always resolved.  By default it will not be
2595  *	  resolved for BREF_TYPE_DATA, FREEMAP_NODE, or FREEMAP_LEAF.  Use
2596  *	  HAMMER2_LOOKUP_ALWAYS to force resolution (but be careful w/
2597  *	  BREF_TYPE_DATA as the device buffer can alias the logical file
2598  *	  buffer).
2599  */
2600 hammer2_chain_t *
2601 hammer2_chain_lookup(hammer2_chain_t **parentp, hammer2_key_t *key_nextp,
2602 		     hammer2_key_t key_beg, hammer2_key_t key_end,
2603 		     int *errorp, int flags)
2604 {
2605 	hammer2_chain_t *parent;
2606 	hammer2_chain_t *chain;
2607 	hammer2_blockref_t *base;
2608 	hammer2_blockref_t *bref;
2609 	hammer2_blockref_t bsave;
2610 	hammer2_key_t scan_beg;
2611 	hammer2_key_t scan_end;
2612 	int count = 0;
2613 	int how_always = HAMMER2_RESOLVE_ALWAYS;
2614 	int how_maybe = HAMMER2_RESOLVE_MAYBE;
2615 	int how;
2616 	int generation;
2617 	int maxloops = 300000;
2618 
2619 	if (flags & HAMMER2_LOOKUP_ALWAYS) {
2620 		how_maybe = how_always;
2621 		how = HAMMER2_RESOLVE_ALWAYS;
2622 	} else if (flags & HAMMER2_LOOKUP_NODATA) {
2623 		how = HAMMER2_RESOLVE_NEVER;
2624 	} else {
2625 		how = HAMMER2_RESOLVE_MAYBE;
2626 	}
2627 	if (flags & HAMMER2_LOOKUP_SHARED) {
2628 		how_maybe |= HAMMER2_RESOLVE_SHARED;
2629 		how_always |= HAMMER2_RESOLVE_SHARED;
2630 		how |= HAMMER2_RESOLVE_SHARED;
2631 	}
2632 
2633 	/*
2634 	 * Recurse (*parentp) upward if necessary until the parent completely
2635 	 * encloses the key range or we hit the inode.
2636 	 *
2637 	 * Handle races against the flusher deleting indirect nodes on its
2638 	 * way back up by continuing to recurse upward past the deletion.
2639 	 */
2640 	parent = *parentp;
2641 	*errorp = 0;
2642 
2643 	while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
2644 	       parent->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
2645 		scan_beg = parent->bref.key;
2646 		scan_end = scan_beg +
2647 			   ((hammer2_key_t)1 << parent->bref.keybits) - 1;
2648 		if ((parent->flags & HAMMER2_CHAIN_DELETED) == 0) {
2649 			if (key_beg >= scan_beg && key_end <= scan_end)
2650 				break;
2651 		}
2652 		parent = hammer2_chain_repparent(parentp, how_maybe);
2653 	}
2654 again:
2655 	if (--maxloops == 0)
2656 		panic("hammer2_chain_lookup: maxloops");
2657 
2658 	/*
2659 	 * MATCHIND case that does not require parent->data (do prior to
2660 	 * parent->error check).
2661 	 */
2662 	switch(parent->bref.type) {
2663 	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
2664 	case HAMMER2_BREF_TYPE_INDIRECT:
2665 		if (flags & HAMMER2_LOOKUP_MATCHIND) {
2666 			scan_beg = parent->bref.key;
2667 			scan_end = scan_beg +
2668 			       ((hammer2_key_t)1 << parent->bref.keybits) - 1;
2669 			if (key_beg == scan_beg && key_end == scan_end) {
2670 				chain = parent;
2671 				hammer2_chain_ref(chain);
2672 				hammer2_chain_lock(chain, how_maybe);
2673 				*key_nextp = scan_end + 1;
2674 				goto done;
2675 			}
2676 		}
2677 		break;
2678 	default:
2679 		break;
2680 	}
2681 
2682 	/*
2683 	 * No lookup is possible if the parent is errored.  We delayed
2684 	 * this check as long as we could to ensure that the parent backup,
2685 	 * embedded data, and MATCHIND code could still execute.
2686 	 */
2687 	if (parent->error) {
2688 		*errorp = parent->error;
2689 		return NULL;
2690 	}
2691 
2692 	/*
2693 	 * Locate the blockref array.  Currently we do a fully associative
2694 	 * search through the array.
2695 	 */
2696 	switch(parent->bref.type) {
2697 	case HAMMER2_BREF_TYPE_INODE:
2698 		/*
2699 		 * Special shortcut for embedded data returns the inode
2700 		 * itself.  Callers must detect this condition and access
2701 		 * the embedded data (the strategy code does this for us).
2702 		 *
2703 		 * This is only applicable to regular files and softlinks.
2704 		 *
2705 		 * We need a second lock on parent.  Since we already have
2706 		 * a lock we must pass LOCKAGAIN to prevent unexpected
2707 		 * blocking (we don't want to block on a second shared
2708 		 * ref if an exclusive lock is pending)
2709 		 */
2710 		if (parent->data->ipdata.meta.op_flags &
2711 		    HAMMER2_OPFLAG_DIRECTDATA) {
2712 			if (flags & HAMMER2_LOOKUP_NODIRECT) {
2713 				chain = NULL;
2714 				*key_nextp = key_end + 1;
2715 				goto done;
2716 			}
2717 			hammer2_chain_ref(parent);
2718 			hammer2_chain_lock(parent, how_always |
2719 						   HAMMER2_RESOLVE_LOCKAGAIN);
2720 			*key_nextp = key_end + 1;
2721 			return (parent);
2722 		}
2723 		base = &parent->data->ipdata.u.blockset.blockref[0];
2724 		count = HAMMER2_SET_COUNT;
2725 		break;
2726 	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
2727 	case HAMMER2_BREF_TYPE_INDIRECT:
2728 		/*
2729 		 * Optimize indirect blocks in the INITIAL state to avoid
2730 		 * I/O.
2731 		 *
2732 		 * Debugging: Enter permanent wait state instead of
2733 		 * panicing on unexpectedly NULL data for the moment.
2734 		 */
2735 		if (parent->flags & HAMMER2_CHAIN_INITIAL) {
2736 			base = NULL;
2737 		} else {
2738 			if (parent->data == NULL) {
2739 				kprintf("hammer2: unexpected NULL data "
2740 					"on %p\n", parent);
2741 				while (1)
2742 					tsleep(parent, 0, "xxx", 0);
2743 			}
2744 			base = &parent->data->npdata[0];
2745 		}
2746 		count = parent->bytes / sizeof(hammer2_blockref_t);
2747 		break;
2748 	case HAMMER2_BREF_TYPE_VOLUME:
2749 		base = &parent->data->voldata.sroot_blockset.blockref[0];
2750 		count = HAMMER2_SET_COUNT;
2751 		break;
2752 	case HAMMER2_BREF_TYPE_FREEMAP:
2753 		base = &parent->data->blkset.blockref[0];
2754 		count = HAMMER2_SET_COUNT;
2755 		break;
2756 	default:
2757 		panic("hammer2_chain_lookup: unrecognized "
2758 		      "blockref(B) type: %d",
2759 		      parent->bref.type);
2760 		base = NULL;	/* safety */
2761 		count = 0;	/* safety */
2762 		break;
2763 	}
2764 
2765 	/*
2766 	 * Merged scan to find next candidate.
2767 	 *
2768 	 * hammer2_base_*() functions require the parent->core.live_* fields
2769 	 * to be synchronized.
2770 	 *
2771 	 * We need to hold the spinlock to access the block array and RB tree
2772 	 * and to interlock chain creation.
2773 	 */
2774 	if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0)
2775 		hammer2_chain_countbrefs(parent, base, count);
2776 
2777 	/*
2778 	 * Combined search
2779 	 */
2780 	hammer2_spin_ex(&parent->core.spin);
2781 	chain = hammer2_combined_find(parent, base, count,
2782 				      key_nextp,
2783 				      key_beg, key_end,
2784 				      &bref);
2785 	generation = parent->core.generation;
2786 
2787 	/*
2788 	 * Exhausted parent chain, iterate.
2789 	 */
2790 	if (bref == NULL) {
2791 		KKASSERT(chain == NULL);
2792 		hammer2_spin_unex(&parent->core.spin);
2793 		if (key_beg == key_end)	/* short cut single-key case */
2794 			return (NULL);
2795 
2796 		/*
2797 		 * Stop if we reached the end of the iteration.
2798 		 */
2799 		if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT &&
2800 		    parent->bref.type != HAMMER2_BREF_TYPE_FREEMAP_NODE) {
2801 			return (NULL);
2802 		}
2803 
2804 		/*
2805 		 * Calculate next key, stop if we reached the end of the
2806 		 * iteration, otherwise go up one level and loop.
2807 		 */
2808 		key_beg = parent->bref.key +
2809 			  ((hammer2_key_t)1 << parent->bref.keybits);
2810 		if (key_beg == 0 || key_beg > key_end)
2811 			return (NULL);
2812 		parent = hammer2_chain_repparent(parentp, how_maybe);
2813 		goto again;
2814 	}
2815 
2816 	/*
2817 	 * Selected from blockref or in-memory chain.
2818 	 */
2819 	bsave = *bref;
2820 	if (chain == NULL) {
2821 		hammer2_spin_unex(&parent->core.spin);
2822 		if (bsave.type == HAMMER2_BREF_TYPE_INDIRECT ||
2823 		    bsave.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
2824 			chain = hammer2_chain_get(parent, generation,
2825 						  &bsave, how_maybe);
2826 		} else {
2827 			chain = hammer2_chain_get(parent, generation,
2828 						  &bsave, how);
2829 		}
2830 		if (chain == NULL)
2831 			goto again;
2832 	} else {
2833 		hammer2_chain_ref(chain);
2834 		hammer2_spin_unex(&parent->core.spin);
2835 
2836 		/*
2837 		 * chain is referenced but not locked.  We must lock the
2838 		 * chain to obtain definitive state.
2839 		 */
2840 		if (bsave.type == HAMMER2_BREF_TYPE_INDIRECT ||
2841 		    bsave.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
2842 			hammer2_chain_lock(chain, how_maybe);
2843 		} else {
2844 			hammer2_chain_lock(chain, how);
2845 		}
2846 		KKASSERT(chain->parent == parent);
2847 	}
2848 	if (bcmp(&bsave, &chain->bref, sizeof(bsave)) ||
2849 	    chain->parent != parent) {
2850 		hammer2_chain_unlock(chain);
2851 		hammer2_chain_drop(chain);
2852 		chain = NULL;	/* SAFETY */
2853 		goto again;
2854 	}
2855 
2856 
2857 	/*
2858 	 * Skip deleted chains (XXX cache 'i' end-of-block-array? XXX)
2859 	 *
2860 	 * NOTE: Chain's key range is not relevant as there might be
2861 	 *	 one-offs within the range that are not deleted.
2862 	 *
2863 	 * NOTE: Lookups can race delete-duplicate because
2864 	 *	 delete-duplicate does not lock the parent's core
2865 	 *	 (they just use the spinlock on the core).
2866 	 */
2867 	if (chain->flags & HAMMER2_CHAIN_DELETED) {
2868 		kprintf("skip deleted chain %016jx.%02x key=%016jx\n",
2869 			chain->bref.data_off, chain->bref.type,
2870 			chain->bref.key);
2871 		hammer2_chain_unlock(chain);
2872 		hammer2_chain_drop(chain);
2873 		chain = NULL;	/* SAFETY */
2874 		key_beg = *key_nextp;
2875 		if (key_beg == 0 || key_beg > key_end)
2876 			return(NULL);
2877 		goto again;
2878 	}
2879 
2880 	/*
2881 	 * If the chain element is an indirect block it becomes the new
2882 	 * parent and we loop on it.  We must maintain our top-down locks
2883 	 * to prevent the flusher from interfering (i.e. doing a
2884 	 * delete-duplicate and leaving us recursing down a deleted chain).
2885 	 *
2886 	 * The parent always has to be locked with at least RESOLVE_MAYBE
2887 	 * so we can access its data.  It might need a fixup if the caller
2888 	 * passed incompatible flags.  Be careful not to cause a deadlock
2889 	 * as a data-load requires an exclusive lock.
2890 	 *
2891 	 * If HAMMER2_LOOKUP_MATCHIND is set and the indirect block's key
2892 	 * range is within the requested key range we return the indirect
2893 	 * block and do NOT loop.  This is usually only used to acquire
2894 	 * freemap nodes.
2895 	 */
2896 	if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
2897 	    chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
2898 		hammer2_chain_unlock(parent);
2899 		hammer2_chain_drop(parent);
2900 		*parentp = parent = chain;
2901 		chain = NULL;	/* SAFETY */
2902 		goto again;
2903 	}
2904 done:
2905 	/*
2906 	 * All done, return the locked chain.
2907 	 *
2908 	 * If the caller does not want a locked chain, replace the lock with
2909 	 * a ref.  Perhaps this can eventually be optimized to not obtain the
2910 	 * lock in the first place for situations where the data does not
2911 	 * need to be resolved.
2912 	 *
2913 	 * NOTE! A chain->error must be tested by the caller upon return.
2914 	 *	 *errorp is only set based on issues which occur while
2915 	 *	 trying to reach the chain.
2916 	 */
2917 	return (chain);
2918 }
2919 
2920 /*
2921  * After having issued a lookup we can iterate all matching keys.
2922  *
 * If chain is non-NULL we continue the iteration from just after its index.
2924  *
2925  * If chain is NULL we assume the parent was exhausted and continue the
2926  * iteration at the next parent.
2927  *
2928  * If a fatal error occurs (typically an I/O error), a dummy chain is
2929  * returned with chain->error and error-identifying information set.  This
2930  * chain will assert if you try to do anything fancy with it.
2931  *
2932  * XXX Depending on where the error occurs we should allow continued iteration.
2933  *
2934  * parent must be locked on entry and remains locked throughout.  chain's
2935  * lock status must match flags.  Chain is always at least referenced.
2936  *
2937  * WARNING!  The MATCHIND flag does not apply to this function.
2938  */
hammer2_chain_t *
hammer2_chain_next(hammer2_chain_t **parentp, hammer2_chain_t *chain,
		   hammer2_key_t *key_nextp,
		   hammer2_key_t key_beg, hammer2_key_t key_end,
		   int *errorp, int flags)
{
	hammer2_chain_t *parent;
	int how_maybe;

	/*
	 * Calculate locking flags for upward recursion.
	 */
	how_maybe = HAMMER2_RESOLVE_MAYBE;
	if (flags & HAMMER2_LOOKUP_SHARED)
		how_maybe |= HAMMER2_RESOLVE_SHARED;

	parent = *parentp;
	*errorp = 0;

	/*
	 * Calculate the next index and recalculate the parent if necessary.
	 */
	if (chain) {
		/*
		 * Continue one key past the previous element.  The shift
		 * can overflow to 0, which is checked below as the
		 * end-of-keyspace condition.
		 */
		key_beg = chain->bref.key +
			  ((hammer2_key_t)1 << chain->bref.keybits);
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);

		/*
		 * chain invalid past this point, but we can still do a
		 * pointer comparison w/parent.
		 *
		 * Any scan where the lookup returned degenerate data embedded
		 * in the inode has an invalid index and must terminate.
		 */
		if (chain == parent)
			return(NULL);
		/* key_beg == 0 means the 64-bit key space wrapped */
		if (key_beg == 0 || key_beg > key_end)
			return(NULL);
		chain = NULL;
	} else if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT &&
		   parent->bref.type != HAMMER2_BREF_TYPE_FREEMAP_NODE) {
		/*
		 * We reached the end of the iteration.
		 */
		return (NULL);
	} else {
		/*
		 * Continue iteration with next parent unless the current
		 * parent covers the range.
		 *
		 * (This also handles the case of a deleted, empty indirect
		 * node).
		 */
		key_beg = parent->bref.key +
			  ((hammer2_key_t)1 << parent->bref.keybits);
		if (key_beg == 0 || key_beg > key_end)
			return (NULL);
		/* replaces *parentp with its (locked) parent chain */
		parent = hammer2_chain_repparent(parentp, how_maybe);
	}

	/*
	 * And execute
	 */
	return (hammer2_chain_lookup(parentp, key_nextp,
				     key_beg, key_end,
				     errorp, flags));
}
3007 
3008 /*
3009  * Caller wishes to iterate chains under parent, loading new chains into
3010  * chainp.  Caller must initialize *chainp to NULL and *firstp to 1, and
3011  * then call hammer2_chain_scan() repeatedly until a non-zero return.
3012  * During the scan, *firstp will be set to 0 and (*chainp) will be replaced
3013  * with the returned chain for the scan.  The returned *chainp will be
3014  * locked and referenced.  Any prior contents will be unlocked and dropped.
3015  *
3016  * Caller should check the return value.  A normal scan EOF will return
3017  * exactly HAMMER2_ERROR_EOF.  Any other non-zero value indicates an
3018  * error trying to access parent data.  Any error in the returned chain
3019  * must be tested separately by the caller.
3020  *
3021  * (*chainp) is dropped on each scan, but will only be set if the returned
3022  * element itself can recurse.  Leaf elements are NOT resolved, loaded, or
3023  * returned via *chainp.  The caller will get their bref only.
3024  *
3025  * The raw scan function is similar to lookup/next but does not seek to a key.
3026  * Blockrefs are iterated via first_bref = (parent, NULL) and
3027  * next_chain = (parent, bref).
3028  *
3029  * The passed-in parent must be locked and its data resolved.  The function
3030  * nominally returns a locked and referenced *chainp != NULL for chains
 * the caller might need to recurse on (and will dispose of any *chainp passed
3032  * in).  The caller must check the chain->bref.type either way.
3033  */
int
hammer2_chain_scan(hammer2_chain_t *parent, hammer2_chain_t **chainp,
		   hammer2_blockref_t *bref, int *firstp,
		   int flags)
{
	hammer2_blockref_t *base;
	hammer2_blockref_t *bref_ptr;
	hammer2_key_t key;
	hammer2_key_t next_key;
	hammer2_chain_t *chain = NULL;
	int count = 0;
	int how;
	int generation;
	int maxloops = 300000;
	int error;

	error = 0;

	/*
	 * Scan flags borrowed from lookup.
	 */
	if (flags & HAMMER2_LOOKUP_ALWAYS) {
		how = HAMMER2_RESOLVE_ALWAYS;
	} else if (flags & HAMMER2_LOOKUP_NODATA) {
		how = HAMMER2_RESOLVE_NEVER;
	} else {
		how = HAMMER2_RESOLVE_MAYBE;
	}
	if (flags & HAMMER2_LOOKUP_SHARED) {
		how |= HAMMER2_RESOLVE_SHARED;
	}

	/*
	 * Calculate key to locate first/next element, unlocking the previous
	 * element as we go.  Be careful, the key calculation can overflow.
	 */
	if (*firstp) {
		/* First call: start at the beginning of the key space. */
		key = 0;
		*firstp = 0;
	} else {
		/*
		 * Continuation: advance one key past the previous bref and
		 * drop any chain returned by the prior iteration.
		 */
		key = bref->key + ((hammer2_key_t)1 << bref->keybits);
		if ((chain = *chainp) != NULL) {
			*chainp = NULL;
			hammer2_chain_unlock(chain);
			hammer2_chain_drop(chain);
			chain = NULL;
		}
		/* key == 0 means the 64-bit key space wrapped: scan done */
		if (key == 0) {
			error |= HAMMER2_ERROR_EOF;
			goto done;
		}
	}

again:
	if (parent->error) {
		error = parent->error;
		goto done;
	}
	if (--maxloops == 0)
		panic("hammer2_chain_scan: maxloops");

	/*
	 * Locate the blockref array.  Currently we do a fully associative
	 * search through the array.
	 */
	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		/*
		 * An inode with embedded data has no sub-chains.
		 *
		 * WARNING! Bulk scan code may pass a static chain marked
		 *	    as BREF_TYPE_INODE with a copy of the volume
		 *	    root blockset to snapshot the volume.
		 */
		if (parent->data->ipdata.meta.op_flags &
		    HAMMER2_OPFLAG_DIRECTDATA) {
			error |= HAMMER2_ERROR_EOF;
			goto done;
		}
		base = &parent->data->ipdata.u.blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_INDIRECT:
		/*
		 * Optimize indirect blocks in the INITIAL state to avoid
		 * I/O.  base == NULL tells the combined find that there is
		 * no on-media array to merge with the in-memory RB tree.
		 */
		if (parent->flags & HAMMER2_CHAIN_INITIAL) {
			base = NULL;
		} else {
			if (parent->data == NULL)
				panic("parent->data is NULL");
			base = &parent->data->npdata[0];
		}
		count = parent->bytes / sizeof(hammer2_blockref_t);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		base = &parent->data->voldata.sroot_blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP:
		base = &parent->data->blkset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	default:
		panic("hammer2_chain_scan: unrecognized blockref type: %d",
		      parent->bref.type);
		base = NULL;	/* safety */
		count = 0;	/* safety */
		break;
	}

	/*
	 * Merged scan to find next candidate.
	 *
	 * hammer2_base_*() functions require the parent->core.live_* fields
	 * to be synchronized.
	 *
	 * We need to hold the spinlock to access the block array and RB tree
	 * and to interlock chain creation.
	 */
	if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0)
		hammer2_chain_countbrefs(parent, base, count);

	next_key = 0;
	bref_ptr = NULL;
	hammer2_spin_ex(&parent->core.spin);
	chain = hammer2_combined_find(parent, base, count,
				      &next_key,
				      key, HAMMER2_KEY_MAX,
				      &bref_ptr);
	/*
	 * Record the parent's generation under the spinlock so a later
	 * hammer2_chain_get() can detect racing insertions/deletions.
	 */
	generation = parent->core.generation;

	/*
	 * Exhausted parent chain, we're done.
	 */
	if (bref_ptr == NULL) {
		hammer2_spin_unex(&parent->core.spin);
		KKASSERT(chain == NULL);
		error |= HAMMER2_ERROR_EOF;
		goto done;
	}

	/*
	 * Copy into the supplied stack-based blockref.
	 */
	*bref = *bref_ptr;

	/*
	 * Selected from blockref or in-memory chain.
	 */
	if (chain == NULL) {
		switch(bref->type) {
		case HAMMER2_BREF_TYPE_INODE:
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		case HAMMER2_BREF_TYPE_INDIRECT:
		case HAMMER2_BREF_TYPE_VOLUME:
		case HAMMER2_BREF_TYPE_FREEMAP:
			/*
			 * Recursion, always get the chain
			 */
			hammer2_spin_unex(&parent->core.spin);
			chain = hammer2_chain_get(parent, generation,
						  bref, how);
			/* NULL means we raced a topology change; retry */
			if (chain == NULL)
				goto again;
			break;
		default:
			/*
			 * No recursion, do not waste time instantiating
			 * a chain, just iterate using the bref.
			 */
			hammer2_spin_unex(&parent->core.spin);
			break;
		}
	} else {
		/*
		 * Recursion or not we need the chain in order to supply
		 * the bref.
		 */
		hammer2_chain_ref(chain);
		hammer2_spin_unex(&parent->core.spin);
		hammer2_chain_lock(chain, how);
	}
	/*
	 * Now that the chain is locked, re-validate that it still matches
	 * the bref we selected and is still under our parent; otherwise a
	 * racing operation moved it and we must retry the scan.
	 */
	if (chain &&
	    (bcmp(bref, &chain->bref, sizeof(*bref)) ||
	     chain->parent != parent)) {
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		chain = NULL;
		goto again;
	}

	/*
	 * Skip deleted chains (XXX cache 'i' end-of-block-array? XXX)
	 *
	 * NOTE: chain's key range is not relevant as there might be
	 *	 one-offs within the range that are not deleted.
	 *
	 * NOTE: XXX this could create problems with scans used in
	 *	 situations other than mount-time recovery.
	 *
	 * NOTE: Lookups can race delete-duplicate because
	 *	 delete-duplicate does not lock the parent's core
	 *	 (they just use the spinlock on the core).
	 */
	if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		chain = NULL;

		key = next_key;
		if (key == 0) {
			error |= HAMMER2_ERROR_EOF;
			goto done;
		}
		goto again;
	}

done:
	/*
	 * All done, return the bref or NULL, supply chain if necessary.
	 */
	if (chain)
		*chainp = chain;
	return (error);
}
3264 
3265 /*
3266  * Create and return a new hammer2 system memory structure of the specified
3267  * key, type and size and insert it under (*parentp).  This is a full
3268  * insertion, based on the supplied key/keybits, and may involve creating
3269  * indirect blocks and moving other chains around via delete/duplicate.
3270  *
3271  * This call can be made with parent == NULL as long as a non -1 methods
3272  * is supplied.  hmp must also be supplied in this situation (otherwise
3273  * hmp is extracted from the supplied parent).  The chain will be detached
3274  * from the topology.  A later call with both parent and chain can be made
3275  * to attach it.
3276  *
3277  * THE CALLER MUST HAVE ALREADY PROPERLY SEEKED (*parentp) TO THE INSERTION
3278  * POINT SANS ANY REQUIRED INDIRECT BLOCK CREATIONS DUE TO THE ARRAY BEING
3279  * FULL.  This typically means that the caller is creating the chain after
3280  * doing a hammer2_chain_lookup().
3281  *
3282  * (*parentp) must be exclusive locked and may be replaced on return
3283  * depending on how much work the function had to do.
3284  *
3285  * (*parentp) must not be errored or this function will assert.
3286  *
3287  * (*chainp) usually starts out NULL and returns the newly created chain,
3288  * but if the caller desires the caller may allocate a disconnected chain
3289  * and pass it in instead.
3290  *
3291  * This function should NOT be used to insert INDIRECT blocks.  It is
3292  * typically used to create/insert inodes and data blocks.
3293  *
3294  * Caller must pass-in an exclusively locked parent the new chain is to
3295  * be inserted under, and optionally pass-in a disconnected, exclusively
3296  * locked chain to insert (else we create a new chain).  The function will
3297  * adjust (*parentp) as necessary, create or connect the chain, and
3298  * return an exclusively locked chain in *chainp.
3299  *
3300  * When creating a PFSROOT inode under the super-root, pmp is typically NULL
3301  * and will be reassigned.
3302  *
3303  * NOTE: returns HAMMER_ERROR_* flags
3304  */
3305 int
3306 hammer2_chain_create(hammer2_chain_t **parentp, hammer2_chain_t **chainp,
3307 		     hammer2_dev_t *hmp, hammer2_pfs_t *pmp, int methods,
3308 		     hammer2_key_t key, int keybits, int type, size_t bytes,
3309 		     hammer2_tid_t mtid, hammer2_off_t dedup_off, int flags)
3310 {
3311 	hammer2_chain_t *chain;
3312 	hammer2_chain_t *parent;
3313 	hammer2_blockref_t *base;
3314 	hammer2_blockref_t dummy;
3315 	int allocated = 0;
3316 	int error = 0;
3317 	int count;
3318 	int maxloops = 300000;
3319 
3320 	/*
3321 	 * Topology may be crossing a PFS boundary.
3322 	 */
3323 	parent = *parentp;
3324 	if (parent) {
3325 		KKASSERT(hammer2_mtx_owned(&parent->lock));
3326 		KKASSERT(parent->error == 0);
3327 		hmp = parent->hmp;
3328 	}
3329 	chain = *chainp;
3330 
3331 	if (chain == NULL) {
3332 		/*
3333 		 * First allocate media space and construct the dummy bref,
3334 		 * then allocate the in-memory chain structure.  Set the
3335 		 * INITIAL flag for fresh chains which do not have embedded
3336 		 * data.
3337 		 */
3338 		bzero(&dummy, sizeof(dummy));
3339 		dummy.type = type;
3340 		dummy.key = key;
3341 		dummy.keybits = keybits;
3342 		dummy.data_off = hammer2_getradix(bytes);
3343 
3344 		/*
3345 		 * Inherit methods from parent by default.  Primarily used
3346 		 * for BREF_TYPE_DATA.  Non-data types *must* be set to
3347 		 * a non-NONE check algorithm.
3348 		 */
3349 		if (methods == HAMMER2_METH_DEFAULT)
3350 			dummy.methods = parent->bref.methods;
3351 		else
3352 			dummy.methods = (uint8_t)methods;
3353 
3354 		if (type != HAMMER2_BREF_TYPE_DATA &&
3355 		    HAMMER2_DEC_CHECK(dummy.methods) == HAMMER2_CHECK_NONE) {
3356 			dummy.methods |=
3357 				HAMMER2_ENC_CHECK(HAMMER2_CHECK_DEFAULT);
3358 		}
3359 
3360 		chain = hammer2_chain_alloc(hmp, pmp, &dummy);
3361 
3362 		/*
3363 		 * Lock the chain manually, chain_lock will load the chain
3364 		 * which we do NOT want to do.  (note: chain->refs is set
3365 		 * to 1 by chain_alloc() for us, but lockcnt is not).
3366 		 */
3367 		chain->lockcnt = 1;
3368 		hammer2_mtx_ex(&chain->lock);
3369 		allocated = 1;
3370 
3371 		/*
3372 		 * Set INITIAL to optimize I/O.  The flag will generally be
3373 		 * processed when we call hammer2_chain_modify().
3374 		 */
3375 		switch(type) {
3376 		case HAMMER2_BREF_TYPE_VOLUME:
3377 		case HAMMER2_BREF_TYPE_FREEMAP:
3378 			panic("hammer2_chain_create: called with volume type");
3379 			break;
3380 		case HAMMER2_BREF_TYPE_INDIRECT:
3381 			panic("hammer2_chain_create: cannot be used to"
3382 			      "create indirect block");
3383 			break;
3384 		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
3385 			panic("hammer2_chain_create: cannot be used to"
3386 			      "create freemap root or node");
3387 			break;
3388 		case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
3389 			KKASSERT(bytes == sizeof(chain->data->bmdata));
3390 			/* fall through */
3391 		case HAMMER2_BREF_TYPE_DIRENT:
3392 		case HAMMER2_BREF_TYPE_INODE:
3393 		case HAMMER2_BREF_TYPE_DATA:
3394 		default:
3395 			/*
3396 			 * leave chain->data NULL, set INITIAL
3397 			 */
3398 			KKASSERT(chain->data == NULL);
3399 			atomic_set_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
3400 			break;
3401 		}
3402 	} else {
3403 		/*
3404 		 * We are reattaching a previously deleted chain, possibly
3405 		 * under a new parent and possibly with a new key/keybits.
3406 		 * The chain does not have to be in a modified state.  The
3407 		 * UPDATE flag will be set later on in this routine.
3408 		 *
3409 		 * Do NOT mess with the current state of the INITIAL flag.
3410 		 */
3411 		chain->bref.key = key;
3412 		chain->bref.keybits = keybits;
3413 		if (chain->flags & HAMMER2_CHAIN_DELETED)
3414 			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DELETED);
3415 		KKASSERT(chain->parent == NULL);
3416 	}
3417 
3418 	/*
3419 	 * Set the appropriate bref flag if requested.
3420 	 *
3421 	 * NOTE! Callers can call this function to move chains without
3422 	 *	 knowing about special flags, so don't clear bref flags
3423 	 *	 here!
3424 	 */
3425 	if (flags & HAMMER2_INSERT_PFSROOT)
3426 		chain->bref.flags |= HAMMER2_BREF_FLAG_PFSROOT;
3427 
3428 	if (parent == NULL)
3429 		goto skip;
3430 
3431 	/*
3432 	 * Calculate how many entries we have in the blockref array and
3433 	 * determine if an indirect block is required when inserting into
3434 	 * the parent.
3435 	 */
3436 again:
3437 	if (--maxloops == 0)
3438 		panic("hammer2_chain_create: maxloops");
3439 
3440 	switch(parent->bref.type) {
3441 	case HAMMER2_BREF_TYPE_INODE:
3442 		if ((parent->data->ipdata.meta.op_flags &
3443 		     HAMMER2_OPFLAG_DIRECTDATA) != 0) {
3444 			kprintf("hammer2: parent set for direct-data! "
3445 				"pkey=%016jx ckey=%016jx\n",
3446 				parent->bref.key,
3447 				chain->bref.key);
3448 	        }
3449 		KKASSERT((parent->data->ipdata.meta.op_flags &
3450 			  HAMMER2_OPFLAG_DIRECTDATA) == 0);
3451 		KKASSERT(parent->data != NULL);
3452 		base = &parent->data->ipdata.u.blockset.blockref[0];
3453 		count = HAMMER2_SET_COUNT;
3454 		break;
3455 	case HAMMER2_BREF_TYPE_INDIRECT:
3456 	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
3457 		if (parent->flags & HAMMER2_CHAIN_INITIAL)
3458 			base = NULL;
3459 		else
3460 			base = &parent->data->npdata[0];
3461 		count = parent->bytes / sizeof(hammer2_blockref_t);
3462 		break;
3463 	case HAMMER2_BREF_TYPE_VOLUME:
3464 		KKASSERT(parent->data != NULL);
3465 		base = &parent->data->voldata.sroot_blockset.blockref[0];
3466 		count = HAMMER2_SET_COUNT;
3467 		break;
3468 	case HAMMER2_BREF_TYPE_FREEMAP:
3469 		KKASSERT(parent->data != NULL);
3470 		base = &parent->data->blkset.blockref[0];
3471 		count = HAMMER2_SET_COUNT;
3472 		break;
3473 	default:
3474 		panic("hammer2_chain_create: unrecognized blockref type: %d",
3475 		      parent->bref.type);
3476 		base = NULL;
3477 		count = 0;
3478 		break;
3479 	}
3480 
3481 	/*
3482 	 * Make sure we've counted the brefs
3483 	 */
3484 	if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0)
3485 		hammer2_chain_countbrefs(parent, base, count);
3486 
3487 	KASSERT(parent->core.live_count >= 0 &&
3488 		parent->core.live_count <= count,
3489 		("bad live_count %d/%d (%02x, %d)",
3490 			parent->core.live_count, count,
3491 			parent->bref.type, parent->bytes));
3492 
3493 	/*
3494 	 * If no free blockref could be found we must create an indirect
3495 	 * block and move a number of blockrefs into it.  With the parent
3496 	 * locked we can safely lock each child in order to delete+duplicate
3497 	 * it without causing a deadlock.
3498 	 *
3499 	 * This may return the new indirect block or the old parent depending
3500 	 * on where the key falls.  NULL is returned on error.
3501 	 */
3502 	if (parent->core.live_count == count) {
3503 		hammer2_chain_t *nparent;
3504 
3505 		KKASSERT((flags & HAMMER2_INSERT_SAMEPARENT) == 0);
3506 
3507 		nparent = hammer2_chain_create_indirect(parent, key, keybits,
3508 							mtid, type, &error);
3509 		if (nparent == NULL) {
3510 			if (allocated)
3511 				hammer2_chain_drop(chain);
3512 			chain = NULL;
3513 			goto done;
3514 		}
3515 		if (parent != nparent) {
3516 			hammer2_chain_unlock(parent);
3517 			hammer2_chain_drop(parent);
3518 			parent = *parentp = nparent;
3519 		}
3520 		goto again;
3521 	}
3522 
3523 	/*
3524 	 * fall through if parent, or skip to here if no parent.
3525 	 */
3526 skip:
3527 	if (chain->flags & HAMMER2_CHAIN_DELETED)
3528 		kprintf("Inserting deleted chain @%016jx\n",
3529 			chain->bref.key);
3530 
3531 	/*
3532 	 * Link the chain into its parent.
3533 	 */
3534 	if (chain->parent != NULL)
3535 		panic("hammer2: hammer2_chain_create: chain already connected");
3536 	KKASSERT(chain->parent == NULL);
3537 	if (parent) {
3538 		KKASSERT(parent->core.live_count < count);
3539 		hammer2_chain_insert(parent, chain,
3540 				     HAMMER2_CHAIN_INSERT_SPIN |
3541 				     HAMMER2_CHAIN_INSERT_LIVE,
3542 				     0);
3543 	}
3544 
3545 	if (allocated) {
3546 		/*
3547 		 * Mark the newly created chain modified.  This will cause
3548 		 * UPDATE to be set and process the INITIAL flag.
3549 		 *
3550 		 * Device buffers are not instantiated for DATA elements
3551 		 * as these are handled by logical buffers.
3552 		 *
3553 		 * Indirect and freemap node indirect blocks are handled
3554 		 * by hammer2_chain_create_indirect() and not by this
3555 		 * function.
3556 		 *
3557 		 * Data for all other bref types is expected to be
3558 		 * instantiated (INODE, LEAF).
3559 		 */
3560 		switch(chain->bref.type) {
3561 		case HAMMER2_BREF_TYPE_DATA:
3562 		case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
3563 		case HAMMER2_BREF_TYPE_DIRENT:
3564 		case HAMMER2_BREF_TYPE_INODE:
3565 			error = hammer2_chain_modify(chain, mtid, dedup_off,
3566 						     HAMMER2_MODIFY_OPTDATA);
3567 			break;
3568 		default:
3569 			/*
3570 			 * Remaining types are not supported by this function.
3571 			 * In particular, INDIRECT and LEAF_NODE types are
3572 			 * handled by create_indirect().
3573 			 */
3574 			panic("hammer2_chain_create: bad type: %d",
3575 			      chain->bref.type);
3576 			/* NOT REACHED */
3577 			break;
3578 		}
3579 	} else {
3580 		/*
3581 		 * When reconnecting a chain we must set UPDATE and
3582 		 * setflush so the flush recognizes that it must update
3583 		 * the bref in the parent.
3584 		 */
3585 		if ((chain->flags & HAMMER2_CHAIN_UPDATE) == 0)
3586 			atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
3587 	}
3588 
3589 	/*
3590 	 * We must setflush(parent) to ensure that it recurses through to
3591 	 * chain.  setflush(chain) might not work because ONFLUSH is possibly
3592 	 * already set in the chain (so it won't recurse up to set it in the
3593 	 * parent).
3594 	 */
3595 	if (parent)
3596 		hammer2_chain_setflush(parent);
3597 
3598 done:
3599 	*chainp = chain;
3600 
3601 	return (error);
3602 }
3603 
3604 /*
3605  * Move the chain from its old parent to a new parent.  The chain must have
3606  * already been deleted or already disconnected (or never associated) with
3607  * a parent.  The chain is reassociated with the new parent and the deleted
3608  * flag will be cleared (no longer deleted).  The chain's modification state
3609  * is not altered.
3610  *
3611  * THE CALLER MUST HAVE ALREADY PROPERLY SEEKED (parent) TO THE INSERTION
3612  * POINT SANS ANY REQUIRED INDIRECT BLOCK CREATIONS DUE TO THE ARRAY BEING
3613  * FULL.  This typically means that the caller is creating the chain after
3614  * doing a hammer2_chain_lookup().
3615  *
3616  * Neither (parent) or (chain) can be errored.
3617  *
3618  * If (parent) is non-NULL then the chain is inserted under the parent.
3619  *
3620  * If (parent) is NULL then the newly duplicated chain is not inserted
3621  * anywhere, similar to if it had just been chain_alloc()'d (suitable for
3622  * passing into hammer2_chain_create() after this function returns).
3623  *
3624  * WARNING! This function calls create which means it can insert indirect
3625  *	    blocks.  This can cause other unrelated chains in the parent to
3626  *	    be moved to a newly inserted indirect block in addition to the
3627  *	    specific chain.
3628  */
void
hammer2_chain_rename(hammer2_chain_t **parentp, hammer2_chain_t *chain,
		     hammer2_tid_t mtid, int flags)
{
	hammer2_blockref_t *bref;
	hammer2_chain_t *parent;

	/*
	 * WARNING!  We should never resolve DATA to device buffers
	 *	     (XXX allow it if the caller did?), and since
	 *	     we currently do not have the logical buffer cache
	 *	     buffer in-hand to fix its cached physical offset
	 *	     we also force the modify code to not COW it. XXX
	 *
	 * NOTE!     We allow error'd chains to be renamed.  The bref itself
	 *	     is good and can be renamed.  The content, however, may
	 *	     be inaccessible.
	 */
	/* chain must already be disconnected from any parent */
	KKASSERT(chain->parent == NULL);
	/*KKASSERT(chain->error == 0); allow */
	bref = &chain->bref;

	/*
	 * If parent is not NULL the duplicated chain will be entered under
	 * the parent and the UPDATE bit set to tell flush to update
	 * the blockref.
	 *
	 * We must setflush(parent) to ensure that it recurses through to
	 * chain.  setflush(chain) might not work because ONFLUSH is possibly
	 * already set in the chain (so it won't recurse up to set it in the
	 * parent).
	 *
	 * Having both chains locked is extremely important for atomicity.
	 */
	if (parentp && (parent = *parentp) != NULL) {
		KKASSERT(hammer2_mtx_owned(&parent->lock));
		KKASSERT(parent->refs > 0);
		KKASSERT(parent->error == 0);

		/*
		 * Re-insert the chain under the (possibly new) parent,
		 * preserving its existing key/keybits/type from the bref.
		 */
		hammer2_chain_create(parentp, &chain, NULL, chain->pmp,
				     HAMMER2_METH_DEFAULT,
				     bref->key, bref->keybits, bref->type,
				     chain->bytes, mtid, 0, flags);
		KKASSERT(chain->flags & HAMMER2_CHAIN_UPDATE);
		hammer2_chain_setflush(*parentp);
	}
}
3676 
3677 /*
3678  * This works in tandem with delete_obref() to install a blockref in
3679  * (typically) an indirect block that is associated with the chain being
3680  * moved to *parentp.
3681  *
3682  * The reason we need this function is that the caller needs to maintain
3683  * the blockref as it was, and not generate a new blockref for what might
3684  * be a modified chain.  Otherwise stuff will leak into the flush that
3685  * the flush code's FLUSH_INODE_STOP flag is unable to catch.
3686  *
3687  * It is EXTREMELY important that we properly set CHAIN_BLKMAPUPD and
3688  * CHAIN_UPDATE.  We must set BLKMAPUPD if the bref does not match, and
3689  * we must clear CHAIN_UPDATE (that was likely set by the chain_rename) if
3690  * it does.  Otherwise we can end up in a situation where H2 is unable to
3691  * clean up the in-memory chain topology.
3692  *
3693  * The reason for this is that flushes do not generally flush through
3694  * BREF_TYPE_INODE chains and depend on a hammer2_inode_t queued to syncq
3695  * or sideq to properly flush and dispose of the related inode chain's flags.
3696  * Situations where the inode is not actually modified by the frontend,
3697  * but where we have to move the related chains around as we insert or cleanup
3698  * indirect blocks, can leave us with a 'dirty' (non-disposable) in-memory
3699  * inode chain that does not have a hammer2_inode_t associated with it.
3700  */
static void
hammer2_chain_rename_obref(hammer2_chain_t **parentp, hammer2_chain_t *chain,
			   hammer2_tid_t mtid, int flags,
			   hammer2_blockref_t *obref)
{
	/* First perform the normal rename (insert chain under *parentp). */
	hammer2_chain_rename(parentp, chain, mtid, flags);

	/*
	 * If the caller supplied a valid original blockref, install it
	 * verbatim into the new parent's blockmap rather than generating
	 * a fresh bref from the (possibly modified) chain.
	 */
	if (obref->type != HAMMER2_BREF_TYPE_EMPTY) {
		hammer2_blockref_t *tbase;
		int tcount;

		KKASSERT((chain->flags & HAMMER2_CHAIN_BLKMAPPED) == 0);
		hammer2_chain_modify(*parentp, mtid, 0, 0);
		tbase = hammer2_chain_base_and_count(*parentp, &tcount);
		hammer2_base_insert(*parentp, tbase, tcount, chain, obref);
		if (bcmp(obref, &chain->bref, sizeof(chain->bref))) {
			/*
			 * Installed bref differs from the live one; flush
			 * must update the blockmap entry later.
			 */
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_BLKMAPUPD |
						      HAMMER2_CHAIN_UPDATE);
		} else {
			/* Bref matches exactly; no flush update is needed. */
			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
		}
	}
}
3724 
3725 /*
3726  * Helper function for deleting chains.
3727  *
3728  * The chain is removed from the live view (the RBTREE) as well as the parent's
3729  * blockmap.  Both chain and its parent must be locked.
3730  *
3731  * parent may not be errored.  chain can be errored.
3732  */
static int
_hammer2_chain_delete_helper(hammer2_chain_t *parent, hammer2_chain_t *chain,
			     hammer2_tid_t mtid, int flags,
			     hammer2_blockref_t *obref)
{
	int error = 0;

	KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0);
	KKASSERT(chain->parent == parent);

	if (chain->flags & HAMMER2_CHAIN_BLKMAPPED) {
		/*
		 * Chain is blockmapped, so there must be a parent.
		 * Atomically remove the chain from the parent and remove
		 * the blockmap entry.  The parent must be set modified
		 * to remove the blockmap entry.
		 */
		hammer2_blockref_t *base;
		int count;

		KKASSERT(parent != NULL);
		KKASSERT(parent->error == 0);
		KKASSERT((parent->flags & HAMMER2_CHAIN_INITIAL) == 0);
		error = hammer2_chain_modify(parent, mtid, 0, 0);
		if (error)
			goto done;

		/*
		 * Calculate blockmap pointer
		 *
		 * Spinlock order is chain first, then parent; the
		 * non-blockmapped removal path below uses the same order.
		 */
		KKASSERT(chain->flags & HAMMER2_CHAIN_ONRBTREE);
		hammer2_spin_ex(&chain->core.spin);
		hammer2_spin_ex(&parent->core.spin);

		/*
		 * Unlink the chain from the parent's in-memory topology
		 * (RBTREE, live/chain counters, back-pointer) while both
		 * spinlocks are held.  Bumping core.generation invalidates
		 * concurrent iterations over the parent.
		 */
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
		atomic_add_int(&parent->core.live_count, -1);
		++parent->core.generation;
		RB_REMOVE(hammer2_chain_tree, &parent->core.rbtree, chain);
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
		--parent->core.chain_count;
		chain->parent = NULL;

		/*
		 * Locate the parent's block array.  base may wind up NULL
		 * (DIRECTDATA inodes, or parents whose data is not loaded),
		 * in which case there is no media blockmap entry to remove.
		 */
		switch(parent->bref.type) {
		case HAMMER2_BREF_TYPE_INODE:
			/*
			 * Access the inode's block array.  However, there
			 * is no block array if the inode is flagged
			 * DIRECTDATA.
			 */
			if (parent->data &&
			    (parent->data->ipdata.meta.op_flags &
			     HAMMER2_OPFLAG_DIRECTDATA) == 0) {
				base =
				   &parent->data->ipdata.u.blockset.blockref[0];
			} else {
				base = NULL;
			}
			count = HAMMER2_SET_COUNT;
			break;
		case HAMMER2_BREF_TYPE_INDIRECT:
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
			if (parent->data)
				base = &parent->data->npdata[0];
			else
				base = NULL;
			count = parent->bytes / sizeof(hammer2_blockref_t);
			break;
		case HAMMER2_BREF_TYPE_VOLUME:
			base = &parent->data->voldata.
					sroot_blockset.blockref[0];
			count = HAMMER2_SET_COUNT;
			break;
		case HAMMER2_BREF_TYPE_FREEMAP:
			base = &parent->data->blkset.blockref[0];
			count = HAMMER2_SET_COUNT;
			break;
		default:
			/* initialized for safety; panic does not return */
			base = NULL;
			count = 0;
			panic("_hammer2_chain_delete_helper: "
			      "unrecognized blockref type: %d",
			      parent->bref.type);
			break;
		}

		/*
		 * delete blockmapped chain from its parent.
		 *
		 * The parent is not affected by any statistics in chain
		 * which are pending synchronization.  That is, there is
		 * nothing to undo in the parent since they have not yet
		 * been incorporated into the parent.
		 *
		 * The parent is affected by statistics stored in inodes.
		 * Those have already been synchronized, so they must be
		 * undone.  XXX split update possible w/delete in middle?
		 *
		 * NOTE(review): obref is passed through to
		 * hammer2_base_delete(), which presumably saves the removed
		 * blockref for the *_obref callers — verify in
		 * hammer2_base_delete().
		 */
		if (base) {
			hammer2_base_delete(parent, base, count, chain, obref);
		}
		hammer2_spin_unex(&parent->core.spin);
		hammer2_spin_unex(&chain->core.spin);
	} else if (chain->flags & HAMMER2_CHAIN_ONRBTREE) {
		/*
		 * Chain is not blockmapped but a parent is present.
		 * Atomically remove the chain from the parent.  There is
		 * no blockmap entry to remove.
		 *
		 * Because chain was associated with a parent but not
		 * synchronized, the chain's *_count_up fields contain
		 * inode adjustment statistics which must be undone.
		 */
		hammer2_spin_ex(&chain->core.spin);
		hammer2_spin_ex(&parent->core.spin);
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
		atomic_add_int(&parent->core.live_count, -1);
		++parent->core.generation;
		RB_REMOVE(hammer2_chain_tree, &parent->core.rbtree, chain);
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
		--parent->core.chain_count;
		chain->parent = NULL;
		hammer2_spin_unex(&parent->core.spin);
		hammer2_spin_unex(&chain->core.spin);
	} else {
		/*
		 * Chain is not blockmapped and has no parent.  This
		 * is a degenerate case.
		 */
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
	}
done:
	return error;
}
3866 
3867 /*
3868  * Create an indirect block that covers one or more of the elements in the
3869  * current parent.  Either returns the existing parent with no locking or
3870  * ref changes or returns the new indirect block locked and referenced
3871  * and leaving the original parent lock/ref intact as well.
3872  *
3873  * If an error occurs, NULL is returned and *errorp is set to the H2 error.
3874  *
3875  * The returned chain depends on where the specified key falls.
3876  *
 * The key/keybits for the indirect mode only needs to follow a few rules:
3878  *
3879  * (1) That all elements underneath it fit within its key space and
3880  *
3881  * (2) That all elements outside it are outside its key space.
3882  *
3883  * (3) When creating the new indirect block any elements in the current
3884  *     parent that fit within the new indirect block's keyspace must be
3885  *     moved into the new indirect block.
3886  *
 * (4) The keyspace chosen for the inserted indirect block CAN cover a wider
 *     keyspace than the current parent, but lookup/iteration rules will
3889  *     ensure (and must ensure) that rule (2) for all parents leading up
3890  *     to the nearest inode or the root volume header is adhered to.  This
3891  *     is accomplished by always recursing through matching keyspaces in
3892  *     the hammer2_chain_lookup() and hammer2_chain_next() API.
3893  *
3894  * The current implementation calculates the current worst-case keyspace by
3895  * iterating the current parent and then divides it into two halves, choosing
3896  * whichever half has the most elements (not necessarily the half containing
3897  * the requested key).
3898  *
3899  * We can also opt to use the half with the least number of elements.  This
3900  * causes lower-numbered keys (aka logical file offsets) to recurse through
3901  * fewer indirect blocks and higher-numbered keys to recurse through more.
3902  * This also has the risk of not moving enough elements to the new indirect
3903  * block and being forced to create several indirect blocks before the element
3904  * can be inserted.
3905  *
3906  * Must be called with an exclusively locked parent.
3907  *
3908  * NOTE: *errorp set to HAMMER_ERROR_* flags
3909  */
/*
 * Forward declarations for the per-type key/radix calculators used by
 * hammer2_chain_create_indirect() below.
 */
static int hammer2_chain_indkey_freemap(hammer2_chain_t *parent,
				hammer2_key_t *keyp, int keybits,
				hammer2_blockref_t *base, int count);
static int hammer2_chain_indkey_file(hammer2_chain_t *parent,
				hammer2_key_t *keyp, int keybits,
				hammer2_blockref_t *base, int count,
				int ncount);
static int hammer2_chain_indkey_dir(hammer2_chain_t *parent,
				hammer2_key_t *keyp, int keybits,
				hammer2_blockref_t *base, int count,
				int ncount);
static
hammer2_chain_t *
hammer2_chain_create_indirect(hammer2_chain_t *parent,
			      hammer2_key_t create_key, int create_bits,
			      hammer2_tid_t mtid, int for_type, int *errorp)
{
	hammer2_dev_t *hmp;
	hammer2_blockref_t *base;
	hammer2_blockref_t *bref;
	hammer2_blockref_t bsave;
	hammer2_blockref_t dummy;
	hammer2_chain_t *chain;
	hammer2_chain_t *ichain;
	hammer2_key_t key = create_key;
	hammer2_key_t key_beg;
	hammer2_key_t key_end;
	hammer2_key_t key_next;
	int keybits = create_bits;
	int count;
	int ncount;
	int nbytes;
	int loops;
	int error;
	int reason;
	int generation;
	int maxloops = 300000;

	/*
	 * Calculate the base blockref pointer or NULL if the chain
	 * is known to be empty.  We need to calculate the array count
	 * for RB lookups either way.
	 */
	hmp = parent->hmp;
	KKASSERT(hammer2_mtx_owned(&parent->lock));

	/*
	 * Pre-modify the parent now to avoid having to deal with error
	 * processing if we tried to later (in the middle of our loop).
	 *
	 * We are going to be moving bref's around, the indirect blocks
	 * cannot be in an initial state.  Do not pass MODIFY_OPTDATA.
	 */
	*errorp = hammer2_chain_modify(parent, mtid, 0, 0);
	if (*errorp) {
		kprintf("hammer2_chain_create_indirect: error %08x %s\n",
			*errorp, hammer2_error_str(*errorp));
		return NULL;
	}
	KKASSERT((parent->flags & HAMMER2_CHAIN_INITIAL) == 0);

	/*hammer2_chain_modify(&parent, HAMMER2_MODIFY_OPTDATA);*/
	base = hammer2_chain_base_and_count(parent, &count);

	/*
	 * How big should our new indirect block be?  It has to be at least
	 * as large as its parent for splits to work properly.
	 *
	 * The freemap uses a specific indirect block size.  The number of
	 * levels are built dynamically and ultimately depend on the size
	 * volume.  Because freemap blocks are taken from the reserved areas
	 * of the volume our goal is efficiency (fewer levels) and not so
	 * much to save disk space.
	 *
	 * The first indirect block level for a directory usually uses
	 * HAMMER2_IND_BYTES_MIN (4KB = 32 directory entries).  Due to
	 * the hash mechanism, this typically gives us a nominal
	 * 32 * 4 entries with one level of indirection.
	 *
	 * We use HAMMER2_IND_BYTES_NOM (16KB = 128 blockrefs) for FILE
	 * indirect blocks.  The initial 4 entries in the inode gives us
	 * 256KB.  Up to 4 indirect blocks gives us 32MB.  Three levels
	 * of indirection gives us 137GB, and so forth.  H2 can support
	 * huge file sizes but they are not typical, so we try to stick
	 * with compactness and do not use a larger indirect block size.
	 *
	 * We could use 64KB (PBUFSIZE), giving us 512 blockrefs, but
	 * due to the way indirect blocks are created this usually winds
	 * up being extremely inefficient for small files.  Even though
	 * 16KB requires more levels of indirection for very large files,
	 * the 16KB records can be ganged together into 64KB DIOs.
	 */
	if (for_type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
	    for_type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
		nbytes = HAMMER2_FREEMAP_LEVELN_PSIZE;
	} else if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) {
		if (parent->data->ipdata.meta.type ==
		    HAMMER2_OBJTYPE_DIRECTORY)
			nbytes = HAMMER2_IND_BYTES_MIN;	/* 4KB = 32 entries */
		else
			nbytes = HAMMER2_IND_BYTES_NOM;	/* 16KB = ~8MB file */

	} else {
		nbytes = HAMMER2_IND_BYTES_NOM;
	}
	if (nbytes < count * sizeof(hammer2_blockref_t)) {
		KKASSERT(for_type != HAMMER2_BREF_TYPE_FREEMAP_NODE &&
			 for_type != HAMMER2_BREF_TYPE_FREEMAP_LEAF);
		nbytes = count * sizeof(hammer2_blockref_t);
	}
	ncount = nbytes / sizeof(hammer2_blockref_t);

	/*
	 * When creating an indirect block for a freemap node or leaf
	 * the key/keybits must be fitted to static radix levels because
	 * particular radix levels use particular reserved blocks in the
	 * related zone.
	 *
	 * This routine calculates the key/radix of the indirect block
	 * we need to create, and whether it is on the high-side or the
	 * low-side.
	 */
	switch(for_type) {
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
		keybits = hammer2_chain_indkey_freemap(parent, &key, keybits,
						       base, count);
		break;
	case HAMMER2_BREF_TYPE_DATA:
		keybits = hammer2_chain_indkey_file(parent, &key, keybits,
						    base, count, ncount);
		break;
	case HAMMER2_BREF_TYPE_DIRENT:
	case HAMMER2_BREF_TYPE_INODE:
		keybits = hammer2_chain_indkey_dir(parent, &key, keybits,
						   base, count, ncount);
		break;
	default:
		panic("illegal indirect block for bref type %d", for_type);
		break;
	}

	/*
	 * Normalize the key for the radix being represented, keeping the
	 * high bits and throwing away the low bits.
	 */
	key &= ~(((hammer2_key_t)1 << keybits) - 1);

	/*
	 * Ok, create our new indirect block
	 */
	bzero(&dummy, sizeof(dummy));
	if (for_type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
	    for_type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
		dummy.type = HAMMER2_BREF_TYPE_FREEMAP_NODE;
	} else {
		dummy.type = HAMMER2_BREF_TYPE_INDIRECT;
	}
	dummy.key = key;
	dummy.keybits = keybits;
	dummy.data_off = hammer2_getradix(nbytes);
	/* inherit the parent's check method, but never compress */
	dummy.methods =
		HAMMER2_ENC_CHECK(HAMMER2_DEC_CHECK(parent->bref.methods)) |
		HAMMER2_ENC_COMP(HAMMER2_COMP_NONE);

	ichain = hammer2_chain_alloc(hmp, parent->pmp, &dummy);
	atomic_set_int(&ichain->flags, HAMMER2_CHAIN_INITIAL);
	hammer2_chain_lock(ichain, HAMMER2_RESOLVE_MAYBE);
	/* ichain has one ref at this point */

	/*
	 * We have to mark it modified to allocate its block, but use
	 * OPTDATA to allow it to remain in the INITIAL state.  Otherwise
	 * it won't be acted upon by the flush code.
	 *
	 * XXX remove OPTDATA, we need a fully initialized indirect block to
	 * be able to move the original blockref.
	 */
	*errorp = hammer2_chain_modify(ichain, mtid, 0, 0);
	if (*errorp) {
		kprintf("hammer2_chain_create_indirect: error %08x %s\n",
			*errorp, hammer2_error_str(*errorp));
		hammer2_chain_unlock(ichain);
		hammer2_chain_drop(ichain);
		return NULL;
	}
	KKASSERT((ichain->flags & HAMMER2_CHAIN_INITIAL) == 0);

	/*
	 * Iterate the original parent and move the matching brefs into
	 * the new indirect block.
	 *
	 * XXX handle flushes.
	 */
	key_beg = 0;
	key_end = HAMMER2_KEY_MAX;
	key_next = 0;	/* avoid gcc warnings */
	hammer2_spin_ex(&parent->core.spin);
	loops = 0;
	reason = 0;

	for (;;) {
		/*
		 * Parent may have been modified, relocating its block array.
		 * Reload the base pointer.
		 */
		base = hammer2_chain_base_and_count(parent, &count);

		if (++loops > 100000) {
		    hammer2_spin_unex(&parent->core.spin);
		    panic("excessive loops r=%d p=%p base/count %p:%d %016jx\n",
			  reason, parent, base, count, key_next);
		}

		/*
		 * NOTE: spinlock stays intact, returned chain (if not NULL)
		 *	 is not referenced or locked which means that we
		 *	 cannot safely check its flagged / deletion status
		 *	 until we lock it.
		 */
		chain = hammer2_combined_find(parent, base, count,
					      &key_next,
					      key_beg, key_end,
					      &bref);
		/* snapshot generation to detect insertion races below */
		generation = parent->core.generation;
		if (bref == NULL)
			break;
		key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);

		/*
		 * Skip keys that are not within the key/radix of the new
		 * indirect block.  They stay in the parent.
		 */
		if (rounddown2(key ^ bref->key, (hammer2_key_t)1 << keybits) != 0) {
			goto next_key_spinlocked;
		}

		/*
		 * Load the new indirect block by acquiring the related
		 * chains (potentially from media as it might not be
		 * in-memory).  Then move it to the new parent (ichain).
		 *
		 * chain is referenced but not locked.  We must lock the
		 * chain to obtain definitive state.
		 *
		 * Copy the bref out while still under the spinlock; the
		 * underlying block array can be relocated once the
		 * spinlock is released.
		 */
		bsave = *bref;
		if (chain) {
			/*
			 * Use chain already present in the RBTREE
			 */
			hammer2_chain_ref(chain);
			hammer2_spin_unex(&parent->core.spin);
			hammer2_chain_lock(chain, HAMMER2_RESOLVE_NEVER);
		} else {
			/*
			 * Get chain for blockref element.  _get returns NULL
			 * on insertion race.
			 */
			hammer2_spin_unex(&parent->core.spin);
			chain = hammer2_chain_get(parent, generation, &bsave,
						  HAMMER2_RESOLVE_NEVER);
			if (chain == NULL) {
				reason = 1;
				hammer2_spin_ex(&parent->core.spin);
				continue;
			}
		}

		/*
		 * This is always live so if the chain has been deleted
		 * we raced someone and we have to retry.
		 *
		 * NOTE: Lookups can race delete-duplicate because
		 *	 delete-duplicate does not lock the parent's core
		 *	 (they just use the spinlock on the core).
		 *
		 *	 (note reversed logic for this one)
		 */
		if (bcmp(&bsave, &chain->bref, sizeof(bsave)) ||
		    chain->parent != parent ||
		    (chain->flags & HAMMER2_CHAIN_DELETED)) {
			hammer2_chain_unlock(chain);
			hammer2_chain_drop(chain);
			if (hammer2_debug & 0x0040) {
				kprintf("LOST PARENT RETRY "
				"RETRY (%p,%p)->%p %08x\n",
				parent, chain->parent, chain, chain->flags);
			}
			hammer2_spin_ex(&parent->core.spin);
			continue;
		}

		/*
		 * Shift the chain to the indirect block.
		 *
		 * WARNING! No reason for us to load chain data, pass NOSTATS
		 *	    to prevent delete/insert from trying to access
		 *	    inode stats (and thus asserting if there is no
		 *	    chain->data loaded).
		 *
		 * WARNING! The (parent, chain) deletion may modify the parent
		 *	    and invalidate the base pointer.
		 *
		 * WARNING! Parent must already be marked modified, so we
		 *	    can assume that chain_delete always succeeds.
		 *
		 * WARNING! hammer2_chain_repchange() does not have to be
		 *	    called (and doesn't work anyway because we are
		 *	    only doing a partial shift).  A recursion that is
		 *	    in-progress can continue at the current parent
		 *	    and will be able to properly find its next key.
		 */
		error = hammer2_chain_delete_obref(parent, chain, mtid, 0,
						   &bsave);
		KKASSERT(error == 0);
		hammer2_chain_rename_obref(&ichain, chain, mtid, 0, &bsave);
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		KKASSERT(parent->refs > 0);
		chain = NULL;
		base = NULL;	/* safety */
		hammer2_spin_ex(&parent->core.spin);
next_key_spinlocked:
		if (--maxloops == 0)
			panic("hammer2_chain_create_indirect: maxloops");
		reason = 4;
		if (key_next == 0 || key_next > key_end)
			break;
		key_beg = key_next;
		/* loop */
	}
	hammer2_spin_unex(&parent->core.spin);

	/*
	 * Insert the new indirect block into the parent now that we've
	 * cleared out some entries in the parent.  We calculated a good
	 * insertion index in the loop above (ichain->index).
	 *
	 * We don't have to set UPDATE here because we mark ichain
	 * modified down below (so the normal modified -> flush -> set-moved
	 * sequence applies).
	 *
	 * The insertion shouldn't race as this is a completely new block
	 * and the parent is locked.
	 */
	base = NULL;	/* safety, parent modify may change address */
	KKASSERT((ichain->flags & HAMMER2_CHAIN_ONRBTREE) == 0);
	KKASSERT(parent->core.live_count < count);
	hammer2_chain_insert(parent, ichain,
			     HAMMER2_CHAIN_INSERT_SPIN |
			     HAMMER2_CHAIN_INSERT_LIVE,
			     0);

	/*
	 * Make sure flushes propagate after our manual insertion.
	 */
	hammer2_chain_setflush(ichain);
	hammer2_chain_setflush(parent);

	/*
	 * Figure out what to return.
	 */
	if (rounddown2(create_key ^ key, (hammer2_key_t)1 << keybits) != 0) {
		/*
		 * Key being created is outside the key range,
		 * return the original parent.
		 */
		hammer2_chain_unlock(ichain);
		hammer2_chain_drop(ichain);
	} else {
		/*
		 * Otherwise it's in the range, return the new parent.
		 * (leave both the new and old parent locked).
		 */
		parent = ichain;
	}

	return(parent);
}
4289 
4290 /*
4291  * Do maintenance on an indirect chain.  Both parent and chain are locked.
4292  *
4293  * Returns non-zero if (chain) is deleted, either due to being empty or
4294  * because its children were safely moved into the parent.
4295  */
int
hammer2_chain_indirect_maintenance(hammer2_chain_t *parent,
				   hammer2_chain_t *chain)
{
	hammer2_blockref_t *chain_base;
	hammer2_blockref_t *base;
	hammer2_blockref_t *bref;
	hammer2_blockref_t bsave;
	hammer2_key_t key_next;
	hammer2_key_t key_beg;
	hammer2_key_t key_end;
	hammer2_chain_t *sub;
	int chain_count;
	int count;
	int error;
	int generation;

	/*
	 * Make sure we have an accurate live_count
	 */
	if ((chain->flags & (HAMMER2_CHAIN_INITIAL |
			     HAMMER2_CHAIN_COUNTEDBREFS)) == 0) {
		base = &chain->data->npdata[0];
		count = chain->bytes / sizeof(hammer2_blockref_t);
		hammer2_chain_countbrefs(chain, base, count);
	}

	/*
	 * If the indirect block is empty we can delete it.
	 * (ignore deletion error)
	 */
	if (chain->core.live_count == 0 && RB_EMPTY(&chain->core.rbtree)) {
		hammer2_chain_delete(parent, chain,
				     chain->bref.modify_tid,
				     HAMMER2_DELETE_PERMANENT);
		hammer2_chain_repchange(parent, chain);
		return 1;
	}

	base = hammer2_chain_base_and_count(parent, &count);

	if ((parent->flags & (HAMMER2_CHAIN_INITIAL |
			     HAMMER2_CHAIN_COUNTEDBREFS)) == 0) {
		hammer2_chain_countbrefs(parent, base, count);
	}

	/*
	 * Determine if we can collapse chain into parent, calculate
	 * hysteresis for chain emptiness.
	 *
	 * The -1 accounts for chain itself, which will be deleted from
	 * the parent before its children are moved up.
	 */
	if (parent->core.live_count + chain->core.live_count - 1 > count)
		return 0;
	chain_count = chain->bytes / sizeof(hammer2_blockref_t);
	if (chain->core.live_count > chain_count * 3 / 4)
		return 0;

	/*
	 * Ok, theoretically we can collapse chain's contents into
	 * parent.  chain is locked, but any in-memory children of chain
	 * are not.  For this to work, we must be able to dispose of any
	 * in-memory children of chain.
	 *
	 * For now require that there are no in-memory children of chain.
	 *
	 * WARNING! Both chain and parent must remain locked across this
	 *	    entire operation.
	 */

	/*
	 * Parent must be marked modified.  Don't try to collapse it if we
	 * can't mark it modified.  Once modified, destroy chain to make room
	 * and to get rid of what will be a conflicting key (this is included
	 * in the calculation above).  Finally, move the children of chain
	 * into chain's parent.
	 *
	 * This order creates an accounting problem for bref.embed.stats
	 * because we destroy chain before we remove its children.  Any
	 * elements whose blockref is already synchronized will be counted
	 * twice.  To deal with the problem we clean out chain's stats prior
	 * to deleting it.
	 *
	 * NOTE: mtid 0 is passed for the parent here, while chain and the
	 *	 moved children use their own modify_tid below.
	 */
	error = hammer2_chain_modify(parent, 0, 0, 0);
	if (error) {
		krateprintf(&krate_h2me, "hammer2: indirect_maint: %s\n",
			    hammer2_error_str(error));
		return 0;
	}
	error = hammer2_chain_modify(chain, chain->bref.modify_tid, 0, 0);
	if (error) {
		krateprintf(&krate_h2me, "hammer2: indirect_maint: %s\n",
			    hammer2_error_str(error));
		return 0;
	}

	/* zero stats first so the delete does not double-count (see above) */
	chain->bref.embed.stats.inode_count = 0;
	chain->bref.embed.stats.data_count = 0;
	error = hammer2_chain_delete(parent, chain,
				     chain->bref.modify_tid,
				     HAMMER2_DELETE_PERMANENT);
	KKASSERT(error == 0);

	/*
	 * The combined_find call requires core.spin to be held.  One would
	 * think there wouldn't be any conflicts since we hold chain
	 * exclusively locked, but the caching mechanism for 0-ref children
	 * does not require a chain lock.
	 */
	hammer2_spin_ex(&chain->core.spin);

	key_next = 0;
	key_beg = 0;
	key_end = HAMMER2_KEY_MAX;
	for (;;) {
		chain_base = &chain->data->npdata[0];
		chain_count = chain->bytes / sizeof(hammer2_blockref_t);
		sub = hammer2_combined_find(chain, chain_base, chain_count,
					    &key_next,
					    key_beg, key_end,
					    &bref);
		/* snapshot generation to detect insertion races below */
		generation = chain->core.generation;
		if (bref == NULL)
			break;
		key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);

		/* copy bref under the spinlock; it may be invalidated later */
		bsave = *bref;
		if (sub) {
			hammer2_chain_ref(sub);
			hammer2_spin_unex(&chain->core.spin);
			hammer2_chain_lock(sub, HAMMER2_RESOLVE_NEVER);
		} else {
			hammer2_spin_unex(&chain->core.spin);
			sub = hammer2_chain_get(chain, generation, &bsave,
						HAMMER2_RESOLVE_NEVER);
			if (sub == NULL) {
				hammer2_spin_ex(&chain->core.spin);
				continue;
			}
		}
		/* raced a deletion or relocation; drop and retry this key */
		if (bcmp(&bsave, &sub->bref, sizeof(bsave)) ||
		    sub->parent != chain ||
		    (sub->flags & HAMMER2_CHAIN_DELETED)) {
			hammer2_chain_unlock(sub);
			hammer2_chain_drop(sub);
			hammer2_spin_ex(&chain->core.spin);
			sub = NULL;	/* safety */
			continue;
		}
		error = hammer2_chain_delete_obref(chain, sub,
						   sub->bref.modify_tid, 0,
						   &bsave);
		KKASSERT(error == 0);
		hammer2_chain_rename_obref(&parent, sub,
				     sub->bref.modify_tid,
				     HAMMER2_INSERT_SAMEPARENT, &bsave);
		hammer2_chain_unlock(sub);
		hammer2_chain_drop(sub);
		hammer2_spin_ex(&chain->core.spin);

		if (key_next == 0)
			break;
		key_beg = key_next;
	}
	hammer2_spin_unex(&chain->core.spin);

	hammer2_chain_repchange(parent, chain);

	return 1;
}
4464 
4465 /*
4466  * Freemap indirect blocks
4467  *
4468  * Calculate the keybits and highside/lowside of the freemap node the
4469  * caller is creating.
4470  *
4471  * This routine will specify the next higher-level freemap key/radix
4472  * representing the lowest-ordered set.  By doing so, eventually all
4473  * low-ordered sets will be moved one level down.
4474  *
4475  * We have to be careful here because the freemap reserves a limited
4476  * number of blocks for a limited number of levels.  So we can't just
4477  * push indiscriminately.
4478  */
4479 int
4480 hammer2_chain_indkey_freemap(hammer2_chain_t *parent, hammer2_key_t *keyp,
4481 			     int keybits, hammer2_blockref_t *base, int count)
4482 {
4483 	hammer2_chain_t *chain;
4484 	hammer2_blockref_t *bref;
4485 	hammer2_key_t key;
4486 	hammer2_key_t key_beg;
4487 	hammer2_key_t key_end;
4488 	hammer2_key_t key_next;
4489 	int maxloops = 300000;
4490 
4491 	key = *keyp;
4492 	keybits = 64;
4493 
4494 	/*
4495 	 * Calculate the range of keys in the array being careful to skip
4496 	 * slots which are overridden with a deletion.
4497 	 */
4498 	key_beg = 0;
4499 	key_end = HAMMER2_KEY_MAX;
4500 	hammer2_spin_ex(&parent->core.spin);
4501 
4502 	for (;;) {
4503 		if (--maxloops == 0) {
4504 			panic("indkey_freemap shit %p %p:%d\n",
4505 			      parent, base, count);
4506 		}
4507 		chain = hammer2_combined_find(parent, base, count,
4508 					      &key_next,
4509 					      key_beg, key_end,
4510 					      &bref);
4511 
4512 		/*
4513 		 * Exhausted search
4514 		 */
4515 		if (bref == NULL)
4516 			break;
4517 
4518 		/*
4519 		 * Skip deleted chains.
4520 		 */
4521 		if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
4522 			if (key_next == 0 || key_next > key_end)
4523 				break;
4524 			key_beg = key_next;
4525 			continue;
4526 		}
4527 
4528 		/*
4529 		 * Use the full live (not deleted) element for the scan
4530 		 * iteration.  HAMMER2 does not allow partial replacements.
4531 		 *
4532 		 * XXX should be built into hammer2_combined_find().
4533 		 */
4534 		key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);
4535 
4536 		if (keybits > bref->keybits) {
4537 			key = bref->key;
4538 			keybits = bref->keybits;
4539 		} else if (keybits == bref->keybits && bref->key < key) {
4540 			key = bref->key;
4541 		}
4542 		if (key_next == 0)
4543 			break;
4544 		key_beg = key_next;
4545 	}
4546 	hammer2_spin_unex(&parent->core.spin);
4547 
4548 	/*
4549 	 * Return the keybits for a higher-level FREEMAP_NODE covering
4550 	 * this node.
4551 	 */
4552 	switch(keybits) {
4553 	case HAMMER2_FREEMAP_LEVEL0_RADIX:
4554 		keybits = HAMMER2_FREEMAP_LEVEL1_RADIX;
4555 		break;
4556 	case HAMMER2_FREEMAP_LEVEL1_RADIX:
4557 		keybits = HAMMER2_FREEMAP_LEVEL2_RADIX;
4558 		break;
4559 	case HAMMER2_FREEMAP_LEVEL2_RADIX:
4560 		keybits = HAMMER2_FREEMAP_LEVEL3_RADIX;
4561 		break;
4562 	case HAMMER2_FREEMAP_LEVEL3_RADIX:
4563 		keybits = HAMMER2_FREEMAP_LEVEL4_RADIX;
4564 		break;
4565 	case HAMMER2_FREEMAP_LEVEL4_RADIX:
4566 		keybits = HAMMER2_FREEMAP_LEVEL5_RADIX;
4567 		break;
4568 	case HAMMER2_FREEMAP_LEVEL5_RADIX:
4569 		panic("hammer2_chain_indkey_freemap: level too high");
4570 		break;
4571 	default:
4572 		panic("hammer2_chain_indkey_freemap: bad radix");
4573 		break;
4574 	}
4575 	*keyp = key;
4576 
4577 	return (keybits);
4578 }
4579 
4580 /*
4581  * File indirect blocks
4582  *
4583  * Calculate the key/keybits for the indirect block to create by scanning
4584  * existing keys.  The key being created is also passed in *keyp and can be
4585  * inside or outside the indirect block.  Regardless, the indirect block
4586  * must hold at least two keys in order to guarantee sufficient space.
4587  *
 * We use a modified version of the freemap's fixed radix tree, but tailored
4589  * for file data.  Basically we configure an indirect block encompassing the
4590  * smallest key.
4591  */
4592 static int
4593 hammer2_chain_indkey_file(hammer2_chain_t *parent, hammer2_key_t *keyp,
4594 			    int keybits, hammer2_blockref_t *base, int count,
4595 			    int ncount)
4596 {
4597 	hammer2_chain_t *chain;
4598 	hammer2_blockref_t *bref;
4599 	hammer2_key_t key;
4600 	hammer2_key_t key_beg;
4601 	hammer2_key_t key_end;
4602 	hammer2_key_t key_next;
4603 	int nradix;
4604 	int maxloops = 300000;
4605 
4606 	key = *keyp;
4607 	keybits = 64;
4608 
4609 	/*
4610 	 * Calculate the range of keys in the array being careful to skip
4611 	 * slots which are overridden with a deletion.
4612 	 *
4613 	 * Locate the smallest key.
4614 	 */
4615 	key_beg = 0;
4616 	key_end = HAMMER2_KEY_MAX;
4617 	hammer2_spin_ex(&parent->core.spin);
4618 
4619 	for (;;) {
4620 		if (--maxloops == 0) {
4621 			panic("indkey_freemap shit %p %p:%d\n",
4622 			      parent, base, count);
4623 		}
4624 		chain = hammer2_combined_find(parent, base, count,
4625 					      &key_next,
4626 					      key_beg, key_end,
4627 					      &bref);
4628 
4629 		/*
4630 		 * Exhausted search
4631 		 */
4632 		if (bref == NULL)
4633 			break;
4634 
4635 		/*
4636 		 * Skip deleted chains.
4637 		 */
4638 		if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
4639 			if (key_next == 0 || key_next > key_end)
4640 				break;
4641 			key_beg = key_next;
4642 			continue;
4643 		}
4644 
4645 		/*
4646 		 * Use the full live (not deleted) element for the scan
4647 		 * iteration.  HAMMER2 does not allow partial replacements.
4648 		 *
4649 		 * XXX should be built into hammer2_combined_find().
4650 		 */
4651 		key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);
4652 
4653 		if (keybits > bref->keybits) {
4654 			key = bref->key;
4655 			keybits = bref->keybits;
4656 		} else if (keybits == bref->keybits && bref->key < key) {
4657 			key = bref->key;
4658 		}
4659 		if (key_next == 0)
4660 			break;
4661 		key_beg = key_next;
4662 	}
4663 	hammer2_spin_unex(&parent->core.spin);
4664 
4665 	/*
4666 	 * Calculate the static keybits for a higher-level indirect block
4667 	 * that contains the key.
4668 	 */
4669 	*keyp = key;
4670 
4671 	switch(ncount) {
4672 	case HAMMER2_IND_COUNT_MIN:
4673 		nradix = HAMMER2_IND_RADIX_MIN - HAMMER2_BLOCKREF_RADIX;
4674 		break;
4675 	case HAMMER2_IND_COUNT_NOM:
4676 		nradix = HAMMER2_IND_RADIX_NOM - HAMMER2_BLOCKREF_RADIX;
4677 		break;
4678 	case HAMMER2_IND_COUNT_MAX:
4679 		nradix = HAMMER2_IND_RADIX_MAX - HAMMER2_BLOCKREF_RADIX;
4680 		break;
4681 	default:
4682 		panic("bad ncount %d\n", ncount);
4683 		nradix = 0;
4684 		break;
4685 	}
4686 
4687 	/*
4688 	 * The largest radix that can be returned for an indirect block is
4689 	 * 63 bits.  (The largest practical indirect block radix is actually
4690 	 * 62 bits because the top-level inode or volume root contains four
4691 	 * entries, but allow 63 to be returned).
4692 	 */
4693 	if (nradix >= 64)
4694 		nradix = 63;
4695 
4696 	return keybits + nradix;
4697 }
4698 
4699 #if 1
4700 
4701 /*
4702  * Directory indirect blocks.
4703  *
4704  * Covers both the inode index (directory of inodes), and directory contents
4705  * (filenames hardlinked to inodes).
4706  *
4707  * Because directory keys are hashed we generally try to cut the space in
 * half.  We accommodate the inode index (which tends to have linearly
4709  * increasing inode numbers) by ensuring that the keyspace is at least large
4710  * enough to fill up the indirect block being created.
4711  */
4712 static int
4713 hammer2_chain_indkey_dir(hammer2_chain_t *parent, hammer2_key_t *keyp,
4714 			 int keybits, hammer2_blockref_t *base, int count,
4715 			 int ncount)
4716 {
4717 	hammer2_blockref_t *bref;
4718 	hammer2_chain_t	*chain;
4719 	hammer2_key_t key_beg;
4720 	hammer2_key_t key_end;
4721 	hammer2_key_t key_next;
4722 	hammer2_key_t key;
4723 	int nkeybits;
4724 	int locount;
4725 	int hicount;
4726 	int maxloops = 300000;
4727 
4728 	/*
4729 	 * NOTE: We can't take a shortcut here anymore for inodes because
4730 	 *	 the root directory can contain a mix of inodes and directory
4731 	 *	 entries (we used to just return 63 if parent->bref.type was
4732 	 *	 HAMMER2_BREF_TYPE_INODE.
4733 	 */
4734 	key = *keyp;
4735 	locount = 0;
4736 	hicount = 0;
4737 
4738 	/*
4739 	 * Calculate the range of keys in the array being careful to skip
4740 	 * slots which are overridden with a deletion.
4741 	 */
4742 	key_beg = 0;
4743 	key_end = HAMMER2_KEY_MAX;
4744 	hammer2_spin_ex(&parent->core.spin);
4745 
4746 	for (;;) {
4747 		if (--maxloops == 0) {
4748 			panic("indkey_freemap shit %p %p:%d\n",
4749 			      parent, base, count);
4750 		}
4751 		chain = hammer2_combined_find(parent, base, count,
4752 					      &key_next,
4753 					      key_beg, key_end,
4754 					      &bref);
4755 
4756 		/*
4757 		 * Exhausted search
4758 		 */
4759 		if (bref == NULL)
4760 			break;
4761 
4762 		/*
4763 		 * Deleted object
4764 		 */
4765 		if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
4766 			if (key_next == 0 || key_next > key_end)
4767 				break;
4768 			key_beg = key_next;
4769 			continue;
4770 		}
4771 
4772 		/*
4773 		 * Use the full live (not deleted) element for the scan
4774 		 * iteration.  HAMMER2 does not allow partial replacements.
4775 		 *
4776 		 * XXX should be built into hammer2_combined_find().
4777 		 */
4778 		key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);
4779 
4780 		/*
4781 		 * Expand our calculated key range (key, keybits) to fit
4782 		 * the scanned key.  nkeybits represents the full range
4783 		 * that we will later cut in half (two halves @ nkeybits - 1).
4784 		 */
4785 		nkeybits = keybits;
4786 		if (nkeybits < bref->keybits) {
4787 			if (bref->keybits > 64) {
4788 				kprintf("bad bref chain %p bref %p\n",
4789 					chain, bref);
4790 				Debugger("fubar");
4791 			}
4792 			nkeybits = bref->keybits;
4793 		}
4794 		while (nkeybits < 64 &&
4795 		       rounddown2(key ^ bref->key, (hammer2_key_t)1 << nkeybits) != 0) {
4796 			++nkeybits;
4797 		}
4798 
4799 		/*
4800 		 * If the new key range is larger we have to determine
4801 		 * which side of the new key range the existing keys fall
4802 		 * under by checking the high bit, then collapsing the
4803 		 * locount into the hicount or vise-versa.
4804 		 */
4805 		if (keybits != nkeybits) {
4806 			if (((hammer2_key_t)1 << (nkeybits - 1)) & key) {
4807 				hicount += locount;
4808 				locount = 0;
4809 			} else {
4810 				locount += hicount;
4811 				hicount = 0;
4812 			}
4813 			keybits = nkeybits;
4814 		}
4815 
4816 		/*
4817 		 * The newly scanned key will be in the lower half or the
4818 		 * upper half of the (new) key range.
4819 		 */
4820 		if (((hammer2_key_t)1 << (nkeybits - 1)) & bref->key)
4821 			++hicount;
4822 		else
4823 			++locount;
4824 
4825 		if (key_next == 0)
4826 			break;
4827 		key_beg = key_next;
4828 	}
4829 	hammer2_spin_unex(&parent->core.spin);
4830 	bref = NULL;	/* now invalid (safety) */
4831 
4832 	/*
4833 	 * Adjust keybits to represent half of the full range calculated
4834 	 * above (radix 63 max) for our new indirect block.
4835 	 */
4836 	--keybits;
4837 
4838 	/*
4839 	 * Expand keybits to hold at least ncount elements.  ncount will be
4840 	 * a power of 2.  This is to try to completely fill leaf nodes (at
4841 	 * least for keys which are not hashes).
4842 	 *
4843 	 * We aren't counting 'in' or 'out', we are counting 'high side'
4844 	 * and 'low side' based on the bit at (1LL << keybits).  We want
4845 	 * everything to be inside in these cases so shift it all to
4846 	 * the low or high side depending on the new high bit.
4847 	 */
4848 	while (((hammer2_key_t)1 << keybits) < ncount) {
4849 		++keybits;
4850 		if (key & ((hammer2_key_t)1 << keybits)) {
4851 			hicount += locount;
4852 			locount = 0;
4853 		} else {
4854 			locount += hicount;
4855 			hicount = 0;
4856 		}
4857 	}
4858 
4859 	if (hicount > locount)
4860 		key |= (hammer2_key_t)1 << keybits;
4861 	else
4862 		key &= ~(hammer2_key_t)1 << keybits;
4863 
4864 	*keyp = key;
4865 
4866 	return (keybits);
4867 }
4868 
4869 #else
4870 
4871 /*
4872  * Directory indirect blocks.
4873  *
4874  * Covers both the inode index (directory of inodes), and directory contents
4875  * (filenames hardlinked to inodes).
4876  *
4877  * Because directory keys are hashed we generally try to cut the space in
 * half.  We accommodate the inode index (which tends to have linearly
4879  * increasing inode numbers) by ensuring that the keyspace is at least large
4880  * enough to fill up the indirect block being created.
4881  */
/*
 * NOTE: This is the disabled variant (see the #if/#else above).  It takes
 *	 a shortcut for inode parents that the live version no longer can,
 *	 because the root directory may mix inodes and directory entries.
 */
static int
hammer2_chain_indkey_dir(hammer2_chain_t *parent, hammer2_key_t *keyp,
			 int keybits, hammer2_blockref_t *base, int count,
			 int ncount)
{
	hammer2_blockref_t *bref;
	hammer2_chain_t	*chain;
	hammer2_key_t key_beg;
	hammer2_key_t key_end;
	hammer2_key_t key_next;
	hammer2_key_t key;
	int nkeybits;
	int locount;
	int hicount;
	int maxloops = 300000;

	/*
	 * Shortcut if the parent is the inode.  In this situation the
	 * parent has 4+1 directory entries and we are creating an indirect
	 * block capable of holding many more.
	 */
	if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) {
		return 63;
	}

	key = *keyp;
	locount = 0;
	hicount = 0;

	/*
	 * Calculate the range of keys in the array being careful to skip
	 * slots which are overridden with a deletion.
	 */
	key_beg = 0;
	key_end = HAMMER2_KEY_MAX;
	hammer2_spin_ex(&parent->core.spin);

	for (;;) {
		if (--maxloops == 0) {
			panic("indkey_freemap shit %p %p:%d\n",
			      parent, base, count);
		}
		chain = hammer2_combined_find(parent, base, count,
					      &key_next,
					      key_beg, key_end,
					      &bref);

		/*
		 * Exhausted search
		 */
		if (bref == NULL)
			break;

		/*
		 * Deleted object
		 */
		if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
			if (key_next == 0 || key_next > key_end)
				break;
			key_beg = key_next;
			continue;
		}

		/*
		 * Use the full live (not deleted) element for the scan
		 * iteration.  HAMMER2 does not allow partial replacements.
		 *
		 * XXX should be built into hammer2_combined_find().
		 */
		key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);

		/*
		 * Expand our calculated key range (key, keybits) to fit
		 * the scanned key.  nkeybits represents the full range
		 * that we will later cut in half (two halves @ nkeybits - 1).
		 */
		nkeybits = keybits;
		if (nkeybits < bref->keybits) {
			if (bref->keybits > 64) {
				kprintf("bad bref chain %p bref %p\n",
					chain, bref);
				Debugger("fubar");
			}
			nkeybits = bref->keybits;
		}
		while (nkeybits < 64 &&
		       (~(((hammer2_key_t)1 << nkeybits) - 1) &
		        (key ^ bref->key)) != 0) {
			++nkeybits;
		}

		/*
		 * If the new key range is larger we have to determine
		 * which side of the new key range the existing keys fall
		 * under by checking the high bit, then collapsing the
		 * locount into the hicount or vise-versa.
		 */
		if (keybits != nkeybits) {
			if (((hammer2_key_t)1 << (nkeybits - 1)) & key) {
				hicount += locount;
				locount = 0;
			} else {
				locount += hicount;
				hicount = 0;
			}
			keybits = nkeybits;
		}

		/*
		 * The newly scanned key will be in the lower half or the
		 * upper half of the (new) key range.
		 */
		if (((hammer2_key_t)1 << (nkeybits - 1)) & bref->key)
			++hicount;
		else
			++locount;

		if (key_next == 0)
			break;
		key_beg = key_next;
	}
	hammer2_spin_unex(&parent->core.spin);
	bref = NULL;	/* now invalid (safety) */

	/*
	 * Adjust keybits to represent half of the full range calculated
	 * above (radix 63 max) for our new indirect block.
	 */
	--keybits;

	/*
	 * Expand keybits to hold at least ncount elements.  ncount will be
	 * a power of 2.  This is to try to completely fill leaf nodes (at
	 * least for keys which are not hashes).
	 *
	 * We aren't counting 'in' or 'out', we are counting 'high side'
	 * and 'low side' based on the bit at (1LL << keybits).  We want
	 * everything to be inside in these cases so shift it all to
	 * the low or high side depending on the new high bit.
	 */
	while (((hammer2_key_t)1 << keybits) < ncount) {
		++keybits;
		if (key & ((hammer2_key_t)1 << keybits)) {
			hicount += locount;
			locount = 0;
		} else {
			locount += hicount;
			hicount = 0;
		}
	}

	if (hicount > locount)
		key |= (hammer2_key_t)1 << keybits;
	else
		/*
		 * NOTE(review): due to precedence, (~(hammer2_key_t)1)
		 * is shifted, so this clears bits 0..keybits inclusive
		 * rather than just bit keybits -- confirm intent.
		 */
		key &= ~(hammer2_key_t)1 << keybits;

	*keyp = key;

	return (keybits);
}
5042 
5043 #endif
5044 
5045 /*
5046  * Sets CHAIN_DELETED and remove the chain's blockref from the parent if
5047  * it exists.
5048  *
5049  * Both parent and chain must be locked exclusively.
5050  *
5051  * This function will modify the parent if the blockref requires removal
5052  * from the parent's block table.
5053  *
5054  * This function is NOT recursive.  Any entity already pushed into the
5055  * chain (such as an inode) may still need visibility into its contents,
5056  * as well as the ability to read and modify the contents.  For example,
5057  * for an unlinked file which is still open.
5058  *
5059  * Also note that the flusher is responsible for cleaning up empty
5060  * indirect blocks.
5061  */
5062 int
5063 hammer2_chain_delete(hammer2_chain_t *parent, hammer2_chain_t *chain,
5064 		     hammer2_tid_t mtid, int flags)
5065 {
5066 	int error = 0;
5067 
5068 	KKASSERT(hammer2_mtx_owned(&chain->lock));
5069 
5070 	/*
5071 	 * Nothing to do if already marked.
5072 	 *
5073 	 * We need the spinlock on the core whos RBTREE contains chain
5074 	 * to protect against races.
5075 	 */
5076 	if ((chain->flags & HAMMER2_CHAIN_DELETED) == 0) {
5077 		KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0 &&
5078 			 chain->parent == parent);
5079 		error = _hammer2_chain_delete_helper(parent, chain,
5080 						     mtid, flags, NULL);
5081 	}
5082 
5083 	/*
5084 	 * Permanent deletions mark the chain as destroyed.
5085 	 *
5086 	 * NOTE: We do not setflush the chain unless the deletion is
5087 	 *	 permanent, since the deletion of a chain does not actually
5088 	 *	 require it to be flushed.
5089 	 */
5090 	if (error == 0) {
5091 		if (flags & HAMMER2_DELETE_PERMANENT) {
5092 			atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY);
5093 			hammer2_chain_setflush(chain);
5094 		}
5095 	}
5096 
5097 	return error;
5098 }
5099 
5100 static int
5101 hammer2_chain_delete_obref(hammer2_chain_t *parent, hammer2_chain_t *chain,
5102 		     hammer2_tid_t mtid, int flags,
5103 		     hammer2_blockref_t *obref)
5104 {
5105 	int error = 0;
5106 
5107 	KKASSERT(hammer2_mtx_owned(&chain->lock));
5108 
5109 	/*
5110 	 * Nothing to do if already marked.
5111 	 *
5112 	 * We need the spinlock on the core whos RBTREE contains chain
5113 	 * to protect against races.
5114 	 */
5115 	obref->type = HAMMER2_BREF_TYPE_EMPTY;
5116 	if ((chain->flags & HAMMER2_CHAIN_DELETED) == 0) {
5117 		KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0 &&
5118 			 chain->parent == parent);
5119 		error = _hammer2_chain_delete_helper(parent, chain,
5120 						     mtid, flags, obref);
5121 	}
5122 
5123 	/*
5124 	 * Permanent deletions mark the chain as destroyed.
5125 	 *
5126 	 * NOTE: We do not setflush the chain unless the deletion is
5127 	 *	 permanent, since the deletion of a chain does not actually
5128 	 *	 require it to be flushed.
5129 	 */
5130 	if (error == 0) {
5131 		if (flags & HAMMER2_DELETE_PERMANENT) {
5132 			atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY);
5133 			hammer2_chain_setflush(chain);
5134 		}
5135 	}
5136 
5137 	return error;
5138 }
5139 
5140 /*
5141  * Returns the index of the nearest element in the blockref array >= elm.
5142  * Returns (count) if no element could be found.
5143  *
5144  * Sets *key_nextp to the next key for loop purposes but does not modify
5145  * it if the next key would be higher than the current value of *key_nextp.
 * Note that *key_nextp can overflow to 0, which should be tested by the
5147  * caller.
5148  *
5149  * WARNING!  Must be called with parent's spinlock held.  Spinlock remains
5150  *	     held through the operation.
5151  */
static int
hammer2_base_find(hammer2_chain_t *parent,
		  hammer2_blockref_t *base, int count,
		  hammer2_key_t *key_nextp,
		  hammer2_key_t key_beg, hammer2_key_t key_end)
{
	hammer2_blockref_t *scan;
	hammer2_key_t scan_end;
	int i;
	int limit;

	/*
	 * Require the live chain's already have their core's counted
	 * so we can optimize operations.
	 */
	KKASSERT(parent->flags & HAMMER2_CHAIN_COUNTEDBREFS);

	/*
	 * Degenerate case
	 */
	if (count == 0 || base == NULL)
		return(count);

	/*
	 * Sequential optimization using parent->cache_index.  This is
	 * the most likely scenario.
	 *
	 * We can avoid trailing empty entries on live chains, otherwise
	 * we might have to check the whole block array.
	 */
	i = parent->cache_index;	/* SMP RACE OK */
	cpu_ccfence();			/* fence the single racy fetch above */
	limit = parent->core.live_zero;	/* no live brefs at index >= limit */
	if (i >= limit)
		i = limit - 1;
	if (i < 0)
		i = 0;
	KKASSERT(i < count);

	/*
	 * Search backwards from the cached hint until we reach a
	 * non-empty element whose key does not exceed key_beg.
	 */
	scan = &base[i];
	while (i > 0 && (scan->type == HAMMER2_BREF_TYPE_EMPTY ||
	    scan->key > key_beg)) {
		--scan;
		--i;
	}
	parent->cache_index = i;

	/*
	 * Search forwards, stop when we find a scan element which
	 * encloses the key or until we know that there are no further
	 * elements.
	 */
	while (i < count) {
		if (scan->type != HAMMER2_BREF_TYPE_EMPTY) {
			scan_end = scan->key +
				   ((hammer2_key_t)1 << scan->keybits) - 1;
			if (scan->key > key_beg || scan_end >= key_beg)
				break;
		}
		if (i >= limit)
			return (count);
		++scan;
		++i;
	}
	if (i != count) {
		/*
		 * Matched.  Cache the index and lower *key_nextp to the
		 * end of the matched element's range if that is below the
		 * current value (a *key_nextp of 0 means it wrapped).
		 */
		parent->cache_index = i;
		if (i >= limit) {
			i = count;
		} else {
			scan_end = scan->key +
				   ((hammer2_key_t)1 << scan->keybits);
			if (scan_end && (*key_nextp > scan_end ||
					 *key_nextp == 0)) {
				*key_nextp = scan_end;
			}
		}
	}
	return (i);
}
5234 
5235 /*
5236  * Do a combined search and return the next match either from the blockref
5237  * array or from the in-memory chain.  Sets *brefp to the returned bref in
5238  * both cases, or sets it to NULL if the search exhausted.  Only returns
5239  * a non-NULL chain if the search matched from the in-memory chain.
5240  *
 * When no in-memory chain matches, the non-NULL bref returned in *brefp
 * points into the media blockref array.
5243  *
5244  *
5245  * The returned chain is not locked or referenced.  Use the returned bref
5246  * to determine if the search exhausted or not.  Iterate if the base find
5247  * is chosen but matches a deleted chain.
5248  *
5249  * WARNING!  Must be called with parent's spinlock held.  Spinlock remains
5250  *	     held through the operation.
5251  */
static hammer2_chain_t *
hammer2_combined_find(hammer2_chain_t *parent,
		      hammer2_blockref_t *base, int count,
		      hammer2_key_t *key_nextp,
		      hammer2_key_t key_beg, hammer2_key_t key_end,
		      hammer2_blockref_t **brefp)
{
	hammer2_blockref_t *bref;
	hammer2_chain_t *chain;
	int i;

	/*
	 * Lookup in block array and in rbtree.  Both finds may only
	 * ratchet *key_nextp downward from this initial value.
	 */
	*key_nextp = key_end + 1;
	i = hammer2_base_find(parent, base, count, key_nextp,
			      key_beg, key_end);
	chain = hammer2_chain_find(parent, key_nextp, key_beg, key_end);

	/*
	 * Neither matched
	 */
	if (i == count && chain == NULL) {
		*brefp = NULL;
		return(NULL);
	}

	/*
	 * Only chain matched.
	 */
	if (i == count) {
		bref = &chain->bref;
		goto found;
	}

	/*
	 * Only blockref matched.
	 */
	if (chain == NULL) {
		bref = &base[i];
		goto found;
	}

	/*
	 * Both in-memory and blockref matched, select the nearer element.
	 *
	 * If both are flush with the left-hand side or both are the
	 * same distance away, select the chain.  In this situation the
	 * chain must have been loaded from the matching blockmap.
	 */
	if ((chain->bref.key <= key_beg && base[i].key <= key_beg) ||
	    chain->bref.key == base[i].key) {
		KKASSERT(chain->bref.key == base[i].key);
		bref = &chain->bref;
		goto found;
	}

	/*
	 * Select the nearer key.  When the blockref wins, NULL out the
	 * chain so the caller knows the match came from the media array.
	 */
	if (chain->bref.key < base[i].key) {
		bref = &chain->bref;
	} else {
		bref = &base[i];
		chain = NULL;
	}

	/*
	 * If the bref is out of bounds we've exhausted our search.
	 */
found:
	if (bref->key > key_end) {
		*brefp = NULL;
		chain = NULL;
	} else {
		*brefp = bref;
	}
	return(chain);
}
5331 
5332 /*
5333  * Locate the specified block array element and delete it.  The element
5334  * must exist.
5335  *
5336  * The spin lock on the related chain must be held.
5337  *
5338  * NOTE: live_count was adjusted when the chain was deleted, so it does not
5339  *	 need to be adjusted when we commit the media change.
5340  */
void
hammer2_base_delete(hammer2_chain_t *parent,
		    hammer2_blockref_t *base, int count,
		    hammer2_chain_t *chain,
		    hammer2_blockref_t *obref)
{
	hammer2_blockref_t *elm = &chain->bref;
	hammer2_blockref_t *scan;
	hammer2_key_t key_next;
	int i;

	/*
	 * Delete element.  Expect the element to exist.
	 *
	 * XXX see caller, flush code not yet sophisticated enough to prevent
	 *     re-flushed in some cases.
	 */
	key_next = 0; /* max range */
	i = hammer2_base_find(parent, base, count, &key_next,
			      elm->key, elm->key);
	scan = &base[i];
	/*
	 * The slot must hold a live element with the same key.  A keybits
	 * mismatch is tolerated only while a blockmap update is pending.
	 */
	if (i == count || scan->type == HAMMER2_BREF_TYPE_EMPTY ||
	    scan->key != elm->key ||
	    ((chain->flags & HAMMER2_CHAIN_BLKMAPUPD) == 0 &&
	     scan->keybits != elm->keybits)) {
		hammer2_spin_unex(&parent->core.spin);
		panic("delete base %p element not found at %d/%d elm %p\n",
		      base, i, count, elm);
		return;
	}

	/*
	 * Update stats and zero the entry.
	 *
	 * NOTE: Handle radix == 0 (0 bytes) case.
	 */
	if ((int)(scan->data_off & HAMMER2_OFF_MASK_RADIX)) {
		parent->bref.embed.stats.data_count -= (hammer2_off_t)1 <<
				(int)(scan->data_off & HAMMER2_OFF_MASK_RADIX);
	}
	/*
	 * Per-type accounting.  INODE and DATA deliberately fall through
	 * to the INDIRECT leaf_count handling; leaf_count saturates at
	 * HAMMER2_BLOCKREF_LEAF_MAX, in which case the chain is flagged
	 * so the count can be recomputed later.
	 */
	switch(scan->type) {
	case HAMMER2_BREF_TYPE_INODE:
		--parent->bref.embed.stats.inode_count;
		/* fall through */
	case HAMMER2_BREF_TYPE_DATA:
		if (parent->bref.leaf_count == HAMMER2_BLOCKREF_LEAF_MAX) {
			atomic_set_int(&chain->flags,
				       HAMMER2_CHAIN_HINT_LEAF_COUNT);
		} else {
			if (parent->bref.leaf_count)
				--parent->bref.leaf_count;
		}
		/* fall through */
	case HAMMER2_BREF_TYPE_INDIRECT:
		if (scan->type != HAMMER2_BREF_TYPE_DATA) {
			parent->bref.embed.stats.data_count -=
				scan->embed.stats.data_count;
			parent->bref.embed.stats.inode_count -=
				scan->embed.stats.inode_count;
		}
		if (scan->type == HAMMER2_BREF_TYPE_INODE)
			break;
		if (parent->bref.leaf_count == HAMMER2_BLOCKREF_LEAF_MAX) {
			atomic_set_int(&chain->flags,
				       HAMMER2_CHAIN_HINT_LEAF_COUNT);
		} else {
			if (parent->bref.leaf_count <= scan->leaf_count)
				parent->bref.leaf_count = 0;
			else
				parent->bref.leaf_count -= scan->leaf_count;
		}
		break;
	case HAMMER2_BREF_TYPE_DIRENT:
		if (parent->bref.leaf_count == HAMMER2_BLOCKREF_LEAF_MAX) {
			atomic_set_int(&chain->flags,
				       HAMMER2_CHAIN_HINT_LEAF_COUNT);
		} else {
			if (parent->bref.leaf_count)
				--parent->bref.leaf_count;
		}
		/* fall through */
	default:
		break;
	}

	/*
	 * Return the old blockref to the caller (if requested) before
	 * zeroing the slot.
	 */
	if (obref)
		*obref = *scan;
	bzero(scan, sizeof(*scan));

	/*
	 * We can only optimize parent->core.live_zero for live chains.
	 */
	if (parent->core.live_zero == i + 1) {
		while (--i >= 0 && base[i].type == HAMMER2_BREF_TYPE_EMPTY)
			;
		parent->core.live_zero = i + 1;
	}

	/*
	 * Clear appropriate blockmap flags in chain.
	 */
	atomic_clear_int(&chain->flags, HAMMER2_CHAIN_BLKMAPPED |
					HAMMER2_CHAIN_BLKMAPUPD);
}
5444 
5445 /*
5446  * Insert the specified element.  The block array must not already have the
5447  * element and must have space available for the insertion.
5448  *
5449  * The spin lock on the related chain must be held.
5450  *
5451  * NOTE: live_count was adjusted when the chain was deleted, so it does not
5452  *	 need to be adjusted when we commit the media change.
5453  */
void
hammer2_base_insert(hammer2_chain_t *parent,
		    hammer2_blockref_t *base, int count,
		    hammer2_chain_t *chain, hammer2_blockref_t *elm)
{
	hammer2_key_t key_next;
	hammer2_key_t xkey;
	int i;
	int j;
	int k;
	int l;
	int u = 1;	/* records which insertion path ran (validate panic) */

	/*
	 * Insert new element.  Expect the element to not already exist
	 * unless we are replacing it.
	 *
	 * XXX see caller, flush code not yet sophisticated enough to prevent
	 *     re-flushed in some cases.
	 */
	key_next = 0; /* max range */
	i = hammer2_base_find(parent, base, count, &key_next,
			      elm->key, elm->key);

	/*
	 * Shortcut fill optimization, typical ordered insertion(s) may not
	 * require a search.
	 */
	KKASSERT(i >= 0 && i <= count);

	/*
	 * Set appropriate blockmap flags in chain (if not NULL)
	 */
	if (chain)
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_BLKMAPPED);

	/*
	 * Update stats and zero the entry
	 */
	if ((int)(elm->data_off & HAMMER2_OFF_MASK_RADIX)) {
		parent->bref.embed.stats.data_count += (hammer2_off_t)1 <<
				(int)(elm->data_off & HAMMER2_OFF_MASK_RADIX);
	}
	/*
	 * Per-type accounting, mirroring hammer2_base_delete().  INODE
	 * and DATA deliberately fall through; leaf_count saturates at
	 * HAMMER2_BLOCKREF_LEAF_MAX.
	 */
	switch(elm->type) {
	case HAMMER2_BREF_TYPE_INODE:
		++parent->bref.embed.stats.inode_count;
		/* fall through */
	case HAMMER2_BREF_TYPE_DATA:
		if (parent->bref.leaf_count != HAMMER2_BLOCKREF_LEAF_MAX)
			++parent->bref.leaf_count;
		/* fall through */
	case HAMMER2_BREF_TYPE_INDIRECT:
		if (elm->type != HAMMER2_BREF_TYPE_DATA) {
			parent->bref.embed.stats.data_count +=
				elm->embed.stats.data_count;
			parent->bref.embed.stats.inode_count +=
				elm->embed.stats.inode_count;
		}
		if (elm->type == HAMMER2_BREF_TYPE_INODE)
			break;
		if (parent->bref.leaf_count + elm->leaf_count <
		    HAMMER2_BLOCKREF_LEAF_MAX) {
			parent->bref.leaf_count += elm->leaf_count;
		} else {
			parent->bref.leaf_count = HAMMER2_BLOCKREF_LEAF_MAX;
		}
		break;
	case HAMMER2_BREF_TYPE_DIRENT:
		if (parent->bref.leaf_count != HAMMER2_BLOCKREF_LEAF_MAX)
			++parent->bref.leaf_count;
		break;
	default:
		break;
	}


	/*
	 * We can only optimize parent->core.live_zero for live chains.
	 * Appending past the last live element needs no shifting.
	 */
	if (i == count && parent->core.live_zero < count) {
		i = parent->core.live_zero++;
		base[i] = *elm;
		return;
	}

	/*
	 * Sanity check: the new element's key range must not overlap the
	 * element at the insertion point.
	 */
	xkey = elm->key + ((hammer2_key_t)1 << elm->keybits) - 1;
	if (i != count && (base[i].key < elm->key || xkey >= base[i].key)) {
		hammer2_spin_unex(&parent->core.spin);
		panic("insert base %p overlapping elements at %d elm %p\n",
		      base, i, elm);
	}

	/*
	 * Try to find an empty slot before or after.  j scans downward,
	 * k scans upward; whichever finds an EMPTY slot first determines
	 * which direction the intervening elements are shifted.
	 */
	j = i;
	k = i;
	while (j > 0 || k < count) {
		--j;
		if (j >= 0 && base[j].type == HAMMER2_BREF_TYPE_EMPTY) {
			/* empty slot below: shift [j+1, i) down one */
			if (j == i - 1) {
				base[j] = *elm;
			} else {
				bcopy(&base[j+1], &base[j],
				      (i - j - 1) * sizeof(*base));
				base[i - 1] = *elm;
			}
			goto validate;
		}
		++k;
		if (k < count && base[k].type == HAMMER2_BREF_TYPE_EMPTY) {
			/* empty slot above: shift [i, k) up one */
			bcopy(&base[i], &base[i+1],
			      (k - i) * sizeof(hammer2_blockref_t));
			base[i] = *elm;

			/*
			 * We can only update parent->core.live_zero for live
			 * chains.
			 */
			if (parent->core.live_zero <= k)
				parent->core.live_zero = k + 1;
			u = 2;
			goto validate;
		}
	}
	panic("hammer2_base_insert: no room!");

	/*
	 * Debugging: verify the array is still sorted with no
	 * overlapping key ranges.
	 */
validate:
	key_next = 0;
	for (l = 0; l < count; ++l) {
		if (base[l].type != HAMMER2_BREF_TYPE_EMPTY) {
			key_next = base[l].key +
				   ((hammer2_key_t)1 << base[l].keybits) - 1;
			break;
		}
	}
	while (++l < count) {
		if (base[l].type != HAMMER2_BREF_TYPE_EMPTY) {
			if (base[l].key <= key_next)
				panic("base_insert %d %d,%d,%d fail %p:%d", u, i, j, k, base, l);
			key_next = base[l].key +
				   ((hammer2_key_t)1 << base[l].keybits) - 1;

		}
	}

}
5604 
5605 #if 0
5606 
5607 /*
5608  * Sort the blockref array for the chain.  Used by the flush code to
5609  * sort the blockref[] array.
5610  *
5611  * The chain must be exclusively locked AND spin-locked.
5612  */
5613 typedef hammer2_blockref_t *hammer2_blockref_p;
5614 
5615 static
5616 int
5617 hammer2_base_sort_callback(const void *v1, const void *v2)
5618 {
5619 	hammer2_blockref_p bref1 = *(const hammer2_blockref_p *)v1;
5620 	hammer2_blockref_p bref2 = *(const hammer2_blockref_p *)v2;
5621 
5622 	/*
5623 	 * Make sure empty elements are placed at the end of the array
5624 	 */
5625 	if (bref1->type == HAMMER2_BREF_TYPE_EMPTY) {
5626 		if (bref2->type == HAMMER2_BREF_TYPE_EMPTY)
5627 			return(0);
5628 		return(1);
5629 	} else if (bref2->type == HAMMER2_BREF_TYPE_EMPTY) {
5630 		return(-1);
5631 	}
5632 
5633 	/*
5634 	 * Sort by key
5635 	 */
5636 	if (bref1->key < bref2->key)
5637 		return(-1);
5638 	if (bref1->key > bref2->key)
5639 		return(1);
5640 	return(0);
5641 }
5642 
void
hammer2_base_sort(hammer2_chain_t *chain)
{
	hammer2_blockref_t *base;
	int count;

	/*
	 * Locate the chain's blockref array and its element count based
	 * on the bref type.  (Currently compiled out -- see #if 0 above.)
	 */
	switch(chain->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		/*
		 * Special shortcut for embedded data returns the inode
		 * itself.  Callers must detect this condition and access
		 * the embedded data (the strategy code does this for us).
		 *
		 * This is only applicable to regular files and softlinks.
		 */
		if (chain->data->ipdata.meta.op_flags &
		    HAMMER2_OPFLAG_DIRECTDATA) {
			return;
		}
		base = &chain->data->ipdata.u.blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_INDIRECT:
		/*
		 * Optimize indirect blocks in the INITIAL state to avoid
		 * I/O.
		 */
		KKASSERT((chain->flags & HAMMER2_CHAIN_INITIAL) == 0);
		base = &chain->data->npdata[0];
		count = chain->bytes / sizeof(hammer2_blockref_t);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		base = &chain->data->voldata.sroot_blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP:
		base = &chain->data->blkset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	default:
		panic("hammer2_base_sort: unrecognized "
		      "blockref(A) type: %d",
		      chain->bref.type);
		base = NULL;	/* safety */
		count = 0;	/* safety */
		break;
	}
	kqsort(base, count, sizeof(*base), hammer2_base_sort_callback);
}
5693 
5694 #endif
5695 
5696 /*
5697  * Set the check data for a chain.  This can be a heavy-weight operation
5698  * and typically only runs on-flush.  For file data check data is calculated
5699  * when the logical buffers are flushed.
5700  */
5701 void
5702 hammer2_chain_setcheck(hammer2_chain_t *chain, void *bdata)
5703 {
5704 	atomic_clear_int(&chain->flags, HAMMER2_CHAIN_NOTTESTED);
5705 
5706 	switch(HAMMER2_DEC_CHECK(chain->bref.methods)) {
5707 	case HAMMER2_CHECK_NONE:
5708 		break;
5709 	case HAMMER2_CHECK_DISABLED:
5710 		break;
5711 	case HAMMER2_CHECK_ISCSI32:
5712 		chain->bref.check.iscsi32.value =
5713 			hammer2_icrc32(bdata, chain->bytes);
5714 		break;
5715 	case HAMMER2_CHECK_XXHASH64:
5716 		chain->bref.check.xxhash64.value =
5717 			XXH64(bdata, chain->bytes, XXH_HAMMER2_SEED);
5718 		break;
5719 	case HAMMER2_CHECK_SHA192:
5720 		assert(0); /* XXX unsupported */
5721 		/*
5722 		{
5723 			SHA256_CTX hash_ctx;
5724 			union {
5725 				uint8_t digest[SHA256_DIGEST_LENGTH];
5726 				uint64_t digest64[SHA256_DIGEST_LENGTH/8];
5727 			} u;
5728 
5729 			SHA256_Init(&hash_ctx);
5730 			SHA256_Update(&hash_ctx, bdata, chain->bytes);
5731 			SHA256_Final(u.digest, &hash_ctx);
5732 			u.digest64[2] ^= u.digest64[3];
5733 			bcopy(u.digest,
5734 			      chain->bref.check.sha192.data,
5735 			      sizeof(chain->bref.check.sha192.data));
5736 		}
5737 		*/
5738 		break;
5739 	case HAMMER2_CHECK_FREEMAP:
5740 		chain->bref.check.freemap.icrc32 =
5741 			hammer2_icrc32(bdata, chain->bytes);
5742 		break;
5743 	default:
5744 		kprintf("hammer2_chain_setcheck: unknown check type %02x\n",
5745 			chain->bref.methods);
5746 		break;
5747 	}
5748 }
5749 
5750 /*
5751  * Characterize a failed check code and try to trace back to the inode.
5752  */
5753 static void
5754 hammer2_characterize_failed_chain(hammer2_chain_t *chain, uint64_t check,
5755 				  int bits)
5756 {
5757 	hammer2_chain_t *lchain;
5758 	hammer2_chain_t *ochain;
5759 	int did;
5760 
5761 	did = krateprintf(&krate_h2chk,
5762 		"chain %016jx.%02x (%s) meth=%02x CHECK FAIL "
5763 		"(flags=%08x, bref/data ",
5764 		chain->bref.data_off,
5765 		chain->bref.type,
5766 		hammer2_bref_type_str(chain->bref.type),
5767 		chain->bref.methods,
5768 		chain->flags);
5769 	if (did == 0)
5770 		return;
5771 
5772 	if (bits == 32) {
5773 		kprintf("%08x/%08x)\n",
5774 			chain->bref.check.iscsi32.value,
5775 			(uint32_t)check);
5776 	} else {
5777 		kprintf("%016jx/%016jx)\n",
5778 			chain->bref.check.xxhash64.value,
5779 			check);
5780 	}
5781 
5782 	/*
5783 	 * Run up the chains to try to find the governing inode so we
5784 	 * can report it.
5785 	 *
5786 	 * XXX This error reporting is not really MPSAFE
5787 	 */
5788 	ochain = chain;
5789 	lchain = chain;
5790 	while (chain && chain->bref.type != HAMMER2_BREF_TYPE_INODE) {
5791 		lchain = chain;
5792 		chain = chain->parent;
5793 	}
5794 
5795 	if (chain && chain->bref.type == HAMMER2_BREF_TYPE_INODE &&
5796 	    ((chain->bref.flags & HAMMER2_BREF_FLAG_PFSROOT) == 0 ||
5797 	     (lchain->bref.key & HAMMER2_DIRHASH_VISIBLE))) {
5798 		kprintf("   Resides at/in inode %ld\n",
5799 			chain->bref.key);
5800 	} else if (chain && chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
5801 		kprintf("   Resides in inode index - CRITICAL!!!\n");
5802 	} else {
5803 		kprintf("   Resides in root index - CRITICAL!!!\n");
5804 	}
5805 	if (ochain->hmp) {
5806 		const char *pfsname = "UNKNOWN";
5807 		int i;
5808 
5809 		if (ochain->pmp) {
5810 			for (i = 0; i < HAMMER2_MAXCLUSTER; ++i) {
5811 				if (ochain->pmp->pfs_hmps[i] == ochain->hmp &&
5812 				    ochain->pmp->pfs_names[i]) {
5813 					pfsname = ochain->pmp->pfs_names[i];
5814 					break;
5815 				}
5816 			}
5817 		}
5818 		kprintf("   In pfs %s on device %s\n",
5819 			pfsname, ochain->hmp->devrepname);
5820 	}
5821 }
5822 
5823 /*
5824  * Returns non-zero on success, 0 on failure.
5825  */
/*
 * Validate the chain's media data against its stored check code.
 * Returns non-zero on success (check passed or not applicable),
 * 0 on failure.
 */
int
hammer2_chain_testcheck(hammer2_chain_t *chain, void *bdata)
{
	uint32_t check32;
	uint64_t check64;
	int r;

	/*
	 * A modified chain whose check code has not been regenerated yet
	 * cannot be validated against media data; report success.
	 */
	if (chain->flags & HAMMER2_CHAIN_NOTTESTED)
		return 1;

	switch(HAMMER2_DEC_CHECK(chain->bref.methods)) {
	case HAMMER2_CHECK_NONE:
		/* No check code maintained, always passes */
		r = 1;
		break;
	case HAMMER2_CHECK_DISABLED:
		/* Checking administratively disabled, always passes */
		r = 1;
		break;
	case HAMMER2_CHECK_ISCSI32:
		/* 32-bit iSCSI CRC over the chain's media data */
		check32 = hammer2_icrc32(bdata, chain->bytes);
		r = (chain->bref.check.iscsi32.value == check32);
		if (r == 0) {
			hammer2_characterize_failed_chain(chain, check32, 32);
		}
		/* accounting: bytes processed by this check method */
		hammer2_process_icrc32 += chain->bytes;
		break;
	case HAMMER2_CHECK_XXHASH64:
		/* 64-bit xxhash over the chain's media data */
		check64 = XXH64(bdata, chain->bytes, XXH_HAMMER2_SEED);
		r = (chain->bref.check.xxhash64.value == check64);
		if (r == 0) {
			hammer2_characterize_failed_chain(chain, check64, 64);
		}
		/* accounting: bytes processed by this check method */
		hammer2_process_xxhash64 += chain->bytes;
		break;
	case HAMMER2_CHECK_SHA192:
		/* Not implemented in this port; original code retained below */
		assert(0); /* XXX unsupported */
		/*
		{
			SHA256_CTX hash_ctx;
			union {
				uint8_t digest[SHA256_DIGEST_LENGTH];
				uint64_t digest64[SHA256_DIGEST_LENGTH/8];
			} u;

			SHA256_Init(&hash_ctx);
			SHA256_Update(&hash_ctx, bdata, chain->bytes);
			SHA256_Final(u.digest, &hash_ctx);
			u.digest64[2] ^= u.digest64[3];
			if (bcmp(u.digest,
				 chain->bref.check.sha192.data,
			         sizeof(chain->bref.check.sha192.data)) == 0) {
				r = 1;
			} else {
				r = 0;
				krateprintf(&krate_h2chk,
					"chain %016jx.%02x meth=%02x "
					"CHECK FAIL\n",
					chain->bref.data_off,
					chain->bref.type,
					chain->bref.methods);
			}
		}
		*/
		break;
	case HAMMER2_CHECK_FREEMAP:
		/* Freemap chains use a plain icrc32 in a separate field */
		r = (chain->bref.check.freemap.icrc32 ==
		     hammer2_icrc32(bdata, chain->bytes));
		if (r == 0) {
			int did;

			/* rate-limited; extra detail only if line printed */
			did = krateprintf(&krate_h2chk,
					  "chain %016jx.%02x meth=%02x "
					  "CHECK FAIL\n",
					  chain->bref.data_off,
					  chain->bref.type,
					  chain->bref.methods);
			if (did) {
				kprintf("freemap.icrc %08x icrc32 %08x (%d)\n",
					chain->bref.check.freemap.icrc32,
					hammer2_icrc32(bdata, chain->bytes),
					chain->bytes);
				if (chain->dio) {
					kprintf("dio %p buf %016jx,%ld "
						"bdata %p/%p\n",
						chain->dio,
						(intmax_t)chain->dio->bp->b_loffset,
						chain->dio->bp->b_bufsize,
						bdata,
						chain->dio->bp->b_data);
				}
			}
		}
		break;
	default:
		/* Unknown method: warn but do not fail the chain */
		kprintf("hammer2_chain_testcheck: unknown check type %02x\n",
			chain->bref.methods);
		r = 1;
		break;
	}
	return r;
}
5926 
5927 /*
5928  * Acquire the chain and parent representing the specified inode for the
5929  * device at the specified cluster index.
5930  *
5931  * The flags passed in are LOOKUP flags, not RESOLVE flags.
5932  *
5933  * If we are unable to locate the inode, HAMMER2_ERROR_EIO or HAMMER2_ERROR_CHECK
5934  * is returned.  In case of error, *chainp and/or *parentp may still be returned
5935  * non-NULL.
5936  *
5937  * The caller may pass-in a locked *parentp and/or *chainp, or neither.
5938  * They will be unlocked and released by this function.  The *parentp and
5939  * *chainp representing the located inode are returned locked.
5940  *
5941  * The returned error includes any error on the returned chain in addition to
5942  * errors incurred while trying to lookup the inode.  However, a chain->error
5943  * might not be recognized if HAMMER2_LOOKUP_NODATA is passed.  This flag may
5944  * not be passed to this function.
5945  */
5946 int
5947 hammer2_chain_inode_find(hammer2_pfs_t *pmp, hammer2_key_t inum,
5948 			 int clindex, int flags,
5949 			 hammer2_chain_t **parentp, hammer2_chain_t **chainp)
5950 {
5951 	hammer2_chain_t *parent;
5952 	hammer2_chain_t *rchain;
5953 	hammer2_key_t key_dummy;
5954 	hammer2_inode_t *ip;
5955 	int resolve_flags;
5956 	int error;
5957 
5958 	KKASSERT((flags & HAMMER2_LOOKUP_NODATA) == 0);
5959 
5960 	resolve_flags = (flags & HAMMER2_LOOKUP_SHARED) ?
5961 			HAMMER2_RESOLVE_SHARED : 0;
5962 
5963 	/*
5964 	 * Caller expects us to replace these.
5965 	 */
5966 	if (*chainp) {
5967 		hammer2_chain_unlock(*chainp);
5968 		hammer2_chain_drop(*chainp);
5969 		*chainp = NULL;
5970 	}
5971 	if (*parentp) {
5972 		hammer2_chain_unlock(*parentp);
5973 		hammer2_chain_drop(*parentp);
5974 		*parentp = NULL;
5975 	}
5976 
5977 	/*
5978 	 * Be very careful, this is a backend function and we CANNOT
5979 	 * lock any frontend inode structure we find.  But we have to
5980 	 * look the inode up this way first in case it exists but is
5981 	 * detached from the radix tree.
5982 	 */
5983 	ip = hammer2_inode_lookup(pmp, inum);
5984 	if (ip) {
5985 		*chainp = hammer2_inode_chain_and_parent(ip, clindex,
5986 						       parentp,
5987 						       resolve_flags);
5988 		hammer2_inode_drop(ip);
5989 		if (*chainp)
5990 			return (*chainp)->error;
5991 		hammer2_chain_unlock(*chainp);
5992 		hammer2_chain_drop(*chainp);
5993 		*chainp = NULL;
5994 		if (*parentp) {
5995 			hammer2_chain_unlock(*parentp);
5996 			hammer2_chain_drop(*parentp);
5997 			*parentp = NULL;
5998 		}
5999 	}
6000 
6001 	/*
6002 	 * Inodes hang off of the iroot (bit 63 is clear, differentiating
6003 	 * inodes from root directory entries in the key lookup).
6004 	 */
6005 	parent = hammer2_inode_chain(pmp->iroot, clindex, resolve_flags);
6006 	rchain = NULL;
6007 	if (parent) {
6008 		/*
6009 		 * NOTE: rchain can be returned as NULL even if error == 0
6010 		 *	 (i.e. not found)
6011 		 */
6012 		rchain = hammer2_chain_lookup(&parent, &key_dummy,
6013 					      inum, inum,
6014 					      &error, flags);
6015 		/*
6016 		 * Propagate a chain-specific error to caller.
6017 		 *
6018 		 * If the chain is not errored, we must still validate that the inode
6019 		 * number is correct, because all hell will break loose if it isn't
6020 		 * correct.  It should always be correct so print to the console and
6021 		 * simulate a CHECK error if it is not.
6022 		 */
6023 		if (error == 0 && rchain) {
6024 			error = rchain->error;
6025 			if (error == 0 && rchain->data) {
6026 				if (inum != rchain->data->ipdata.meta.inum) {
6027 					kprintf("hammer2_chain_inode_find: lookup inum %ld, "
6028 						"got valid inode but with inum %ld\n",
6029 						inum, rchain->data->ipdata.meta.inum);
6030 					error = HAMMER2_ERROR_CHECK;
6031 					rchain->error = error;
6032 				}
6033 			}
6034 		}
6035 	} else {
6036 		error = HAMMER2_ERROR_EIO;
6037 	}
6038 	*parentp = parent;
6039 	*chainp = rchain;
6040 
6041 	return error;
6042 }
6043 
6044 /*
6045  * Used by the bulkscan code to snapshot the synchronized storage for
6046  * a volume, allowing it to be scanned concurrently against normal
6047  * operation.
6048  */
6049 hammer2_chain_t *
6050 hammer2_chain_bulksnap(hammer2_dev_t *hmp)
6051 {
6052 	hammer2_chain_t *copy;
6053 
6054 	copy = hammer2_chain_alloc(hmp, hmp->spmp, &hmp->vchain.bref);
6055 	copy->data = kmalloc(sizeof(copy->data->voldata),
6056 			     hmp->mmsg, M_WAITOK | M_ZERO);
6057 	hammer2_voldata_lock(hmp);
6058 	copy->data->voldata = hmp->volsync;
6059 	hammer2_voldata_unlock(hmp);
6060 
6061 	return copy;
6062 }
6063 
6064 void
6065 hammer2_chain_bulkdrop(hammer2_chain_t *copy)
6066 {
6067 	KKASSERT(copy->bref.type == HAMMER2_BREF_TYPE_VOLUME);
6068 	KKASSERT(copy->data);
6069 	kfree(copy->data, copy->hmp->mmsg);
6070 	copy->data = NULL;
6071 	hammer2_chain_drop(copy);
6072 }
6073 
6074 /*
6075  * Returns non-zero if the chain (INODE or DIRENT) matches the
6076  * filename.
6077  */
6078 int
6079 hammer2_chain_dirent_test(hammer2_chain_t *chain, const char *name,
6080 			  size_t name_len)
6081 {
6082 	const hammer2_inode_data_t *ripdata;
6083 
6084 	if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
6085 		ripdata = &chain->data->ipdata;
6086 		if (ripdata->meta.name_len == name_len &&
6087 		    bcmp(ripdata->filename, name, name_len) == 0) {
6088 			return 1;
6089 		}
6090 	}
6091 	if (chain->bref.type == HAMMER2_BREF_TYPE_DIRENT &&
6092 	    chain->bref.embed.dirent.namlen == name_len) {
6093 		if (name_len > sizeof(chain->bref.check.buf) &&
6094 		    bcmp(chain->data->buf, name, name_len) == 0) {
6095 			return 1;
6096 		}
6097 		if (name_len <= sizeof(chain->bref.check.buf) &&
6098 		    bcmp(chain->bref.check.buf, name, name_len) == 0) {
6099 			return 1;
6100 		}
6101 	}
6102 	return 0;
6103 }
6104 
6105 /*
6106  * Debugging
6107  */
6108 void
6109 hammer2_dump_chain(hammer2_chain_t *chain, int tab, int bi, int *countp,
6110 		   char pfx, u_int flags)
6111 {
6112 	hammer2_chain_t *scan;
6113 	hammer2_chain_t *parent;
6114 
6115 	--*countp;
6116 	if (*countp == 0) {
6117 		kprintf("%*.*s...\n", tab, tab, "");
6118 		return;
6119 	}
6120 	if (*countp < 0)
6121 		return;
6122 	kprintf("%*.*s%c-chain %p %s.%-3d %016jx %016jx/%-2d mir=%016jx\n",
6123 		tab, tab, "", pfx, chain,
6124 		hammer2_bref_type_str(chain->bref.type), bi,
6125 		chain->bref.data_off, chain->bref.key, chain->bref.keybits,
6126 		chain->bref.mirror_tid);
6127 
6128 	kprintf("%*.*s      [%08x] (%s) refs=%d",
6129 		tab, tab, "",
6130 		chain->flags,
6131 		((chain->bref.type == HAMMER2_BREF_TYPE_INODE &&
6132 		chain->data) ?  (char *)chain->data->ipdata.filename : "?"),
6133 		chain->refs);
6134 
6135 	parent = chain->parent;
6136 	if (parent)
6137 		kprintf("\n%*.*s      p=%p [pflags %08x prefs %d]",
6138 			tab, tab, "",
6139 			parent, parent->flags, parent->refs);
6140 	if (RB_EMPTY(&chain->core.rbtree)) {
6141 		kprintf("\n");
6142 	} else {
6143 		int bi = 0;
6144 		kprintf(" {\n");
6145 		RB_FOREACH(scan, hammer2_chain_tree, &chain->core.rbtree) {
6146 			if ((scan->flags & flags) || flags == (u_int)-1) {
6147 				hammer2_dump_chain(scan, tab + 4, bi, countp,
6148 						   'a', flags);
6149 			}
6150 			bi++;
6151 		}
6152 		if (chain->bref.type == HAMMER2_BREF_TYPE_INODE && chain->data)
6153 			kprintf("%*.*s}(%s)\n", tab, tab, "",
6154 				chain->data->ipdata.filename);
6155 		else
6156 			kprintf("%*.*s}\n", tab, tab, "");
6157 	}
6158 }
6159 
6160 void
6161 hammer2_dump_chains(hammer2_dev_t *hmp, char vpfx, char fpfx)
6162 {
6163 	int dumpcnt;
6164 
6165 	dumpcnt = 50;
6166 	hammer2_dump_chain(&hmp->vchain, 0, 0, &dumpcnt, vpfx, (u_int)-1);
6167 
6168 	dumpcnt = 50;
6169 	hammer2_dump_chain(&hmp->fchain, 0, 0, &dumpcnt, fpfx, (u_int)-1);
6170 }
6171