xref: /dragonfly/sys/vfs/hammer2/hammer2_chain.c (revision a72de5ad)
1 /*
2  * Copyright (c) 2011-2015 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  * and Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in
16  *    the documentation and/or other materials provided with the
17  *    distribution.
18  * 3. Neither the name of The DragonFly Project nor the names of its
19  *    contributors may be used to endorse or promote products derived
20  *    from this software without specific, prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
26  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 /*
36  * This subsystem implements most of the core support functions for
37  * the hammer2_chain structure.
38  *
39  * Chains are the in-memory version on media objects (volume header, inodes,
40  * indirect blocks, data blocks, etc).  Chains represent a portion of the
41  * HAMMER2 topology.
42  *
43  * Chains are no-longer delete-duplicated.  Instead, the original in-memory
44  * chain will be moved along with its block reference (e.g. for things like
45  * renames, hardlink operations, modifications, etc), and will be indexed
46  * on a secondary list for flush handling instead of propagating a flag
47  * upward to the root.
48  *
49  * Concurrent front-end operations can still run against backend flushes
50  * as long as they do not cross the current flush boundary.  An operation
51  * running above the current flush (in areas not yet flushed) can become
52  * part of the current flush while ano peration running below the current
53  * flush can become part of the next flush.
54  */
55 #include <sys/cdefs.h>
56 #include <sys/param.h>
57 #include <sys/systm.h>
58 #include <sys/types.h>
59 #include <sys/lock.h>
60 #include <sys/kern_syscall.h>
61 #include <sys/uuid.h>
62 
63 #include <crypto/sha2/sha2.h>
64 
65 #include "hammer2.h"
66 
67 static hammer2_chain_t *hammer2_chain_create_indirect(
68 		hammer2_chain_t *parent,
69 		hammer2_key_t key, int keybits,
70 		hammer2_tid_t mtid, int for_type, int *errorp);
71 static hammer2_io_t *hammer2_chain_drop_data(hammer2_chain_t *chain);
72 static hammer2_chain_t *hammer2_combined_find(
73 		hammer2_chain_t *parent,
74 		hammer2_blockref_t *base, int count,
75 		hammer2_key_t *key_nextp,
76 		hammer2_key_t key_beg, hammer2_key_t key_end,
77 		hammer2_blockref_t **bresp);
78 
79 static struct krate krate_h2me = { .freq = 1 };
80 
81 /*
82  * Basic RBTree for chains (core->rbtree and core->dbtree).  Chains cannot
83  * overlap in the RB trees.  Deleted chains are moved from rbtree to either
84  * dbtree or to dbq.
85  *
86  * Chains in delete-duplicate sequences can always iterate through core_entry
87  * to locate the live version of the chain.
88  */
89 RB_GENERATE(hammer2_chain_tree, hammer2_chain, rbnode, hammer2_chain_cmp);
90 
91 int
92 hammer2_chain_cmp(hammer2_chain_t *chain1, hammer2_chain_t *chain2)
93 {
94 	hammer2_key_t c1_beg;
95 	hammer2_key_t c1_end;
96 	hammer2_key_t c2_beg;
97 	hammer2_key_t c2_end;
98 
99 	/*
100 	 * Compare chains.  Overlaps are not supposed to happen and catch
101 	 * any software issues early we count overlaps as a match.
102 	 */
103 	c1_beg = chain1->bref.key;
104 	c1_end = c1_beg + ((hammer2_key_t)1 << chain1->bref.keybits) - 1;
105 	c2_beg = chain2->bref.key;
106 	c2_end = c2_beg + ((hammer2_key_t)1 << chain2->bref.keybits) - 1;
107 
108 	if (c1_end < c2_beg)	/* fully to the left */
109 		return(-1);
110 	if (c1_beg > c2_end)	/* fully to the right */
111 		return(1);
112 	return(0);		/* overlap (must not cross edge boundary) */
113 }
114 
115 /*
116  * Assert that a chain has no media data associated with it.
117  */
118 static __inline void
119 hammer2_chain_assert_no_data(hammer2_chain_t *chain)
120 {
121 	KKASSERT(chain->dio == NULL);
122 	if (chain->bref.type != HAMMER2_BREF_TYPE_VOLUME &&
123 	    chain->bref.type != HAMMER2_BREF_TYPE_FREEMAP &&
124 	    chain->data) {
125 		panic("hammer2_assert_no_data: chain %p still has data", chain);
126 	}
127 }
128 
129 /*
130  * Make a chain visible to the flusher.  The flusher needs to be able to
131  * do flushes of subdirectory chains or single files so it does a top-down
132  * recursion using the ONFLUSH flag for the recursion.  It locates MODIFIED
133  * or UPDATE chains and flushes back up the chain to the volume root.
134  *
135  * This routine sets ONFLUSH upward until it hits the volume root.  For
136  * simplicity we ignore PFSROOT boundaries whos rules can be complex.
137  * Extra ONFLUSH flagging doesn't hurt the filesystem.
138  */
139 void
140 hammer2_chain_setflush(hammer2_chain_t *chain)
141 {
142 	hammer2_chain_t *parent;
143 
144 	if ((chain->flags & HAMMER2_CHAIN_ONFLUSH) == 0) {
145 		hammer2_spin_sh(&chain->core.spin);
146 		while ((chain->flags & HAMMER2_CHAIN_ONFLUSH) == 0) {
147 			atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONFLUSH);
148 			if ((parent = chain->parent) == NULL)
149 				break;
150 			hammer2_spin_sh(&parent->core.spin);
151 			hammer2_spin_unsh(&chain->core.spin);
152 			chain = parent;
153 		}
154 		hammer2_spin_unsh(&chain->core.spin);
155 	}
156 }
157 
158 /*
159  * Allocate a new disconnected chain element representing the specified
160  * bref.  chain->refs is set to 1 and the passed bref is copied to
161  * chain->bref.  chain->bytes is derived from the bref.
162  *
163  * chain->pmp inherits pmp unless the chain is an inode (other than the
164  * super-root inode).
165  *
166  * NOTE: Returns a referenced but unlocked (because there is no core) chain.
167  */
168 hammer2_chain_t *
169 hammer2_chain_alloc(hammer2_dev_t *hmp, hammer2_pfs_t *pmp,
170 		    hammer2_blockref_t *bref)
171 {
172 	hammer2_chain_t *chain;
173 	u_int bytes;
174 
175 	/*
176 	 * Special case - radix of 0 indicates a chain that does not
177 	 * need a data reference (context is completely embedded in the
178 	 * bref).
179 	 */
180 	if ((int)(bref->data_off & HAMMER2_OFF_MASK_RADIX))
181 		bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
182 	else
183 		bytes = 0;
184 
185 	atomic_add_long(&hammer2_chain_allocs, 1);
186 
187 	/*
188 	 * Construct the appropriate system structure.
189 	 */
190 	switch(bref->type) {
191 	case HAMMER2_BREF_TYPE_DIRENT:
192 	case HAMMER2_BREF_TYPE_INODE:
193 	case HAMMER2_BREF_TYPE_INDIRECT:
194 	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
195 	case HAMMER2_BREF_TYPE_DATA:
196 	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
197 		/*
198 		 * Chain's are really only associated with the hmp but we
199 		 * maintain a pmp association for per-mount memory tracking
200 		 * purposes.  The pmp can be NULL.
201 		 */
202 		chain = kmalloc(sizeof(*chain), hmp->mchain, M_WAITOK | M_ZERO);
203 		break;
204 	case HAMMER2_BREF_TYPE_VOLUME:
205 	case HAMMER2_BREF_TYPE_FREEMAP:
206 		/*
207 		 * Only hammer2_chain_bulksnap() calls this function with these
208 		 * types.
209 		 */
210 		chain = kmalloc(sizeof(*chain), hmp->mchain, M_WAITOK | M_ZERO);
211 		break;
212 	default:
213 		chain = NULL;
214 		panic("hammer2_chain_alloc: unrecognized blockref type: %d",
215 		      bref->type);
216 	}
217 
218 	/*
219 	 * Initialize the new chain structure.  pmp must be set to NULL for
220 	 * chains belonging to the super-root topology of a device mount.
221 	 */
222 	if (pmp == hmp->spmp)
223 		chain->pmp = NULL;
224 	else
225 		chain->pmp = pmp;
226 	chain->hmp = hmp;
227 	chain->bref = *bref;
228 	chain->bytes = bytes;
229 	chain->refs = 1;
230 	chain->flags = HAMMER2_CHAIN_ALLOCATED;
231 
232 	/*
233 	 * Set the PFS boundary flag if this chain represents a PFS root.
234 	 */
235 	if (bref->flags & HAMMER2_BREF_FLAG_PFSROOT)
236 		atomic_set_int(&chain->flags, HAMMER2_CHAIN_PFSBOUNDARY);
237 	hammer2_chain_core_init(chain);
238 
239 	return (chain);
240 }
241 
242 /*
243  * Initialize a chain's core structure.  This structure used to be allocated
244  * but is now embedded.
245  *
246  * The core is not locked.  No additional refs on the chain are made.
247  * (trans) must not be NULL if (core) is not NULL.
248  */
249 void
250 hammer2_chain_core_init(hammer2_chain_t *chain)
251 {
252 	/*
253 	 * Fresh core under nchain (no multi-homing of ochain's
254 	 * sub-tree).
255 	 */
256 	RB_INIT(&chain->core.rbtree);	/* live chains */
257 	hammer2_mtx_init(&chain->lock, "h2chain");
258 }
259 
260 /*
261  * Add a reference to a chain element, preventing its destruction.
262  *
263  * (can be called with spinlock held)
264  */
265 void
266 hammer2_chain_ref(hammer2_chain_t *chain)
267 {
268 	if (atomic_fetchadd_int(&chain->refs, 1) == 0) {
269 		/*
270 		 * 0->non-zero transition must ensure that chain is removed
271 		 * from the LRU list.
272 		 *
273 		 * NOTE: Already holding lru_spin here so we cannot call
274 		 *	 hammer2_chain_ref() to get it off lru_list, do
275 		 *	 it manually.
276 		 */
277 		if (chain->flags & HAMMER2_CHAIN_ONLRU) {
278 			hammer2_pfs_t *pmp = chain->pmp;
279 			hammer2_spin_ex(&pmp->lru_spin);
280 			if (chain->flags & HAMMER2_CHAIN_ONLRU) {
281 				atomic_add_int(&pmp->lru_count, -1);
282 				atomic_clear_int(&chain->flags,
283 						 HAMMER2_CHAIN_ONLRU);
284 				TAILQ_REMOVE(&pmp->lru_list, chain, lru_node);
285 			}
286 			hammer2_spin_unex(&pmp->lru_spin);
287 		}
288 	}
289 #if 0
290 	kprintf("REFC %p %d %08x\n", chain, chain->refs - 1, chain->flags);
291 	print_backtrace(8);
292 #endif
293 }
294 
295 /*
296  * Ref a locked chain and force the data to be held across an unlock.
297  * Chain must be currently locked.  The user of the chain who desires
298  * to release the hold must call hammer2_chain_lock_unhold() to relock
299  * and unhold the chain, then unlock normally, or may simply call
300  * hammer2_chain_drop_unhold() (which is safer against deadlocks).
301  */
302 void
303 hammer2_chain_ref_hold(hammer2_chain_t *chain)
304 {
305 	atomic_add_int(&chain->lockcnt, 1);
306 	hammer2_chain_ref(chain);
307 }
308 
309 /*
310  * Insert the chain in the core rbtree.
311  *
312  * Normal insertions are placed in the live rbtree.  Insertion of a deleted
313  * chain is a special case used by the flush code that is placed on the
314  * unstaged deleted list to avoid confusing the live view.
315  */
316 #define HAMMER2_CHAIN_INSERT_SPIN	0x0001
317 #define HAMMER2_CHAIN_INSERT_LIVE	0x0002
318 #define HAMMER2_CHAIN_INSERT_RACE	0x0004
319 
320 static
321 int
322 hammer2_chain_insert(hammer2_chain_t *parent, hammer2_chain_t *chain,
323 		     int flags, int generation)
324 {
325 	hammer2_chain_t *xchain;
326 	int error = 0;
327 
328 	if (flags & HAMMER2_CHAIN_INSERT_SPIN)
329 		hammer2_spin_ex(&parent->core.spin);
330 
331 	/*
332 	 * Interlocked by spinlock, check for race
333 	 */
334 	if ((flags & HAMMER2_CHAIN_INSERT_RACE) &&
335 	    parent->core.generation != generation) {
336 		error = HAMMER2_ERROR_EAGAIN;
337 		goto failed;
338 	}
339 
340 	/*
341 	 * Insert chain
342 	 */
343 	xchain = RB_INSERT(hammer2_chain_tree, &parent->core.rbtree, chain);
344 	KASSERT(xchain == NULL,
345 		("hammer2_chain_insert: collision %p %p (key=%016jx)",
346 		chain, xchain, chain->bref.key));
347 	atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
348 	chain->parent = parent;
349 	++parent->core.chain_count;
350 	++parent->core.generation;	/* XXX incs for _get() too, XXX */
351 
352 	/*
353 	 * We have to keep track of the effective live-view blockref count
354 	 * so the create code knows when to push an indirect block.
355 	 */
356 	if (flags & HAMMER2_CHAIN_INSERT_LIVE)
357 		atomic_add_int(&parent->core.live_count, 1);
358 failed:
359 	if (flags & HAMMER2_CHAIN_INSERT_SPIN)
360 		hammer2_spin_unex(&parent->core.spin);
361 	return error;
362 }
363 
364 /*
365  * Drop the caller's reference to the chain.  When the ref count drops to
366  * zero this function will try to disassociate the chain from its parent and
367  * deallocate it, then recursely drop the parent using the implied ref
368  * from the chain's chain->parent.
369  *
370  * Nobody should own chain's mutex on the 1->0 transition, unless this drop
371  * races an acquisition by another cpu.  Therefore we can loop if we are
372  * unable to acquire the mutex, and refs is unlikely to be 1 unless we again
373  * race against another drop.
374  */
375 static hammer2_chain_t *hammer2_chain_lastdrop(hammer2_chain_t *chain);
376 
377 void
378 hammer2_chain_drop(hammer2_chain_t *chain)
379 {
380 	u_int refs;
381 
382 	if (hammer2_debug & 0x200000)
383 		Debugger("drop");
384 #if 0
385 	kprintf("DROP %p %d %08x\n", chain, chain->refs - 1, chain->flags);
386 	print_backtrace(8);
387 #endif
388 
389 	KKASSERT(chain->refs > 0);
390 
391 	while (chain) {
392 		refs = chain->refs;
393 		cpu_ccfence();
394 		KKASSERT(refs > 0);
395 
396 		if (refs == 1) {
397 			if (mtx_lock_ex_try(&chain->lock) == 0)
398 				chain = hammer2_chain_lastdrop(chain);
399 			/* retry the same chain */
400 		} else {
401 			if (atomic_cmpset_int(&chain->refs, refs, refs - 1))
402 				break;
403 			/* retry the same chain */
404 		}
405 		cpu_pause();
406 	}
407 }
408 
409 /*
410  * Unhold a held and probably not-locked chain, ensure that the data is
411  * dropped on the 1->0 transition of lockcnt by obtaining an exclusive
412  * lock and then simply unlocking the chain.
413  */
414 void
415 hammer2_chain_drop_unhold(hammer2_chain_t *chain)
416 {
417 	u_int lockcnt;
418 	int iter = 0;
419 
420 	for (;;) {
421 		lockcnt = chain->lockcnt;
422 		cpu_ccfence();
423 		if (lockcnt > 1) {
424 			if (atomic_cmpset_int(&chain->lockcnt,
425 					      lockcnt, lockcnt - 1)) {
426 				break;
427 			}
428 		} else if (mtx_lock_ex_try(&chain->lock) == 0) {
429 			hammer2_chain_unlock(chain);
430 			break;
431 		} else {
432 			/*
433 			 * This situation can easily occur on SMP due to
434 			 * the gap inbetween the 1->0 transition and the
435 			 * final unlock.  We cannot safely block on the
436 			 * mutex because lockcnt might go above 1.
437 			 *
438 			 * XXX Sleep for one tick if it takes too long.
439 			 */
440 			if (++iter > 1000) {
441 				if (iter > 1000 + hz) {
442 					kprintf("hammer2: h2race1 %p\n", chain);
443 					iter = 1000;
444 				}
445 				tsleep(&iter, 0, "h2race1", 1);
446 			}
447 			cpu_pause();
448 		}
449 	}
450 	hammer2_chain_drop(chain);
451 }
452 
453 /*
454  * Handles the (potential) last drop of chain->refs from 1->0.  Called with
455  * the mutex exclusively locked, refs == 1, and lockcnt 0.  SMP races are
456  * possible against refs and lockcnt.  We must dispose of the mutex on chain.
457  *
458  * This function returns an unlocked chain for recursive drop or NULL.  It
459  * can return the same chain if it determines it has raced another ref.
460  *
461  * --
462  *
463  * When two chains need to be recursively dropped we use the chain we
464  * would otherwise free to placehold the additional chain.  It's a bit
465  * convoluted but we can't just recurse without potentially blowing out
466  * the kernel stack.
467  *
468  * The chain cannot be freed if it has any children.
469  * The chain cannot be freed if flagged MODIFIED unless we can dispose of it.
470  * The chain cannot be freed if flagged UPDATE unless we can dispose of it.
471  * Any dedup registration can remain intact.
472  *
473  * The core spinlock is allowed to nest child-to-parent (not parent-to-child).
474  */
475 static
476 hammer2_chain_t *
477 hammer2_chain_lastdrop(hammer2_chain_t *chain)
478 {
479 	hammer2_pfs_t *pmp;
480 	hammer2_dev_t *hmp;
481 	hammer2_chain_t *parent;
482 	hammer2_chain_t *rdrop;
483 #if 0
484 	hammer2_io_t *dio;
485 #endif
486 
487 #if 0
488 	/*
489 	 * On last drop if there is no parent and data_off is good (at
490 	 * least does not represent the volume root), the modified chain
491 	 * is probably going to be destroyed.  We have to make sure that
492 	 * the data area is not registered for dedup.
493 	 *
494 	 * XXX removed. In fact, we do not have to make sure that the
495 	 *     data area is not registered for dedup.  The data area
496 	 *     can, in fact, still be used for dedup because it is
497 	 *     still allocated in the freemap and the underlying I/O
498 	 *     will still be flushed.
499 	 */
500 	if (chain->parent == NULL &&
501 	    (chain->flags & HAMMER2_CHAIN_MODIFIED) &&
502 	    (chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX)) {
503 		hmp = chain->hmp;
504 		hammer2_io_dedup_delete(hmp, chain->bref.type,
505 					chain->bref.data_off, chain->bytes);
506 	}
507 #endif
508 	/*
509 	 * We need chain's spinlock to interlock the sub-tree test.
510 	 * We already have chain's mutex, protecting chain->parent.
511 	 *
512 	 * Remember that chain->refs can be in flux.
513 	 */
514 	hammer2_spin_ex(&chain->core.spin);
515 
516 	if ((parent = chain->parent) != NULL) {
517 		/*
518 		 * If the chain has a parent the UPDATE bit prevents scrapping
519 		 * as the chain is needed to properly flush the parent.  Try
520 		 * to complete the 1->0 transition and return NULL.  Retry
521 		 * (return chain) if we are unable to complete the 1->0
522 		 * transition, else return NULL (nothing more to do).
523 		 *
524 		 * If the chain has a parent the MODIFIED bit prevents
525 		 * scrapping.
526 		 *
527 		 * Chains with UPDATE/MODIFIED are *not* put on the LRU list!
528 		 */
529 		if (chain->flags & (HAMMER2_CHAIN_UPDATE |
530 				    HAMMER2_CHAIN_MODIFIED)) {
531 			if (atomic_cmpset_int(&chain->refs, 1, 0)) {
532 				hammer2_spin_unex(&chain->core.spin);
533 #if 0
534 				dio = hammer2_chain_drop_data(chain, 0);
535 				if (dio)
536 					hammer2_io_bqrelse(&dio);
537 #endif
538 				hammer2_chain_assert_no_data(chain);
539 				hammer2_mtx_unlock(&chain->lock);
540 				chain = NULL;
541 			} else {
542 				hammer2_spin_unex(&chain->core.spin);
543 				hammer2_mtx_unlock(&chain->lock);
544 			}
545 			return (chain);
546 		}
547 		/* spinlock still held */
548 	} else {
549 		/*
550 		 * The chain has no parent and can be flagged for destruction.
551 		 * Since it has no parent, UPDATE can also be cleared.
552 		 */
553 		atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY);
554 		if (chain->flags & HAMMER2_CHAIN_UPDATE)
555 			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
556 
557 		/*
558 		 * If the chain has children we must still flush the chain.
559 		 * Any dedup is already handled by the underlying DIO, so
560 		 * we do not have to specifically flush it here.
561 		 *
562 		 * In the case where it has children, the DESTROY flag test
563 		 * in the flush code will prevent unnecessary flushes of
564 		 * MODIFIED chains that are not flagged DEDUP so don't worry
565 		 * about that here.
566 		 */
567 		if (chain->core.chain_count) {
568 			/*
569 			 * Put on flushq (should ensure refs > 1), retry
570 			 * the drop.
571 			 */
572 			hammer2_spin_unex(&chain->core.spin);
573 			hammer2_delayed_flush(chain);
574 			hammer2_mtx_unlock(&chain->lock);
575 
576 			return(chain);	/* retry drop */
577 		}
578 
579 		/*
580 		 * Otherwise we can scrap the MODIFIED bit if it is set,
581 		 * and continue along the freeing path.
582 		 *
583 		 * Be sure to clean-out any dedup bits.  Without a parent
584 		 * this chain will no longer be visible to the flush code.
585 		 * Easy check data_off to avoid the volume root.
586 		 */
587 		if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
588 			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
589 			atomic_add_long(&hammer2_count_modified_chains, -1);
590 			if (chain->pmp)
591 				hammer2_pfs_memory_wakeup(chain->pmp);
592 		}
593 		/* spinlock still held */
594 	}
595 
596 	/* spinlock still held */
597 #if 0
598 	dio = NULL;
599 #endif
600 
601 	/*
602 	 * If any children exist we must leave the chain intact with refs == 0.
603 	 * They exist because chains are retained below us which have refs or
604 	 * may require flushing.
605 	 *
606 	 * Retry (return chain) if we fail to transition the refs to 0, else
607 	 * return NULL indication nothing more to do.
608 	 *
609 	 * Chains with children are NOT put on the LRU list.
610 	 */
611 	if (chain->core.chain_count) {
612 		if (atomic_cmpset_int(&chain->refs, 1, 0)) {
613 			hammer2_spin_unex(&chain->core.spin);
614 			hammer2_chain_assert_no_data(chain);
615 			hammer2_mtx_unlock(&chain->lock);
616 			chain = NULL;
617 		} else {
618 			hammer2_spin_unex(&chain->core.spin);
619 			hammer2_mtx_unlock(&chain->lock);
620 		}
621 		return (chain);
622 	}
623 	/* spinlock still held */
624 	/* no chains left under us */
625 
626 	/*
627 	 * chain->core has no children left so no accessors can get to our
628 	 * chain from there.  Now we have to lock the parent core to interlock
629 	 * remaining possible accessors that might bump chain's refs before
630 	 * we can safely drop chain's refs with intent to free the chain.
631 	 */
632 	hmp = chain->hmp;
633 	pmp = chain->pmp;	/* can be NULL */
634 	rdrop = NULL;
635 
636 	parent = chain->parent;
637 
638 	/*
639 	 * WARNING! chain's spin lock is still held here, and other spinlocks
640 	 *	    will be acquired and released in the code below.  We
641 	 *	    cannot be making fancy procedure calls!
642 	 */
643 
644 	/*
645 	 * We can cache the chain if it is associated with a pmp
646 	 * and not flagged as being destroyed or requesting a full
647 	 * release.  In this situation the chain is not removed
648 	 * from its parent, i.e. it can still be looked up.
649 	 *
650 	 * We intentionally do not cache DATA chains because these
651 	 * were likely used to load data into the logical buffer cache
652 	 * and will not be accessed again for some time.
653 	 */
654 	if ((chain->flags &
655 	     (HAMMER2_CHAIN_DESTROY | HAMMER2_CHAIN_RELEASE)) == 0 &&
656 	    chain->pmp &&
657 	    chain->bref.type != HAMMER2_BREF_TYPE_DATA) {
658 		if (parent)
659 			hammer2_spin_ex(&parent->core.spin);
660 		if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) {
661 			/*
662 			 * 1->0 transition failed, retry.  Do not drop
663 			 * the chain's data yet!
664 			 */
665 			if (parent)
666 				hammer2_spin_unex(&parent->core.spin);
667 			hammer2_spin_unex(&chain->core.spin);
668 			hammer2_mtx_unlock(&chain->lock);
669 
670 			return(chain);
671 		}
672 
673 		/*
674 		 * Success
675 		 */
676 #if 0
677 		dio = hammer2_chain_drop_data(chain, 1);
678 #endif
679 		hammer2_chain_assert_no_data(chain);
680 
681 		KKASSERT((chain->flags & HAMMER2_CHAIN_ONLRU) == 0);
682 		hammer2_spin_ex(&pmp->lru_spin);
683 		atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONLRU);
684 		TAILQ_INSERT_TAIL(&pmp->lru_list, chain, lru_node);
685 
686 		/*
687 		 * If we are over the LRU limit we need to drop something.
688 		 */
689 		if (pmp->lru_count > HAMMER2_LRU_LIMIT) {
690 			rdrop = TAILQ_FIRST(&pmp->lru_list);
691 			atomic_clear_int(&rdrop->flags, HAMMER2_CHAIN_ONLRU);
692 			TAILQ_REMOVE(&pmp->lru_list, rdrop, lru_node);
693 			atomic_add_int(&rdrop->refs, 1);
694 			atomic_set_int(&rdrop->flags, HAMMER2_CHAIN_RELEASE);
695 		} else {
696 			atomic_add_int(&pmp->lru_count, 1);
697 		}
698 		hammer2_spin_unex(&pmp->lru_spin);
699 		if (parent) {
700 			hammer2_spin_unex(&parent->core.spin);
701 			parent = NULL;	/* safety */
702 		}
703 		hammer2_spin_unex(&chain->core.spin);
704 		hammer2_mtx_unlock(&chain->lock);
705 #if 0
706 		if (dio)
707 			hammer2_io_bqrelse(&dio);
708 #endif
709 
710 		return rdrop;
711 		/* NOT REACHED */
712 	}
713 
714 	/*
715 	 * Spinlock the parent and try to drop the last ref on chain.
716 	 * On success determine if we should dispose of the chain
717 	 * (remove the chain from its parent, etc).
718 	 *
719 	 * (normal core locks are top-down recursive but we define
720 	 * core spinlocks as bottom-up recursive, so this is safe).
721 	 */
722 	if (parent) {
723 		hammer2_spin_ex(&parent->core.spin);
724 		if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) {
725 #if 0
726 			/* XXX remove, don't try to drop data on fail */
727 			hammer2_spin_unex(&parent->core.spin);
728 			dio = hammer2_chain_drop_data(chain, 0);
729 			hammer2_spin_unex(&chain->core.spin);
730 			if (dio)
731 				hammer2_io_bqrelse(&dio);
732 #endif
733 			/*
734 			 * 1->0 transition failed, retry.
735 			 */
736 			hammer2_spin_unex(&parent->core.spin);
737 			hammer2_spin_unex(&chain->core.spin);
738 			hammer2_mtx_unlock(&chain->lock);
739 
740 			return(chain);
741 		}
742 
743 		/*
744 		 * 1->0 transition successful, parent spin held to prevent
745 		 * new lookups, chain spinlock held to protect parent field.
746 		 * Remove chain from the parent.
747 		 *
748 		 * If the chain is being removed from the parent's btree but
749 		 * is not bmapped, we have to adjust live_count downward.  If
750 		 * it is bmapped then the blockref is retained in the parent
751 		 * as is its associated live_count.  This case can occur when
752 		 * a chain added to the topology is unable to flush and is
753 		 * then later deleted.
754 		 */
755 		if (chain->flags & HAMMER2_CHAIN_ONRBTREE) {
756 			if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) &&
757 			    (chain->flags & HAMMER2_CHAIN_BMAPPED) == 0) {
758 				atomic_add_int(&parent->core.live_count, -1);
759 			}
760 			RB_REMOVE(hammer2_chain_tree,
761 				  &parent->core.rbtree, chain);
762 			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
763 			--parent->core.chain_count;
764 			chain->parent = NULL;
765 		}
766 
767 		/*
768 		 * If our chain was the last chain in the parent's core the
769 		 * core is now empty and its parent might have to be
770 		 * re-dropped if it has 0 refs.
771 		 */
772 		if (parent->core.chain_count == 0) {
773 			rdrop = parent;
774 			atomic_add_int(&rdrop->refs, 1);
775 			/*
776 			if (atomic_cmpset_int(&rdrop->refs, 0, 1) == 0)
777 				rdrop = NULL;
778 			*/
779 		}
780 		hammer2_spin_unex(&parent->core.spin);
781 		parent = NULL;	/* safety */
782 		/* FALL THROUGH */
783 	} else {
784 		/*
785 		 * No-parent case.
786 		 */
787 		if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) {
788 			/*
789 			 * 1->0 transition failed, retry.
790 			 */
791 			hammer2_spin_unex(&parent->core.spin);
792 			hammer2_spin_unex(&chain->core.spin);
793 			hammer2_mtx_unlock(&chain->lock);
794 
795 			return(chain);
796 		}
797 	}
798 
799 	/*
800 	 * Successful 1->0 transition, no parent, no children... no way for
801 	 * anyone to ref this chain any more.  We can clean-up and free it.
802 	 *
803 	 * We still have the core spinlock, and core's chain_count is 0.
804 	 * Any parent spinlock is gone.
805 	 */
806 	hammer2_spin_unex(&chain->core.spin);
807 	hammer2_chain_assert_no_data(chain);
808 	hammer2_mtx_unlock(&chain->lock);
809 	KKASSERT(RB_EMPTY(&chain->core.rbtree) &&
810 		 chain->core.chain_count == 0);
811 
812 	/*
813 	 * All locks are gone, no pointers remain to the chain, finish
814 	 * freeing it.
815 	 */
816 	KKASSERT((chain->flags & (HAMMER2_CHAIN_UPDATE |
817 				  HAMMER2_CHAIN_MODIFIED)) == 0);
818 #if 0
819 	dio = hammer2_chain_drop_data(chain, 1);
820 	if (dio)
821 		hammer2_io_bqrelse(&dio);
822 #endif
823 
824 	/*
825 	 * Once chain resources are gone we can use the now dead chain
826 	 * structure to placehold what might otherwise require a recursive
827 	 * drop, because we have potentially two things to drop and can only
828 	 * return one directly.
829 	 */
830 	if (chain->flags & HAMMER2_CHAIN_ALLOCATED) {
831 		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ALLOCATED);
832 		chain->hmp = NULL;
833 		kfree(chain, hmp->mchain);
834 	}
835 
836 	/*
837 	 * Possible chaining loop when parent re-drop needed.
838 	 */
839 	return(rdrop);
840 }
841 
842 /*
843  * On last lock release.
844  */
845 static hammer2_io_t *
846 hammer2_chain_drop_data(hammer2_chain_t *chain)
847 {
848 	hammer2_io_t *dio;
849 
850 	if ((dio = chain->dio) != NULL) {
851 		chain->dio = NULL;
852 		chain->data = NULL;
853 	} else {
854 		switch(chain->bref.type) {
855 		case HAMMER2_BREF_TYPE_VOLUME:
856 		case HAMMER2_BREF_TYPE_FREEMAP:
857 			break;
858 		default:
859 			if (chain->data != NULL) {
860 				hammer2_spin_unex(&chain->core.spin);
861 				panic("chain data not null: "
862 				      "chain %p bref %016jx.%02x "
863 				      "refs %d parent %p dio %p data %p",
864 				      chain, chain->bref.data_off,
865 				      chain->bref.type, chain->refs,
866 				      chain->parent,
867 				      chain->dio, chain->data);
868 			}
869 			KKASSERT(chain->data == NULL);
870 			break;
871 		}
872 	}
873 	return dio;
874 }
875 
876 /*
877  * Lock a referenced chain element, acquiring its data with I/O if necessary,
878  * and specify how you would like the data to be resolved.
879  *
880  * If an I/O or other fatal error occurs, chain->error will be set to non-zero.
881  *
882  * The lock is allowed to recurse, multiple locking ops will aggregate
883  * the requested resolve types.  Once data is assigned it will not be
884  * removed until the last unlock.
885  *
886  * HAMMER2_RESOLVE_NEVER - Do not resolve the data element.
887  *			   (typically used to avoid device/logical buffer
888  *			    aliasing for data)
889  *
890  * HAMMER2_RESOLVE_MAYBE - Do not resolve data elements for chains in
891  *			   the INITIAL-create state (indirect blocks only).
892  *
893  *			   Do not resolve data elements for DATA chains.
894  *			   (typically used to avoid device/logical buffer
895  *			    aliasing for data)
896  *
897  * HAMMER2_RESOLVE_ALWAYS- Always resolve the data element.
898  *
899  * HAMMER2_RESOLVE_SHARED- (flag) The chain is locked shared, otherwise
900  *			   it will be locked exclusive.
901  *
902  * NOTE: Embedded elements (volume header, inodes) are always resolved
903  *	 regardless.
904  *
905  * NOTE: Specifying HAMMER2_RESOLVE_ALWAYS on a newly-created non-embedded
906  *	 element will instantiate and zero its buffer, and flush it on
907  *	 release.
908  *
909  * NOTE: (data) elements are normally locked RESOLVE_NEVER or RESOLVE_MAYBE
910  *	 so as not to instantiate a device buffer, which could alias against
911  *	 a logical file buffer.  However, if ALWAYS is specified the
912  *	 device buffer will be instantiated anyway.
913  *
914  * WARNING! This function blocks on I/O if data needs to be fetched.  This
915  *	    blocking can run concurrent with other compatible lock holders
916  *	    who do not need data returning.  The lock is not upgraded to
917  *	    exclusive during a data fetch, a separate bit is used to
918  *	    interlock I/O.  However, an exclusive lock holder can still count
919  *	    on being interlocked against an I/O fetch managed by a shared
920  *	    lock holder.
921  */
void
hammer2_chain_lock(hammer2_chain_t *chain, int how)
{
	/*
	 * Ref and lock the element.  Recursive locks are allowed.
	 * lockcnt tracks data persistence across recursive locks; the
	 * count must be bumped before acquiring the mutex so an unlocker
	 * cannot see a 1->0 transition while we are in-flight.
	 */
	KKASSERT(chain->refs > 0);
	atomic_add_int(&chain->lockcnt, 1);

	/*
	 * Get the appropriate lock.  If LOCKAGAIN is flagged with SHARED
	 * the caller expects a shared lock to already be present and we
	 * are giving it another ref.  This case must importantly not block
	 * if there is a pending exclusive lock request.
	 */
	if (how & HAMMER2_RESOLVE_SHARED) {
		if (how & HAMMER2_RESOLVE_LOCKAGAIN) {
			hammer2_mtx_sh_again(&chain->lock);
		} else {
			hammer2_mtx_sh(&chain->lock);
		}
	} else {
		hammer2_mtx_ex(&chain->lock);
	}
	++curthread->td_tracker;

	/*
	 * If we already have a valid data pointer make sure the data is
	 * synchronized to the current cpu, and then no further action is
	 * necessary.
	 */
	if (chain->data) {
		if (chain->dio)
			hammer2_io_bkvasync(chain->dio);
		return;
	}

	/*
	 * Do we have to resolve the data?  This is generally only
	 * applicable to HAMMER2_BREF_TYPE_DATA which is special-cased.
	 * Other BREF types expect the data to be there.
	 */
	switch(how & HAMMER2_RESOLVE_MASK) {
	case HAMMER2_RESOLVE_NEVER:
		return;
	case HAMMER2_RESOLVE_MAYBE:
		/* INITIAL chains have no media data to resolve yet */
		if (chain->flags & HAMMER2_CHAIN_INITIAL)
			return;
		if (chain->bref.type == HAMMER2_BREF_TYPE_DATA)
			return;
#if 0
		if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE)
			return;
		if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF)
			return;
#endif
		/* fall through */
	case HAMMER2_RESOLVE_ALWAYS:
	default:
		break;
	}

	/*
	 * Caller requires data.  May block on I/O; see the WARNING in the
	 * function header above regarding concurrent lock holders.
	 */
	hammer2_chain_load_data(chain);
}
989 
/*
 * Lock the chain, retain the hold, and drop the data persistence count.
 * The data should remain valid because we never transitioned lockcnt
 * through 0.
 *
 * The lock must be acquired before the decrement; otherwise a racing
 * unlock could observe a 1->0 transition and dispose of chain->data.
 */
void
hammer2_chain_lock_unhold(hammer2_chain_t *chain, int how)
{
	hammer2_chain_lock(chain, how);
	atomic_add_int(&chain->lockcnt, -1);
}
1001 
#if 0
/*
 * Downgrade an exclusive chain lock to a shared chain lock.
 * (Currently compiled out -- no in-tree callers.)
 *
 * NOTE: There is no upgrade equivalent due to the ease of
 *	 deadlocks in that direction.
 */
void
hammer2_chain_lock_downgrade(hammer2_chain_t *chain)
{
	hammer2_mtx_downgrade(&chain->lock);
}
#endif
1015 
/*
 * Issue I/O and install chain->data.  Caller must hold a chain lock, lock
 * may be of any type.
 *
 * Once chain->data is set it cannot be disposed of until all locks are
 * released.
 *
 * Make sure the data is synchronized to the current cpu.
 *
 * On I/O or check failure chain->error is set (HAMMER2_ERROR_EIO or
 * HAMMER2_ERROR_CHECK); the caller is expected to test it.
 */
void
hammer2_chain_load_data(hammer2_chain_t *chain)
{
	hammer2_blockref_t *bref;
	hammer2_dev_t *hmp;
	hammer2_io_t *dio;
	char *bdata;
	int error;

	/*
	 * Degenerate case, data already present, or chain has no media
	 * reference to load.
	 */
	if (chain->data) {
		if (chain->dio)
			hammer2_io_bkvasync(chain->dio);
		return;
	}
	if ((chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX) == 0)
		return;

	hmp = chain->hmp;
	KKASSERT(hmp != NULL);

	/*
	 * Gain the IOINPROG bit, interlocked block.
	 *
	 * IOINPROG serializes the actual data fetch among possibly-shared
	 * lock holders without upgrading the chain lock.  Losers set
	 * IOSIGNAL and sleep; the winner wakes them on release (see the
	 * "done" label at the bottom).
	 */
	for (;;) {
		u_int oflags;
		u_int nflags;

		oflags = chain->flags;
		cpu_ccfence();
		if (oflags & HAMMER2_CHAIN_IOINPROG) {
			nflags = oflags | HAMMER2_CHAIN_IOSIGNAL;
			tsleep_interlock(&chain->flags, 0);
			if (atomic_cmpset_int(&chain->flags, oflags, nflags)) {
				tsleep(&chain->flags, PINTERLOCKED,
					"h2iocw", 0);
			}
			/* retry */
		} else {
			nflags = oflags | HAMMER2_CHAIN_IOINPROG;
			if (atomic_cmpset_int(&chain->flags, oflags, nflags)) {
				break;
			}
			/* retry */
		}
	}

	/*
	 * We own CHAIN_IOINPROG
	 *
	 * Degenerate case if we raced another load.
	 */
	if (chain->data)
		goto done;

	/*
	 * We must resolve to a device buffer, either by issuing I/O or
	 * by creating a zero-fill element.  We do not mark the buffer
	 * dirty when creating a zero-fill element (the hammer2_chain_modify()
	 * API must still be used to do that).
	 *
	 * The device buffer is variable-sized in powers of 2 down
	 * to HAMMER2_MIN_ALLOC (typically 1K).  A 64K physical storage
	 * chunk always contains buffers of the same size. (XXX)
	 *
	 * The minimum physical IO size may be larger than the variable
	 * block size.
	 */
	bref = &chain->bref;

	/*
	 * The getblk() optimization can only be used on newly created
	 * elements if the physical block size matches the request.
	 */
	if (chain->flags & HAMMER2_CHAIN_INITIAL) {
		/* new element: instantiate a zero'd buffer, no read */
		error = hammer2_io_new(hmp, bref->type,
				       bref->data_off, chain->bytes,
				       &chain->dio);
	} else {
		/* existing element: read from media */
		error = hammer2_io_bread(hmp, bref->type,
					 bref->data_off, chain->bytes,
					 &chain->dio);
		hammer2_adjreadcounter(&chain->bref, chain->bytes);
	}
	if (error) {
		chain->error = HAMMER2_ERROR_EIO;
		kprintf("hammer2_chain_lock: I/O error %016jx: %d\n",
			(intmax_t)bref->data_off, error);
		hammer2_io_bqrelse(&chain->dio);
		goto done;
	}
	chain->error = 0;

	/*
	 * This isn't perfect and can be ignored on OSs which do not have
	 * an indication as to whether a buffer is coming from cache or
	 * if I/O was actually issued for the read.  TESTEDGOOD will work
	 * pretty well without the B_IOISSUED logic because chains are
	 * cached, but in that situation (without B_IOISSUED) it will not
	 * detect whether a re-read via I/O is corrupted verses the original
	 * read.
	 *
	 * We can't re-run the CRC on every fresh lock.  That would be
	 * insanely expensive.
	 *
	 * If the underlying kernel buffer covers the entire chain we can
	 * use the B_IOISSUED indication to determine if we have to re-run
	 * the CRC on chain data for chains that managed to stay cached
	 * across the kernel disposal of the original buffer.
	 */
	if ((dio = chain->dio) != NULL && dio->bp) {
		struct buf *bp = dio->bp;

		/* fresh I/O occurred: force a re-check below */
		if (dio->psize == chain->bytes &&
		    (bp->b_flags & B_IOISSUED)) {
			atomic_clear_int(&chain->flags,
					 HAMMER2_CHAIN_TESTEDGOOD);
			bp->b_flags &= ~B_IOISSUED;
		}
	}

	/*
	 * NOTE: A locked chain's data cannot be modified without first
	 *	 calling hammer2_chain_modify().
	 */

	/*
	 * Clear INITIAL.  In this case we used io_new() and the buffer has
	 * been zero'd and marked dirty.
	 *
	 * NOTE: hammer2_io_data() call issues bkvasync()
	 */
	bdata = hammer2_io_data(chain->dio, chain->bref.data_off);

	if (chain->flags & HAMMER2_CHAIN_INITIAL) {
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
		chain->bref.flags |= HAMMER2_BREF_FLAG_ZERO;
	} else if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
		/*
		 * check data not currently synchronized due to
		 * modification.  XXX assumes data stays in the buffer
		 * cache, which might not be true (need biodep on flush
		 * to calculate crc?  or simple crc?).
		 */
	} else if ((chain->flags & HAMMER2_CHAIN_TESTEDGOOD) == 0) {
		/* verify the media data against the blockref check code */
		if (hammer2_chain_testcheck(chain, bdata) == 0) {
			chain->error = HAMMER2_ERROR_CHECK;
		} else {
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_TESTEDGOOD);
		}
	}

	/*
	 * Setup the data pointer, either pointing it to an embedded data
	 * structure and copying the data from the buffer, or pointing it
	 * into the buffer.
	 *
	 * The buffer is not retained when copying to an embedded data
	 * structure in order to avoid potential deadlocks or recursions
	 * on the same physical buffer.
	 *
	 * WARNING! Other threads can start using the data the instant we
	 *	    set chain->data non-NULL.
	 */
	switch (bref->type) {
	case HAMMER2_BREF_TYPE_VOLUME:
	case HAMMER2_BREF_TYPE_FREEMAP:
		/*
		 * Copy data from bp to embedded buffer.  These types are
		 * resolved at mount time and should never get here.
		 */
		panic("hammer2_chain_load_data: unresolved volume header");
		break;
	case HAMMER2_BREF_TYPE_DIRENT:
		/* zero-byte dirents embed data in the bref, never loaded */
		KKASSERT(chain->bytes != 0);
		/* fall through */
	case HAMMER2_BREF_TYPE_INODE:
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_DATA:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	default:
		/*
		 * Point data at the device buffer and leave dio intact.
		 */
		chain->data = (void *)bdata;
		break;
	}

	/*
	 * Release HAMMER2_CHAIN_IOINPROG and signal waiters if requested.
	 */
done:
	for (;;) {
		u_int oflags;
		u_int nflags;

		oflags = chain->flags;
		nflags = oflags & ~(HAMMER2_CHAIN_IOINPROG |
				    HAMMER2_CHAIN_IOSIGNAL);
		KKASSERT(oflags & HAMMER2_CHAIN_IOINPROG);
		if (atomic_cmpset_int(&chain->flags, oflags, nflags)) {
			if (oflags & HAMMER2_CHAIN_IOSIGNAL)
				wakeup(&chain->flags);
			break;
		}
	}
}
1235 
/*
 * Unlock a chain element, dropping its lock count.  On the last unlock
 * (lockcnt 1->0) the data persistence ref is dropped as well, allowing
 * the dio to be released.
 *
 * Remember that the presence of children under chain prevent the chain's
 * destruction but do not add additional references, so the dio will still
 * be dropped.
 */
void
hammer2_chain_unlock(hammer2_chain_t *chain)
{
	hammer2_io_t *dio;
	u_int lockcnt;
	int iter = 0;

	--curthread->td_tracker;

	/*
	 * If multiple locks are present (or being attempted) on this
	 * particular chain we can just unlock, drop refs, and return.
	 *
	 * Otherwise fall-through on the 1->0 transition.
	 */
	for (;;) {
		lockcnt = chain->lockcnt;
		KKASSERT(lockcnt > 0);
		cpu_ccfence();
		if (lockcnt > 1) {
			if (atomic_cmpset_int(&chain->lockcnt,
					      lockcnt, lockcnt - 1)) {
				hammer2_mtx_unlock(&chain->lock);
				return;
			}
		} else if (hammer2_mtx_upgrade_try(&chain->lock) == 0) {
			/* while holding the mutex exclusively */
			if (atomic_cmpset_int(&chain->lockcnt, 1, 0))
				break;
		} else {
			/*
			 * This situation can easily occur on SMP due to
			 * the gap inbetween the 1->0 transition and the
			 * final unlock.  We cannot safely block on the
			 * mutex because lockcnt might go above 1.
			 *
			 * XXX Sleep for one tick if it takes too long.
			 */
			if (++iter > 1000) {
				if (iter > 1000 + hz) {
					kprintf("hammer2: h2race2 %p\n", chain);
					iter = 1000;
				}
				tsleep(&iter, 0, "h2race2", 1);
			}
			cpu_pause();
		}
		/* retry */
	}

	/*
	 * Last unlock / mutex upgraded to exclusive.  Drop the data
	 * reference.
	 */
	dio = hammer2_chain_drop_data(chain);
	if (dio)
		hammer2_io_bqrelse(&dio);
	hammer2_mtx_unlock(&chain->lock);
}
1302 
/*
 * Unlock and hold chain data intact.
 *
 * The extra lockcnt ref taken before unlocking prevents the 1->0
 * transition inside hammer2_chain_unlock(), so chain->data persists.
 * Pair with hammer2_chain_lock_unhold() to rebalance.
 */
void
hammer2_chain_unlock_hold(hammer2_chain_t *chain)
{
	atomic_add_int(&chain->lockcnt, 1);
	hammer2_chain_unlock(chain);
}
1312 
1313 /*
1314  * Helper to obtain the blockref[] array base and count for a chain.
1315  *
1316  * XXX Not widely used yet, various use cases need to be validated and
1317  *     converted to use this function.
1318  */
1319 static
1320 hammer2_blockref_t *
1321 hammer2_chain_base_and_count(hammer2_chain_t *parent, int *countp)
1322 {
1323 	hammer2_blockref_t *base;
1324 	int count;
1325 
1326 	if (parent->flags & HAMMER2_CHAIN_INITIAL) {
1327 		base = NULL;
1328 
1329 		switch(parent->bref.type) {
1330 		case HAMMER2_BREF_TYPE_INODE:
1331 			count = HAMMER2_SET_COUNT;
1332 			break;
1333 		case HAMMER2_BREF_TYPE_INDIRECT:
1334 		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1335 			count = parent->bytes / sizeof(hammer2_blockref_t);
1336 			break;
1337 		case HAMMER2_BREF_TYPE_VOLUME:
1338 			count = HAMMER2_SET_COUNT;
1339 			break;
1340 		case HAMMER2_BREF_TYPE_FREEMAP:
1341 			count = HAMMER2_SET_COUNT;
1342 			break;
1343 		default:
1344 			panic("hammer2_chain_create_indirect: "
1345 			      "unrecognized blockref type: %d",
1346 			      parent->bref.type);
1347 			count = 0;
1348 			break;
1349 		}
1350 	} else {
1351 		switch(parent->bref.type) {
1352 		case HAMMER2_BREF_TYPE_INODE:
1353 			base = &parent->data->ipdata.u.blockset.blockref[0];
1354 			count = HAMMER2_SET_COUNT;
1355 			break;
1356 		case HAMMER2_BREF_TYPE_INDIRECT:
1357 		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1358 			base = &parent->data->npdata[0];
1359 			count = parent->bytes / sizeof(hammer2_blockref_t);
1360 			break;
1361 		case HAMMER2_BREF_TYPE_VOLUME:
1362 			base = &parent->data->voldata.
1363 					sroot_blockset.blockref[0];
1364 			count = HAMMER2_SET_COUNT;
1365 			break;
1366 		case HAMMER2_BREF_TYPE_FREEMAP:
1367 			base = &parent->data->blkset.blockref[0];
1368 			count = HAMMER2_SET_COUNT;
1369 			break;
1370 		default:
1371 			panic("hammer2_chain_create_indirect: "
1372 			      "unrecognized blockref type: %d",
1373 			      parent->bref.type);
1374 			count = 0;
1375 			break;
1376 		}
1377 	}
1378 	*countp = count;
1379 
1380 	return base;
1381 }
1382 
1383 /*
1384  * This counts the number of live blockrefs in a block array and
1385  * also calculates the point at which all remaining blockrefs are empty.
1386  * This routine can only be called on a live chain.
1387  *
1388  * Caller holds the chain locked, but possibly with a shared lock.  We
1389  * must use an exclusive spinlock to prevent corruption.
1390  *
1391  * NOTE: Flag is not set until after the count is complete, allowing
1392  *	 callers to test the flag without holding the spinlock.
1393  *
1394  * NOTE: If base is NULL the related chain is still in the INITIAL
1395  *	 state and there are no blockrefs to count.
1396  *
1397  * NOTE: live_count may already have some counts accumulated due to
1398  *	 creation and deletion and could even be initially negative.
1399  */
1400 void
1401 hammer2_chain_countbrefs(hammer2_chain_t *chain,
1402 			 hammer2_blockref_t *base, int count)
1403 {
1404 	hammer2_spin_ex(&chain->core.spin);
1405         if ((chain->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0) {
1406 		if (base) {
1407 			while (--count >= 0) {
1408 				if (base[count].type)
1409 					break;
1410 			}
1411 			chain->core.live_zero = count + 1;
1412 			while (count >= 0) {
1413 				if (base[count].type)
1414 					atomic_add_int(&chain->core.live_count,
1415 						       1);
1416 				--count;
1417 			}
1418 		} else {
1419 			chain->core.live_zero = 0;
1420 		}
1421 		/* else do not modify live_count */
1422 		atomic_set_int(&chain->flags, HAMMER2_CHAIN_COUNTEDBREFS);
1423 	}
1424 	hammer2_spin_unex(&chain->core.spin);
1425 }
1426 
/*
 * Resize the chain's physical storage allocation in-place.  This function does
 * not usually adjust the data pointer and must be followed by (typically) a
 * hammer2_chain_modify() call to copy any old data over and adjust the
 * data pointer.
 *
 * Chains can be resized smaller without reallocating the storage.  Resizing
 * larger will reallocate the storage.  Excess or prior storage is reclaimed
 * asynchronously at a later time.
 *
 * An nradix value of 0 is special-cased to mean that the storage should
 * be disassociated, that is the chain is being resized to 0 bytes (not 1
 * byte).
 *
 * Must be passed an exclusively locked parent and chain.
 *
 * This function is mostly used with DATA blocks locked RESOLVE_NEVER in order
 * to avoid instantiating a device buffer that conflicts with the vnode data
 * buffer.  However, because H2 can compress or encrypt data, the chain may
 * have a dio assigned to it in those situations, and they do not conflict.
 *
 * Returns chain->error (0 on success) or the error from the embedded
 * modify/alloc steps.
 *
 * XXX return error if cannot resize.
 */
int
hammer2_chain_resize(hammer2_chain_t *chain,
		     hammer2_tid_t mtid, hammer2_off_t dedup_off,
		     int nradix, int flags)
{
	/* NOTE(review): 'flags' is not referenced below -- confirm intent */
	hammer2_dev_t *hmp;
	size_t obytes;
	size_t nbytes;
	int error;

	hmp = chain->hmp;

	/*
	 * Only data and indirect blocks can be resized for now.
	 * (The volu root, inodes, and freemap elements use a fixed size).
	 */
	KKASSERT(chain != &hmp->vchain);
	KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA ||
		 chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
		 chain->bref.type == HAMMER2_BREF_TYPE_DIRENT);

	/*
	 * Nothing to do if the element is already the proper size
	 */
	obytes = chain->bytes;
	nbytes = (nradix) ? (1U << nradix) : 0;
	if (obytes == nbytes)
		return (chain->error);

	/*
	 * Make sure the old data is instantiated so we can copy it.  If this
	 * is a data block, the device data may be superfluous since the data
	 * might be in a logical block, but compressed or encrypted data is
	 * another matter.
	 *
	 * NOTE: The modify will set BMAPUPD for us if BMAPPED is set.
	 */
	error = hammer2_chain_modify(chain, mtid, dedup_off, 0);
	if (error)
		return error;

	/*
	 * Relocate the block, even if making it smaller (because different
	 * block sizes may be in different regions).
	 *
	 * NOTE: Operation does not copy the data and may only be used
	 *	  to resize data blocks in-place, or directory entry blocks
	 *	  which are about to be modified in some manner.
	 */
	error = hammer2_freemap_alloc(chain, nbytes);
	if (error)
		return error;

	chain->bytes = nbytes;

	/*
	 * We don't want the followup chain_modify() to try to copy data
	 * from the old (wrong-sized) buffer.  It won't know how much to
	 * copy.  This case should only occur during writes when the
	 * originator already has the data to write in-hand.
	 */
	if (chain->dio) {
		KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA ||
			 chain->bref.type == HAMMER2_BREF_TYPE_DIRENT);
		hammer2_io_brelse(&chain->dio);
		chain->data = NULL;
	}
	return (chain->error);
}
1519 
/*
 * Set the chain modified so its data can be changed by the caller, or
 * install deduplicated data.  The caller must call this routine for each
 * set of modifications it makes, even if the chain is already flagged
 * MODIFIED.
 *
 * Sets bref.modify_tid to mtid only if mtid != 0.  Note that bref.modify_tid
 * is a CLC (cluster level change) field and is not updated by parent
 * propagation during a flush.
 *
 * Returns an appropriate HAMMER2_ERROR_* code, which will generally reflect
 * chain->error except for HAMMER2_ERROR_ENOSPC.  If the allocation fails
 * due to no space available, HAMMER2_ERROR_ENOSPC is returned and the chain
 * remains unmodified with its old data ref intact and chain->error
 * unchanged.
 *
 *				 Dedup Handling
 *
 * If the DEDUPABLE flag is set in the chain the storage must be reallocated
 * even if the chain is still flagged MODIFIED.  In this case the chain's
 * DEDUPABLE flag will be cleared once the new storage has been assigned.
 *
 * If the caller passes a non-zero dedup_off we will use it to assign the
 * new storage.  The MODIFIED flag will be *CLEARED* in this case, and
 * DEDUPABLE will be set (NOTE: the UPDATE flag is always set).  The caller
 * must not modify the data content upon return.
 */
int
hammer2_chain_modify(hammer2_chain_t *chain, hammer2_tid_t mtid,
		     hammer2_off_t dedup_off, int flags)
{
	hammer2_blockref_t obref;
	hammer2_dev_t *hmp;
	hammer2_io_t *dio;
	int error;
	int wasinitial;
	int setmodified;
	int setupdate;
	int newmod;
	char *bdata;

	hmp = chain->hmp;
	/* NOTE(review): obref is saved but never read below -- confirm */
	obref = chain->bref;
	KKASSERT((chain->flags & HAMMER2_CHAIN_FICTITIOUS) == 0);

	/*
	 * Data is not optional for freemap chains (we must always be sure
	 * to copy the data on COW storage allocations).
	 */
	if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
	    chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
		KKASSERT((chain->flags & HAMMER2_CHAIN_INITIAL) ||
			 (flags & HAMMER2_MODIFY_OPTDATA) == 0);
	}

	/*
	 * Data must be resolved if already assigned, unless explicitly
	 * flagged otherwise.  If we cannot safely load the data the
	 * modification fails and we return early.
	 */
	if (chain->data == NULL && chain->bytes != 0 &&
	    (flags & HAMMER2_MODIFY_OPTDATA) == 0 &&
	    (chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX)) {
		hammer2_chain_load_data(chain);
		if (chain->error)
			return (chain->error);
	}
	error = 0;

	/*
	 * Set MODIFIED to indicate that the chain has been modified.  A new
	 * allocation is required when modifying a chain.
	 *
	 * Set UPDATE to ensure that the blockref is updated in the parent.
	 *
	 * If MODIFIED is already set determine if we can reuse the assigned
	 * data block or if we need a new data block.
	 */
	if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0) {
		/*
		 * Must set modified bit.  setmodified remembers this so
		 * the flag can be undone if the allocation fails below.
		 */
		atomic_add_long(&hammer2_count_modified_chains, 1);
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
		hammer2_pfs_memory_inc(chain->pmp);  /* can be NULL */
		setmodified = 1;

		/*
		 * We may be able to avoid a copy-on-write if the chain's
		 * check mode is set to NONE and the chain's current
		 * modify_tid is beyond the last explicit snapshot tid.
		 *
		 * This implements HAMMER2's overwrite-in-place feature.
		 *
		 * NOTE! This data-block cannot be used as a de-duplication
		 *	 source when the check mode is set to NONE.
		 */
		if ((chain->bref.type == HAMMER2_BREF_TYPE_DATA ||
		     chain->bref.type == HAMMER2_BREF_TYPE_DIRENT) &&
		    (chain->flags & HAMMER2_CHAIN_INITIAL) == 0 &&
		    (chain->flags & HAMMER2_CHAIN_DEDUPABLE) == 0 &&
		    HAMMER2_DEC_CHECK(chain->bref.methods) ==
		     HAMMER2_CHECK_NONE &&
		    chain->pmp &&
		    chain->bref.modify_tid >
		     chain->pmp->iroot->meta.pfs_lsnap_tid) {
			/*
			 * Sector overwrite allowed.
			 */
			newmod = 0;
		} else {
			/*
			 * Sector overwrite not allowed, must copy-on-write.
			 */
			newmod = 1;
		}
	} else if (chain->flags & HAMMER2_CHAIN_DEDUPABLE) {
		/*
		 * If the modified chain was registered for dedup we need
		 * a new allocation.  This only happens for delayed-flush
		 * chains (i.e. which run through the front-end buffer
		 * cache).
		 */
		newmod = 1;
		setmodified = 0;
	} else {
		/*
		 * Already flagged modified, no new allocation is needed.
		 */
		newmod = 0;
		setmodified = 0;
	}

	/*
	 * Flag parent update required.  setupdate remembers whether we
	 * set it so it can be undone on error.
	 */
	if ((chain->flags & HAMMER2_CHAIN_UPDATE) == 0) {
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
		setupdate = 1;
	} else {
		setupdate = 0;
	}

	/*
	 * The modification or re-modification requires an allocation and
	 * possible COW.  If an error occurs, the previous content and data
	 * reference is retained and the modification fails.
	 *
	 * If dedup_off is non-zero, the caller is requesting a deduplication
	 * rather than a modification.  The MODIFIED bit is not set and the
	 * data offset is set to the deduplication offset.  The data cannot
	 * be modified.
	 *
	 * NOTE: The dedup offset is allowed to be in a partially free state
	 *	 and we must be sure to reset it to a fully allocated state
	 *	 to force two bulkfree passes to free it again.
	 *
	 * NOTE: Only applicable when chain->bytes != 0.
	 *
	 * XXX can a chain already be marked MODIFIED without a data
	 * assignment?  If not, assert here instead of testing the case.
	 */
	if (chain != &hmp->vchain && chain != &hmp->fchain &&
	    chain->bytes) {
		if ((chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX) == 0 ||
		     newmod
		) {
			/*
			 * NOTE: We do not have to remove the dedup
			 *	 registration because the area is still
			 *	 allocated and the underlying DIO will
			 *	 still be flushed.
			 */
			if (dedup_off) {
				chain->bref.data_off = dedup_off;
				chain->bytes = 1 << (dedup_off &
						     HAMMER2_OFF_MASK_RADIX);
				chain->error = 0;
				atomic_clear_int(&chain->flags,
						 HAMMER2_CHAIN_MODIFIED);
				atomic_add_long(&hammer2_count_modified_chains,
						-1);
				if (chain->pmp)
					hammer2_pfs_memory_wakeup(chain->pmp);
				hammer2_freemap_adjust(hmp, &chain->bref,
						HAMMER2_FREEMAP_DORECOVER);
				atomic_set_int(&chain->flags,
						HAMMER2_CHAIN_DEDUPABLE);
			} else {
				error = hammer2_freemap_alloc(chain,
							      chain->bytes);
				atomic_clear_int(&chain->flags,
						HAMMER2_CHAIN_DEDUPABLE);
			}
		}
	}

	/*
	 * Stop here if error.  We have to undo any flag bits we might
	 * have set above.
	 */
	if (error) {
		if (setmodified) {
			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
			atomic_add_long(&hammer2_count_modified_chains, -1);
			if (chain->pmp)
				hammer2_pfs_memory_wakeup(chain->pmp);
		}
		if (setupdate) {
			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
		}
		return error;
	}

	/*
	 * Update mirror_tid and modify_tid.  modify_tid is only updated
	 * if not passed as zero (during flushes, parent propagation passes
	 * the value 0).
	 *
	 * NOTE: chain->pmp could be the device spmp.
	 */
	chain->bref.mirror_tid = hmp->voldata.mirror_tid + 1;
	if (mtid)
		chain->bref.modify_tid = mtid;

	/*
	 * Set BMAPUPD to tell the flush code that an existing blockmap entry
	 * requires updating as well as to tell the delete code that the
	 * chain's blockref might not exactly match (in terms of physical size
	 * or block offset) the one in the parent's blocktable.  The base key
	 * of course will still match.
	 */
	if (chain->flags & HAMMER2_CHAIN_BMAPPED)
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_BMAPUPD);

	/*
	 * Short-cut data blocks which the caller does not need an actual
	 * data reference to (aka OPTDATA), as long as the chain does not
	 * already have a data pointer to the data.  This generally means
	 * that the modifications are being done via the logical buffer cache.
	 * The INITIAL flag relates only to the device data buffer and thus
	 * remains unchanged in this situation.
	 *
	 * This code also handles bytes == 0 (most dirents).
	 */
	if (chain->bref.type == HAMMER2_BREF_TYPE_DATA &&
	    (flags & HAMMER2_MODIFY_OPTDATA) &&
	    chain->data == NULL) {
		KKASSERT(chain->dio == NULL);
		goto skip2;
	}

	/*
	 * Clearing the INITIAL flag (for indirect blocks) indicates that
	 * we've processed the uninitialized storage allocation.
	 *
	 * If this flag is already clear we are likely in a copy-on-write
	 * situation but we have to be sure NOT to bzero the storage if
	 * no data is present.
	 */
	if (chain->flags & HAMMER2_CHAIN_INITIAL) {
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
		wasinitial = 1;
	} else {
		wasinitial = 0;
	}

	/*
	 * Instantiate data buffer and possibly execute COW operation
	 */
	switch(chain->bref.type) {
	case HAMMER2_BREF_TYPE_VOLUME:
	case HAMMER2_BREF_TYPE_FREEMAP:
		/*
		 * The data is embedded, no copy-on-write operation is
		 * needed.
		 */
		KKASSERT(chain->dio == NULL);
		break;
	case HAMMER2_BREF_TYPE_DIRENT:
		/*
		 * The data might be fully embedded.
		 */
		if (chain->bytes == 0) {
			KKASSERT(chain->dio == NULL);
			break;
		}
		/* fall through */
	case HAMMER2_BREF_TYPE_INODE:
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
	case HAMMER2_BREF_TYPE_DATA:
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		/*
		 * Perform the copy-on-write operation
		 *
		 * zero-fill or copy-on-write depending on whether
		 * chain->data exists or not and set the dirty state for
		 * the new buffer.  hammer2_io_new() will handle the
		 * zero-fill.
		 *
		 * If a dedup_off was supplied this is an existing block
		 * and no COW, copy, or further modification is required.
		 */
		KKASSERT(chain != &hmp->vchain && chain != &hmp->fchain);

		if (wasinitial && dedup_off == 0) {
			error = hammer2_io_new(hmp, chain->bref.type,
					       chain->bref.data_off,
					       chain->bytes, &dio);
		} else {
			error = hammer2_io_bread(hmp, chain->bref.type,
						 chain->bref.data_off,
						 chain->bytes, &dio);
		}
		hammer2_adjreadcounter(&chain->bref, chain->bytes);

		/*
		 * If an I/O error occurs make sure callers cannot accidentally
		 * modify the old buffer's contents and corrupt the filesystem.
		 *
		 * NOTE: hammer2_io_data() call issues bkvasync()
		 */
		if (error) {
			kprintf("hammer2_chain_modify: hmp=%p I/O error\n",
				hmp);
			chain->error = HAMMER2_ERROR_EIO;
			hammer2_io_brelse(&dio);
			hammer2_io_brelse(&chain->dio);
			chain->data = NULL;
			break;
		}
		chain->error = 0;
		bdata = hammer2_io_data(dio, chain->bref.data_off);

		if (chain->data) {
			/*
			 * COW (unless a dedup).
			 */
			KKASSERT(chain->dio != NULL);
			if (chain->data != (void *)bdata && dedup_off == 0) {
				bcopy(chain->data, bdata, chain->bytes);
			}
		} else if (wasinitial == 0) {
			/*
			 * We have a problem.  We were asked to COW but
			 * we don't have any data to COW with!
			 */
			panic("hammer2_chain_modify: having a COW %p\n",
			      chain);
		}

		/*
		 * Retire the old buffer, replace with the new.  Dirty or
		 * redirty the new buffer.
		 *
		 * WARNING! The system buffer cache may have already flushed
		 *	    the buffer, so we must be sure to [re]dirty it
		 *	    for further modification.
		 *
		 *	    If dedup_off was supplied, the caller is not
		 *	    expected to make any further modification to the
		 *	    buffer.
		 */
		if (chain->dio)
			hammer2_io_bqrelse(&chain->dio);
		chain->data = (void *)bdata;
		chain->dio = dio;
		if (dedup_off == 0)
			hammer2_io_setdirty(dio);
		break;
	default:
		panic("hammer2_chain_modify: illegal non-embedded type %d",
		      chain->bref.type);
		break;

	}
skip2:
	/*
	 * setflush on parent indicating that the parent must recurse down
	 * to us.  Do not call on chain itself which might already have it
	 * set.
	 */
	if (chain->parent)
		hammer2_chain_setflush(chain->parent);
	return (chain->error);
}
1907 
1908 /*
1909  * Modify the chain associated with an inode.
1910  */
1911 int
1912 hammer2_chain_modify_ip(hammer2_inode_t *ip, hammer2_chain_t *chain,
1913 			hammer2_tid_t mtid, int flags)
1914 {
1915 	int error;
1916 
1917 	hammer2_inode_modify(ip);
1918 	error = hammer2_chain_modify(chain, mtid, 0, flags);
1919 
1920 	return error;
1921 }
1922 
1923 /*
1924  * Volume header data locks
1925  */
void
hammer2_voldata_lock(hammer2_dev_t *hmp)
{
	/*
	 * Acquire the device's volume-header lock exclusively.  Pair with
	 * hammer2_voldata_unlock().
	 */
	lockmgr(&hmp->vollk, LK_EXCLUSIVE);
}
1931 
void
hammer2_voldata_unlock(hammer2_dev_t *hmp)
{
	/*
	 * Release the volume-header lock acquired by hammer2_voldata_lock().
	 */
	lockmgr(&hmp->vollk, LK_RELEASE);
}
1937 
void
hammer2_voldata_modify(hammer2_dev_t *hmp)
{
	/*
	 * Flag the volume root chain (vchain) as MODIFIED, accounting for
	 * it exactly once.  The test-then-set is not atomic as a unit, so
	 * this presumably relies on the caller serializing via the voldata
	 * lock -- TODO confirm.
	 */
	if ((hmp->vchain.flags & HAMMER2_CHAIN_MODIFIED) == 0) {
		/* one more modified chain system-wide */
		atomic_add_long(&hammer2_count_modified_chains, 1);
		atomic_set_int(&hmp->vchain.flags, HAMMER2_CHAIN_MODIFIED);
		/* charge the PFS memory counter for the dirty chain */
		hammer2_pfs_memory_inc(hmp->vchain.pmp);
	}
}
1947 
1948 /*
1949  * This function returns the chain at the nearest key within the specified
1950  * range.  The returned chain will be referenced but not locked.
1951  *
1952  * This function will recurse through chain->rbtree as necessary and will
1953  * return a *key_nextp suitable for iteration.  *key_nextp is only set if
1954  * the iteration value is less than the current value of *key_nextp.
1955  *
1956  * The caller should use (*key_nextp) to calculate the actual range of
1957  * the returned element, which will be (key_beg to *key_nextp - 1), because
1958  * there might be another element which is superior to the returned element
1959  * and overlaps it.
1960  *
1961  * (*key_nextp) can be passed as key_beg in an iteration only while non-NULL
1962  * chains continue to be returned.  On EOF (*key_nextp) may overflow since
1963  * it will wind up being (key_end + 1).
1964  *
1965  * WARNING!  Must be called with child's spinlock held.  Spinlock remains
1966  *	     held through the operation.
1967  */
struct hammer2_chain_find_info {
	hammer2_chain_t		*best;		/* nearest chain found so far */
	hammer2_key_t		key_beg;	/* inclusive range start */
	hammer2_key_t		key_end;	/* inclusive range end */
	hammer2_key_t		key_next;	/* clipped iteration value */
};
1974 
1975 static int hammer2_chain_find_cmp(hammer2_chain_t *child, void *data);
1976 static int hammer2_chain_find_callback(hammer2_chain_t *child, void *data);
1977 
1978 static
1979 hammer2_chain_t *
1980 hammer2_chain_find(hammer2_chain_t *parent, hammer2_key_t *key_nextp,
1981 			  hammer2_key_t key_beg, hammer2_key_t key_end)
1982 {
1983 	struct hammer2_chain_find_info info;
1984 
1985 	info.best = NULL;
1986 	info.key_beg = key_beg;
1987 	info.key_end = key_end;
1988 	info.key_next = *key_nextp;
1989 
1990 	RB_SCAN(hammer2_chain_tree, &parent->core.rbtree,
1991 		hammer2_chain_find_cmp, hammer2_chain_find_callback,
1992 		&info);
1993 	*key_nextp = info.key_next;
1994 #if 0
1995 	kprintf("chain_find %p %016jx:%016jx next=%016jx\n",
1996 		parent, key_beg, key_end, *key_nextp);
1997 #endif
1998 
1999 	return (info.best);
2000 }
2001 
2002 static
2003 int
2004 hammer2_chain_find_cmp(hammer2_chain_t *child, void *data)
2005 {
2006 	struct hammer2_chain_find_info *info = data;
2007 	hammer2_key_t child_beg;
2008 	hammer2_key_t child_end;
2009 
2010 	child_beg = child->bref.key;
2011 	child_end = child_beg + ((hammer2_key_t)1 << child->bref.keybits) - 1;
2012 
2013 	if (child_end < info->key_beg)
2014 		return(-1);
2015 	if (child_beg > info->key_end)
2016 		return(1);
2017 	return(0);
2018 }
2019 
/*
 * RB_SCAN callback for hammer2_chain_find().  Tracks the overlapping
 * child with the nearest base key in info->best and clips info->key_next
 * so the caller can iterate.  Always returns 0 (visit every overlapping
 * child).
 */
static
int
hammer2_chain_find_callback(hammer2_chain_t *child, void *data)
{
	struct hammer2_chain_find_info *info = data;
	hammer2_chain_t *best;
	hammer2_key_t child_end;

	/*
	 * WARNING! Layerq is scanned forwards, exact matches should keep
	 *	    the existing info->best.
	 */
	if ((best = info->best) == NULL) {
		/*
		 * No previous best.  Assign best
		 */
		info->best = child;
	} else if (best->bref.key <= info->key_beg &&
		   child->bref.key <= info->key_beg) {
		/*
		 * Illegal overlap.
		 */
		KKASSERT(0);
		/*info->best = child;*/
	} else if (child->bref.key < best->bref.key) {
		/*
		 * Child has a nearer key and best is not flush with key_beg.
		 * Set best to child.  Truncate key_next to the old best key.
		 *
		 * NOTE: 'best' still refers to the previous best here; the
		 *	 truncation below intentionally uses the OLD key.
		 */
		info->best = child;
		if (info->key_next > best->bref.key || info->key_next == 0)
			info->key_next = best->bref.key;
	} else if (child->bref.key == best->bref.key) {
		/*
		 * If our current best is flush with the child then this
		 * is an illegal overlap.
		 *
		 * key_next will automatically be limited to the smaller of
		 * the two end-points.
		 */
		KKASSERT(0);
		info->best = child;
	} else {
		/*
		 * Keep the current best but truncate key_next to the child's
		 * base.
		 *
		 * key_next will also automatically be limited to the smaller
		 * of the two end-points (probably not necessary for this case
		 * but we do it anyway).
		 */
		if (info->key_next > child->bref.key || info->key_next == 0)
			info->key_next = child->bref.key;
	}

	/*
	 * Always truncate key_next based on child's end-of-range.
	 * (child_end == 0 means the range wrapped the 64-bit key space
	 * and imposes no limit.)
	 */
	child_end = child->bref.key + ((hammer2_key_t)1 << child->bref.keybits);
	if (child_end && (info->key_next > child_end || info->key_next == 0))
		info->key_next = child_end;

	return(0);
}
2084 
2085 /*
2086  * Retrieve the specified chain from a media blockref, creating the
2087  * in-memory chain structure which reflects it.  The returned chain is
2088  * held but not locked.  The caller must lock it to crc-check and
2089  * dereference its data, and should check chain->error after locking
2090  * before assuming that the data is good.
2091  *
2092  * To handle insertion races pass the INSERT_RACE flag along with the
2093  * generation number of the core.  NULL will be returned if the generation
2094  * number changes before we have a chance to insert the chain.  Insert
2095  * races can occur because the parent might be held shared.
2096  *
2097  * Caller must hold the parent locked shared or exclusive since we may
2098  * need the parent's bref array to find our block.
2099  *
2100  * WARNING! chain->pmp is always set to NULL for any chain representing
2101  *	    part of the super-root topology.
2102  */
2103 hammer2_chain_t *
2104 hammer2_chain_get(hammer2_chain_t *parent, int generation,
2105 		  hammer2_blockref_t *bref)
2106 {
2107 	hammer2_dev_t *hmp = parent->hmp;
2108 	hammer2_chain_t *chain;
2109 	int error;
2110 
2111 	/*
2112 	 * Allocate a chain structure representing the existing media
2113 	 * entry.  Resulting chain has one ref and is not locked.
2114 	 */
2115 	if (bref->flags & HAMMER2_BREF_FLAG_PFSROOT)
2116 		chain = hammer2_chain_alloc(hmp, NULL, bref);
2117 	else
2118 		chain = hammer2_chain_alloc(hmp, parent->pmp, bref);
2119 	/* ref'd chain returned */
2120 
2121 	/*
2122 	 * Flag that the chain is in the parent's blockmap so delete/flush
2123 	 * knows what to do with it.
2124 	 */
2125 	atomic_set_int(&chain->flags, HAMMER2_CHAIN_BMAPPED);
2126 
2127 	/*
2128 	 * Link the chain into its parent.  A spinlock is required to safely
2129 	 * access the RBTREE, and it is possible to collide with another
2130 	 * hammer2_chain_get() operation because the caller might only hold
2131 	 * a shared lock on the parent.
2132 	 *
2133 	 * NOTE: Get races can occur quite often when we distribute
2134 	 *	 asynchronous read-aheads across multiple threads.
2135 	 */
2136 	KKASSERT(parent->refs > 0);
2137 	error = hammer2_chain_insert(parent, chain,
2138 				     HAMMER2_CHAIN_INSERT_SPIN |
2139 				     HAMMER2_CHAIN_INSERT_RACE,
2140 				     generation);
2141 	if (error) {
2142 		KKASSERT((chain->flags & HAMMER2_CHAIN_ONRBTREE) == 0);
2143 		/*kprintf("chain %p get race\n", chain);*/
2144 		hammer2_chain_drop(chain);
2145 		chain = NULL;
2146 	} else {
2147 		KKASSERT(chain->flags & HAMMER2_CHAIN_ONRBTREE);
2148 	}
2149 
2150 	/*
2151 	 * Return our new chain referenced but not locked, or NULL if
2152 	 * a race occurred.
2153 	 */
2154 	return (chain);
2155 }
2156 
2157 /*
2158  * Lookup initialization/completion API
2159  */
2160 hammer2_chain_t *
2161 hammer2_chain_lookup_init(hammer2_chain_t *parent, int flags)
2162 {
2163 	hammer2_chain_ref(parent);
2164 	if (flags & HAMMER2_LOOKUP_SHARED) {
2165 		hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS |
2166 					   HAMMER2_RESOLVE_SHARED);
2167 	} else {
2168 		hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
2169 	}
2170 	return (parent);
2171 }
2172 
2173 void
2174 hammer2_chain_lookup_done(hammer2_chain_t *parent)
2175 {
2176 	if (parent) {
2177 		hammer2_chain_unlock(parent);
2178 		hammer2_chain_drop(parent);
2179 	}
2180 }
2181 
2182 /*
2183  * Take the locked chain and return a locked parent.  The chain remains
2184  * locked on return, but may have to be temporarily unlocked to acquire
2185  * the parent.  Can return NULL.
2186  *
2187  * Pass HAMMER2_RESOLVE_* flags in flags.
2188  *
2189  * This will work even if the chain is errored, and the caller can check
2190  * parent->error on return if desired since the parent will be locked.
2191  *
2192  * This function handles the lock order reversal.
2193  */
hammer2_chain_t *
hammer2_chain_getparent(hammer2_chain_t *chain, int flags)
{
	hammer2_chain_t *parent;

	/*
	 * Be careful of order, chain must be unlocked before parent
	 * is locked below to avoid a deadlock (lock order is always
	 * parent before child).
	 *
	 * Safe access to chain->parent requires chain's core spinlock.
	 */
again:
	hammer2_spin_ex(&chain->core.spin);
	parent = chain->parent;
	if (parent == NULL) {
		hammer2_spin_unex(&chain->core.spin);
		panic("hammer2_chain_getparent: no parent");
	}
	/* ref the parent while under the spinlock so it cannot go away */
	hammer2_chain_ref(parent);
	hammer2_spin_unex(&chain->core.spin);

	/* unlock chain, lock parent, then re-lock chain (top-down order) */
	hammer2_chain_unlock(chain);
	hammer2_chain_lock(parent, flags);
	hammer2_chain_lock(chain, flags);

	/*
	 * Parent relinking races are quite common.  We have to get it right
	 * or we will blow up the block table.  If the parent changed while
	 * the chain was unlocked, drop it and retry.
	 */
	if (chain->parent != parent) {
		hammer2_chain_unlock(parent);
		hammer2_chain_drop(parent);
		goto again;
	}
	return parent;
}
2230 
2231 /*
2232  * Take the locked chain and return a locked parent.  The chain is unlocked
2233  * and dropped.  *chainp is set to the returned parent as a convenience.
2234  * Pass HAMMER2_RESOLVE_* flags in flags.
2235  *
2236  * This will work even if the chain is errored, and the caller can check
2237  * parent->error on return if desired since the parent will be locked.
2238  *
2239  * This function handles the lock order reversal.
2240  */
2241 hammer2_chain_t *
2242 hammer2_chain_repparent(hammer2_chain_t **chainp, int flags)
2243 {
2244 	hammer2_chain_t *chain;
2245 	hammer2_chain_t *parent;
2246 
2247 	/*
2248 	 * Be careful of order, chain must be unlocked before parent
2249 	 * is locked below to avoid a deadlock.
2250 	 *
2251 	 * Safe access to fu->parent requires fu's core spinlock.
2252 	 */
2253 	chain = *chainp;
2254 again:
2255 	hammer2_spin_ex(&chain->core.spin);
2256 	parent = chain->parent;
2257 	if (parent == NULL) {
2258 		hammer2_spin_unex(&chain->core.spin);
2259 		panic("hammer2_chain_getparent: no parent");
2260 	}
2261 	hammer2_chain_ref(parent);
2262 	hammer2_spin_unex(&chain->core.spin);
2263 
2264 	hammer2_chain_unlock(chain);
2265 	hammer2_chain_lock(parent, flags);
2266 
2267 	/*
2268 	 * Parent relinking races are quite common.  We have to get it right
2269 	 * or we will blow up the block table.
2270 	 */
2271 	if (chain->parent != parent) {
2272 		hammer2_chain_lock(chain, flags);
2273 		hammer2_chain_unlock(parent);
2274 		hammer2_chain_drop(parent);
2275 		goto again;
2276 	}
2277 	hammer2_chain_drop(chain);
2278 	*chainp = parent;
2279 
2280 	return parent;
2281 }
2282 
2283 /*
 * Locate the first chain whose key range overlaps (key_beg, key_end) inclusive.
2285  * (*parentp) typically points to an inode but can also point to a related
2286  * indirect block and this function will recurse upwards and find the inode
2287  * or the nearest undeleted indirect block covering the key range.
2288  *
2289  * This function unconditionally sets *errorp, replacing any previous value.
2290  *
2291  * (*parentp) must be exclusive or shared locked (depending on flags) and
2292  * referenced and can be an inode or an existing indirect block within the
2293  * inode.
2294  *
2295  * If (*parent) is errored out, this function will not attempt to recurse
2296  * the radix tree and will return NULL along with an appropriate *errorp.
2297  * If NULL is returned and *errorp is 0, the requested lookup could not be
2298  * located.
2299  *
2300  * On return (*parentp) will be modified to point at the deepest parent chain
2301  * element encountered during the search, as a helper for an insertion or
2302  * deletion.
2303  *
2304  * The new (*parentp) will be locked shared or exclusive (depending on flags),
2305  * and referenced, and the old will be unlocked and dereferenced (no change
2306  * if they are both the same).  This is particularly important if the caller
2307  * wishes to insert a new chain, (*parentp) will be set properly even if NULL
2308  * is returned, as long as no error occurred.
2309  *
2310  * The matching chain will be returned locked according to flags.  If NOLOCK
2311  * is requested the chain will be returned only referenced.  Note that the
2312  * parent chain must always be locked shared or exclusive, matching the
2313  * HAMMER2_LOOKUP_SHARED flag.  We can conceivably lock it SHARED temporarily
2314  * when NOLOCK is specified but that complicates matters if *parentp must
2315  * inherit the chain.
2316  *
2317  * NOLOCK also implies NODATA, since an unlocked chain usually has a NULL
2318  * data pointer or can otherwise be in flux.
2319  *
2320  * NULL is returned if no match was found, but (*parentp) will still
2321  * potentially be adjusted.
2322  *
2323  * On return (*key_nextp) will point to an iterative value for key_beg.
2324  * (If NULL is returned (*key_nextp) is set to (key_end + 1)).
2325  *
2326  * This function will also recurse up the chain if the key is not within the
2327  * current parent's range.  (*parentp) can never be set to NULL.  An iteration
2328  * can simply allow (*parentp) to float inside the loop.
2329  *
2330  * NOTE!  chain->data is not always resolved.  By default it will not be
2331  *	  resolved for BREF_TYPE_DATA, FREEMAP_NODE, or FREEMAP_LEAF.  Use
2332  *	  HAMMER2_LOOKUP_ALWAYS to force resolution (but be careful w/
2333  *	  BREF_TYPE_DATA as the device buffer can alias the logical file
2334  *	  buffer).
2335  */
2336 
2337 hammer2_chain_t *
2338 hammer2_chain_lookup(hammer2_chain_t **parentp, hammer2_key_t *key_nextp,
2339 		     hammer2_key_t key_beg, hammer2_key_t key_end,
2340 		     int *errorp, int flags)
2341 {
2342 	hammer2_dev_t *hmp;
2343 	hammer2_chain_t *parent;
2344 	hammer2_chain_t *chain;
2345 	hammer2_blockref_t *base;
2346 	hammer2_blockref_t *bref;
2347 	hammer2_blockref_t bcopy;
2348 	hammer2_key_t scan_beg;
2349 	hammer2_key_t scan_end;
2350 	int count = 0;
2351 	int how_always = HAMMER2_RESOLVE_ALWAYS;
2352 	int how_maybe = HAMMER2_RESOLVE_MAYBE;
2353 	int how;
2354 	int generation;
2355 	int maxloops = 300000;
2356 	volatile hammer2_mtx_t save_mtx;
2357 
2358 	if (flags & HAMMER2_LOOKUP_ALWAYS) {
2359 		how_maybe = how_always;
2360 		how = HAMMER2_RESOLVE_ALWAYS;
2361 	} else if (flags & (HAMMER2_LOOKUP_NODATA | HAMMER2_LOOKUP_NOLOCK)) {
2362 		how = HAMMER2_RESOLVE_NEVER;
2363 	} else {
2364 		how = HAMMER2_RESOLVE_MAYBE;
2365 	}
2366 	if (flags & HAMMER2_LOOKUP_SHARED) {
2367 		how_maybe |= HAMMER2_RESOLVE_SHARED;
2368 		how_always |= HAMMER2_RESOLVE_SHARED;
2369 		how |= HAMMER2_RESOLVE_SHARED;
2370 	}
2371 
2372 	/*
2373 	 * Recurse (*parentp) upward if necessary until the parent completely
2374 	 * encloses the key range or we hit the inode.
2375 	 *
2376 	 * Handle races against the flusher deleting indirect nodes on its
2377 	 * way back up by continuing to recurse upward past the deletion.
2378 	 */
2379 	parent = *parentp;
2380 	hmp = parent->hmp;
2381 	*errorp = 0;
2382 
2383 	while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
2384 	       parent->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
2385 		scan_beg = parent->bref.key;
2386 		scan_end = scan_beg +
2387 			   ((hammer2_key_t)1 << parent->bref.keybits) - 1;
2388 		if ((parent->flags & HAMMER2_CHAIN_DELETED) == 0) {
2389 			if (key_beg >= scan_beg && key_end <= scan_end)
2390 				break;
2391 		}
2392 		parent = hammer2_chain_repparent(parentp, how_maybe);
2393 	}
2394 again:
2395 
2396 	if (--maxloops == 0)
2397 		panic("hammer2_chain_lookup: maxloops");
2398 	/*
2399 	 * Locate the blockref array.  Currently we do a fully associative
2400 	 * search through the array.
2401 	 */
2402 	switch(parent->bref.type) {
2403 	case HAMMER2_BREF_TYPE_INODE:
2404 		/*
2405 		 * Special shortcut for embedded data returns the inode
2406 		 * itself.  Callers must detect this condition and access
2407 		 * the embedded data (the strategy code does this for us).
2408 		 *
2409 		 * This is only applicable to regular files and softlinks.
2410 		 *
2411 		 * We need a second lock on parent.  Since we already have
2412 		 * a lock we must pass LOCKAGAIN to prevent unexpected
2413 		 * blocking (we don't want to block on a second shared
2414 		 * ref if an exclusive lock is pending)
2415 		 */
2416 		if (parent->data->ipdata.meta.op_flags &
2417 		    HAMMER2_OPFLAG_DIRECTDATA) {
2418 			if (flags & HAMMER2_LOOKUP_NODIRECT) {
2419 				chain = NULL;
2420 				*key_nextp = key_end + 1;
2421 				goto done;
2422 			}
2423 			hammer2_chain_ref(parent);
2424 			if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
2425 				hammer2_chain_lock(parent,
2426 						   how_always |
2427 						    HAMMER2_RESOLVE_LOCKAGAIN);
2428 			*key_nextp = key_end + 1;
2429 			return (parent);
2430 		}
2431 		base = &parent->data->ipdata.u.blockset.blockref[0];
2432 		count = HAMMER2_SET_COUNT;
2433 		break;
2434 	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
2435 	case HAMMER2_BREF_TYPE_INDIRECT:
2436 		/*
2437 		 * Handle MATCHIND on the parent
2438 		 */
2439 		if (flags & HAMMER2_LOOKUP_MATCHIND) {
2440 			scan_beg = parent->bref.key;
2441 			scan_end = scan_beg +
2442 			       ((hammer2_key_t)1 << parent->bref.keybits) - 1;
2443 			if (key_beg == scan_beg && key_end == scan_end) {
2444 				chain = parent;
2445 				hammer2_chain_ref(chain);
2446 				hammer2_chain_lock(chain, how_maybe);
2447 				*key_nextp = scan_end + 1;
2448 				goto done;
2449 			}
2450 		}
2451 
2452 		/*
2453 		 * Optimize indirect blocks in the INITIAL state to avoid
2454 		 * I/O.
2455 		 */
2456 		if (parent->flags & HAMMER2_CHAIN_INITIAL) {
2457 			base = NULL;
2458 		} else {
2459 			if (parent->data == NULL) {
2460 				kprintf("parent->data is NULL %p\n", parent);
2461 				while (1)
2462 					tsleep(parent, 0, "xxx", 0);
2463 			}
2464 			base = &parent->data->npdata[0];
2465 		}
2466 		count = parent->bytes / sizeof(hammer2_blockref_t);
2467 		break;
2468 	case HAMMER2_BREF_TYPE_VOLUME:
2469 		base = &parent->data->voldata.sroot_blockset.blockref[0];
2470 		count = HAMMER2_SET_COUNT;
2471 		break;
2472 	case HAMMER2_BREF_TYPE_FREEMAP:
2473 		base = &parent->data->blkset.blockref[0];
2474 		count = HAMMER2_SET_COUNT;
2475 		break;
2476 	default:
2477 		kprintf("hammer2_chain_lookup: unrecognized "
2478 			"blockref(B) type: %d",
2479 			parent->bref.type);
2480 		while (1)
2481 			tsleep(&base, 0, "dead", 0);
2482 		panic("hammer2_chain_lookup: unrecognized "
2483 		      "blockref(B) type: %d",
2484 		      parent->bref.type);
2485 		base = NULL;	/* safety */
2486 		count = 0;	/* safety */
2487 	}
2488 
2489 	/*
2490 	 * No lookup is possible if the parent is errored.  We delayed
2491 	 * this check as long as we could to ensure that the parent backup,
2492 	 * embedded data, and MATCHIND code could still execute.
2493 	 */
2494 	if (parent->error) {
2495 		*errorp = parent->error;
2496 		return NULL;
2497 	}
2498 
2499 	/*
2500 	 * Merged scan to find next candidate.
2501 	 *
2502 	 * hammer2_base_*() functions require the parent->core.live_* fields
2503 	 * to be synchronized.
2504 	 *
2505 	 * We need to hold the spinlock to access the block array and RB tree
2506 	 * and to interlock chain creation.
2507 	 */
2508 	if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0)
2509 		hammer2_chain_countbrefs(parent, base, count);
2510 
2511 	/*
2512 	 * Combined search
2513 	 */
2514 	hammer2_spin_ex(&parent->core.spin);
2515 	chain = hammer2_combined_find(parent, base, count,
2516 				      key_nextp,
2517 				      key_beg, key_end,
2518 				      &bref);
2519 	generation = parent->core.generation;
2520 
2521 	/*
2522 	 * Exhausted parent chain, iterate.
2523 	 */
2524 	if (bref == NULL) {
2525 		KKASSERT(chain == NULL);
2526 		hammer2_spin_unex(&parent->core.spin);
2527 		if (key_beg == key_end)	/* short cut single-key case */
2528 			return (NULL);
2529 
2530 		/*
2531 		 * Stop if we reached the end of the iteration.
2532 		 */
2533 		if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT &&
2534 		    parent->bref.type != HAMMER2_BREF_TYPE_FREEMAP_NODE) {
2535 			return (NULL);
2536 		}
2537 
2538 		/*
2539 		 * Calculate next key, stop if we reached the end of the
2540 		 * iteration, otherwise go up one level and loop.
2541 		 */
2542 		key_beg = parent->bref.key +
2543 			  ((hammer2_key_t)1 << parent->bref.keybits);
2544 		if (key_beg == 0 || key_beg > key_end)
2545 			return (NULL);
2546 		parent = hammer2_chain_repparent(parentp, how_maybe);
2547 		goto again;
2548 	}
2549 
2550 	/*
2551 	 * Selected from blockref or in-memory chain.
2552 	 */
2553 	if (chain == NULL) {
2554 		bcopy = *bref;
2555 		hammer2_spin_unex(&parent->core.spin);
2556 		chain = hammer2_chain_get(parent, generation,
2557 					  &bcopy);
2558 		if (chain == NULL) {
2559 			/*
2560 			kprintf("retry lookup parent %p keys %016jx:%016jx\n",
2561 				parent, key_beg, key_end);
2562 			*/
2563 			goto again;
2564 		}
2565 		if (bcmp(&bcopy, bref, sizeof(bcopy))) {
2566 			hammer2_chain_drop(chain);
2567 			chain = NULL;	/* SAFETY */
2568 			goto again;
2569 		}
2570 	} else {
2571 		hammer2_chain_ref(chain);
2572 		hammer2_spin_unex(&parent->core.spin);
2573 	}
2574 
2575 	/*
2576 	 * chain is referenced but not locked.  We must lock the chain
2577 	 * to obtain definitive state.
2578 	 */
2579 	if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
2580 	    chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
2581 		hammer2_chain_lock(chain, how_maybe);
2582 	} else {
2583 		hammer2_chain_lock(chain, how);
2584 	}
2585 	KKASSERT(chain->parent == parent);
2586 
2587 	/*
2588 	 * Skip deleted chains (XXX cache 'i' end-of-block-array? XXX)
2589 	 *
2590 	 * NOTE: Chain's key range is not relevant as there might be
2591 	 *	 one-offs within the range that are not deleted.
2592 	 *
2593 	 * NOTE: Lookups can race delete-duplicate because
2594 	 *	 delete-duplicate does not lock the parent's core
2595 	 *	 (they just use the spinlock on the core).
2596 	 */
2597 	if (chain->flags & HAMMER2_CHAIN_DELETED) {
2598 		kprintf("skip deleted chain %016jx.%02x key=%016jx\n",
2599 			chain->bref.data_off, chain->bref.type,
2600 			chain->bref.key);
2601 		hammer2_chain_unlock(chain);
2602 		hammer2_chain_drop(chain);
2603 		chain = NULL;	/* SAFETY */
2604 		key_beg = *key_nextp;
2605 		if (key_beg == 0 || key_beg > key_end)
2606 			return(NULL);
2607 		goto again;
2608 	}
2609 
2610 	/*
2611 	 * If the chain element is an indirect block it becomes the new
2612 	 * parent and we loop on it.  We must maintain our top-down locks
2613 	 * to prevent the flusher from interfering (i.e. doing a
2614 	 * delete-duplicate and leaving us recursing down a deleted chain).
2615 	 *
2616 	 * The parent always has to be locked with at least RESOLVE_MAYBE
2617 	 * so we can access its data.  It might need a fixup if the caller
2618 	 * passed incompatible flags.  Be careful not to cause a deadlock
2619 	 * as a data-load requires an exclusive lock.
2620 	 *
2621 	 * If HAMMER2_LOOKUP_MATCHIND is set and the indirect block's key
2622 	 * range is within the requested key range we return the indirect
2623 	 * block and do NOT loop.  This is usually only used to acquire
2624 	 * freemap nodes.
2625 	 */
2626 	if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
2627 	    chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
2628 		save_mtx = parent->lock;
2629 		hammer2_chain_unlock(parent);
2630 		hammer2_chain_drop(parent);
2631 		*parentp = parent = chain;
2632 		chain = NULL;	/* SAFETY */
2633 		goto again;
2634 	}
2635 done:
2636 	/*
2637 	 * All done, return the chain.
2638 	 *
2639 	 * If the caller does not want a locked chain, replace the lock with
2640 	 * a ref.  Perhaps this can eventually be optimized to not obtain the
2641 	 * lock in the first place for situations where the data does not
2642 	 * need to be resolved.
2643 	 *
2644 	 * NOTE! A chain->error must be tested by the caller upon return.
2645 	 *	 *errorp is only set based on issues which occur while
2646 	 *	 trying to reach the chain.
2647 	 */
2648 	if (chain) {
2649 		if (flags & HAMMER2_LOOKUP_NOLOCK)
2650 			hammer2_chain_unlock(chain);
2651 	}
2652 	return (chain);
2653 }
2654 
2655 /*
2656  * After having issued a lookup we can iterate all matching keys.
2657  *
2658  * If chain is non-NULL we continue the iteration from just after it's index.
2659  *
2660  * If chain is NULL we assume the parent was exhausted and continue the
2661  * iteration at the next parent.
2662  *
2663  * If a fatal error occurs (typically an I/O error), a dummy chain is
2664  * returned with chain->error and error-identifying information set.  This
2665  * chain will assert if you try to do anything fancy with it.
2666  *
2667  * XXX Depending on where the error occurs we should allow continued iteration.
2668  *
2669  * parent must be locked on entry and remains locked throughout.  chain's
2670  * lock status must match flags.  Chain is always at least referenced.
2671  *
2672  * WARNING!  The MATCHIND flag does not apply to this function.
2673  */
hammer2_chain_t *
hammer2_chain_next(hammer2_chain_t **parentp, hammer2_chain_t *chain,
		   hammer2_key_t *key_nextp,
		   hammer2_key_t key_beg, hammer2_key_t key_end,
		   int *errorp, int flags)
{
	hammer2_chain_t *parent;
	int how_maybe;

	/*
	 * Calculate locking flags for upward recursion.
	 */
	how_maybe = HAMMER2_RESOLVE_MAYBE;
	if (flags & HAMMER2_LOOKUP_SHARED)
		how_maybe |= HAMMER2_RESOLVE_SHARED;

	parent = *parentp;
	*errorp = 0;

	/*
	 * Calculate the next index and recalculate the parent if necessary.
	 */
	if (chain) {
		/*
		 * Continue just past the previously returned chain's key
		 * range.  The addition can wrap to 0 at the top of the
		 * 64-bit key space, which terminates the iteration below.
		 */
		key_beg = chain->bref.key +
			  ((hammer2_key_t)1 << chain->bref.keybits);
		if ((flags & (HAMMER2_LOOKUP_NOLOCK |
			      HAMMER2_LOOKUP_NOUNLOCK)) == 0) {
			hammer2_chain_unlock(chain);
		}
		hammer2_chain_drop(chain);

		/*
		 * chain invalid past this point, but we can still do a
		 * pointer comparison w/parent.
		 *
		 * Any scan where the lookup returned degenerate data embedded
		 * in the inode has an invalid index and must terminate.
		 */
		if (chain == parent)
			return(NULL);
		if (key_beg == 0 || key_beg > key_end)
			return(NULL);
		chain = NULL;
	} else if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT &&
		   parent->bref.type != HAMMER2_BREF_TYPE_FREEMAP_NODE) {
		/*
		 * We reached the end of the iteration.
		 */
		return (NULL);
	} else {
		/*
		 * Continue iteration with next parent unless the current
		 * parent covers the range.
		 *
		 * (This also handles the case of a deleted, empty indirect
		 * node).
		 */
		key_beg = parent->bref.key +
			  ((hammer2_key_t)1 << parent->bref.keybits);
		if (key_beg == 0 || key_beg > key_end)
			return (NULL);
		parent = hammer2_chain_repparent(parentp, how_maybe);
	}

	/*
	 * And execute the lookup from the recalculated starting key.
	 */
	return (hammer2_chain_lookup(parentp, key_nextp,
				     key_beg, key_end,
				     errorp, flags));
}
2745 
2746 /*
2747  * Caller wishes to iterate chains under parent, loading new chains into
2748  * chainp.  Caller must initialize *chainp to NULL and *firstp to 1, and
2749  * then call hammer2_chain_scan() repeatedly until a non-zero return.
2750  * During the scan, *firstp will be set to 0 and (*chainp) will be replaced
2751  * with the returned chain for the scan.  The returned *chainp will be
2752  * locked and referenced.  Any prior contents will be unlocked and dropped.
2753  *
2754  * Caller should check the return value.  A normal scan EOF will return
2755  * exactly HAMMER2_ERROR_EOF.  Any other non-zero value indicates an
2756  * error trying to access parent data.  Any error in the returned chain
2757  * must be tested separately by the caller.
2758  *
2759  * (*chainp) is dropped on each scan, but will only be set if the returned
2760  * element itself can recurse.  Leaf elements are NOT resolved, loaded, or
2761  * returned via *chainp.  The caller will get their bref only.
2762  *
2763  * The raw scan function is similar to lookup/next but does not seek to a key.
2764  * Blockrefs are iterated via first_bref = (parent, NULL) and
2765  * next_chain = (parent, bref).
2766  *
2767  * The passed-in parent must be locked and its data resolved.  The function
2768  * nominally returns a locked and referenced *chainp != NULL for chains
 * the caller might need to recurse on (and will dispose of any *chainp passed
2770  * in).  The caller must check the chain->bref.type either way.
2771  */
2772 int
2773 hammer2_chain_scan(hammer2_chain_t *parent, hammer2_chain_t **chainp,
2774 		   hammer2_blockref_t *bref, int *firstp,
2775 		   int flags)
2776 {
2777 	hammer2_dev_t *hmp;
2778 	hammer2_blockref_t *base;
2779 	hammer2_blockref_t *bref_ptr;
2780 	hammer2_key_t key;
2781 	hammer2_key_t next_key;
2782 	hammer2_chain_t *chain = NULL;
2783 	int count = 0;
2784 	int how_always = HAMMER2_RESOLVE_ALWAYS;
2785 	int how_maybe = HAMMER2_RESOLVE_MAYBE;
2786 	int how;
2787 	int generation;
2788 	int maxloops = 300000;
2789 	int error;
2790 
2791 	hmp = parent->hmp;
2792 	error = 0;
2793 
2794 	/*
2795 	 * Scan flags borrowed from lookup.
2796 	 */
2797 	if (flags & HAMMER2_LOOKUP_ALWAYS) {
2798 		how_maybe = how_always;
2799 		how = HAMMER2_RESOLVE_ALWAYS;
2800 	} else if (flags & (HAMMER2_LOOKUP_NODATA | HAMMER2_LOOKUP_NOLOCK)) {
2801 		how = HAMMER2_RESOLVE_NEVER;
2802 	} else {
2803 		how = HAMMER2_RESOLVE_MAYBE;
2804 	}
2805 	if (flags & HAMMER2_LOOKUP_SHARED) {
2806 		how_maybe |= HAMMER2_RESOLVE_SHARED;
2807 		how_always |= HAMMER2_RESOLVE_SHARED;
2808 		how |= HAMMER2_RESOLVE_SHARED;
2809 	}
2810 
2811 	/*
2812 	 * Calculate key to locate first/next element, unlocking the previous
2813 	 * element as we go.  Be careful, the key calculation can overflow.
2814 	 *
2815 	 * (also reset bref to NULL)
2816 	 */
2817 	if (*firstp) {
2818 		key = 0;
2819 		*firstp = 0;
2820 	} else {
2821 		key = bref->key + ((hammer2_key_t)1 << bref->keybits);
2822 		if ((chain = *chainp) != NULL) {
2823 			*chainp = NULL;
2824 			hammer2_chain_unlock(chain);
2825 			hammer2_chain_drop(chain);
2826 			chain = NULL;
2827 		}
2828 		if (key == 0) {
2829 			error |= HAMMER2_ERROR_EOF;
2830 			goto done;
2831 		}
2832 	}
2833 
2834 again:
2835 	if (parent->error) {
2836 		error = parent->error;
2837 		goto done;
2838 	}
2839 	if (--maxloops == 0)
2840 		panic("hammer2_chain_scan: maxloops");
2841 
2842 	/*
2843 	 * Locate the blockref array.  Currently we do a fully associative
2844 	 * search through the array.
2845 	 */
2846 	switch(parent->bref.type) {
2847 	case HAMMER2_BREF_TYPE_INODE:
2848 		/*
2849 		 * An inode with embedded data has no sub-chains.
2850 		 *
2851 		 * WARNING! Bulk scan code may pass a static chain marked
2852 		 *	    as BREF_TYPE_INODE with a copy of the volume
2853 		 *	    root blockset to snapshot the volume.
2854 		 */
2855 		if (parent->data->ipdata.meta.op_flags &
2856 		    HAMMER2_OPFLAG_DIRECTDATA) {
2857 			error |= HAMMER2_ERROR_EOF;
2858 			goto done;
2859 		}
2860 		base = &parent->data->ipdata.u.blockset.blockref[0];
2861 		count = HAMMER2_SET_COUNT;
2862 		break;
2863 	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
2864 	case HAMMER2_BREF_TYPE_INDIRECT:
2865 		/*
2866 		 * Optimize indirect blocks in the INITIAL state to avoid
2867 		 * I/O.
2868 		 */
2869 		if (parent->flags & HAMMER2_CHAIN_INITIAL) {
2870 			base = NULL;
2871 		} else {
2872 			if (parent->data == NULL)
2873 				panic("parent->data is NULL");
2874 			base = &parent->data->npdata[0];
2875 		}
2876 		count = parent->bytes / sizeof(hammer2_blockref_t);
2877 		break;
2878 	case HAMMER2_BREF_TYPE_VOLUME:
2879 		base = &parent->data->voldata.sroot_blockset.blockref[0];
2880 		count = HAMMER2_SET_COUNT;
2881 		break;
2882 	case HAMMER2_BREF_TYPE_FREEMAP:
2883 		base = &parent->data->blkset.blockref[0];
2884 		count = HAMMER2_SET_COUNT;
2885 		break;
2886 	default:
2887 		panic("hammer2_chain_lookup: unrecognized blockref type: %d",
2888 		      parent->bref.type);
2889 		base = NULL;	/* safety */
2890 		count = 0;	/* safety */
2891 	}
2892 
2893 	/*
2894 	 * Merged scan to find next candidate.
2895 	 *
2896 	 * hammer2_base_*() functions require the parent->core.live_* fields
2897 	 * to be synchronized.
2898 	 *
2899 	 * We need to hold the spinlock to access the block array and RB tree
2900 	 * and to interlock chain creation.
2901 	 */
2902 	if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0)
2903 		hammer2_chain_countbrefs(parent, base, count);
2904 
2905 	next_key = 0;
2906 	bref_ptr = NULL;
2907 	hammer2_spin_ex(&parent->core.spin);
2908 	chain = hammer2_combined_find(parent, base, count,
2909 				      &next_key,
2910 				      key, HAMMER2_KEY_MAX,
2911 				      &bref_ptr);
2912 	generation = parent->core.generation;
2913 
2914 	/*
2915 	 * Exhausted parent chain, we're done.
2916 	 */
2917 	if (bref_ptr == NULL) {
2918 		hammer2_spin_unex(&parent->core.spin);
2919 		KKASSERT(chain == NULL);
2920 		error |= HAMMER2_ERROR_EOF;
2921 		goto done;
2922 	}
2923 
2924 	/*
2925 	 * Copy into the supplied stack-based blockref.
2926 	 */
2927 	*bref = *bref_ptr;
2928 
2929 	/*
2930 	 * Selected from blockref or in-memory chain.
2931 	 */
2932 	if (chain == NULL) {
2933 		switch(bref->type) {
2934 		case HAMMER2_BREF_TYPE_INODE:
2935 		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
2936 		case HAMMER2_BREF_TYPE_INDIRECT:
2937 		case HAMMER2_BREF_TYPE_VOLUME:
2938 		case HAMMER2_BREF_TYPE_FREEMAP:
2939 			/*
2940 			 * Recursion, always get the chain
2941 			 */
2942 			hammer2_spin_unex(&parent->core.spin);
2943 			chain = hammer2_chain_get(parent, generation, bref);
2944 			if (chain == NULL) {
2945 				kprintf("retry scan parent %p keys %016jx\n",
2946 					parent, key);
2947 				goto again;
2948 			}
2949 			if (bcmp(bref, bref_ptr, sizeof(*bref))) {
2950 				hammer2_chain_drop(chain);
2951 				chain = NULL;
2952 				goto again;
2953 			}
2954 			break;
2955 		default:
2956 			/*
2957 			 * No recursion, do not waste time instantiating
2958 			 * a chain, just iterate using the bref.
2959 			 */
2960 			hammer2_spin_unex(&parent->core.spin);
2961 			break;
2962 		}
2963 	} else {
2964 		/*
2965 		 * Recursion or not we need the chain in order to supply
2966 		 * the bref.
2967 		 */
2968 		hammer2_chain_ref(chain);
2969 		hammer2_spin_unex(&parent->core.spin);
2970 	}
2971 
2972 	/*
2973 	 * chain is referenced but not locked.  We must lock the chain
2974 	 * to obtain definitive state.
2975 	 */
2976 	if (chain)
2977 		hammer2_chain_lock(chain, how);
2978 
2979 	/*
2980 	 * Skip deleted chains (XXX cache 'i' end-of-block-array? XXX)
2981 	 *
2982 	 * NOTE: chain's key range is not relevant as there might be
2983 	 *	 one-offs within the range that are not deleted.
2984 	 *
2985 	 * NOTE: XXX this could create problems with scans used in
2986 	 *	 situations other than mount-time recovery.
2987 	 *
2988 	 * NOTE: Lookups can race delete-duplicate because
2989 	 *	 delete-duplicate does not lock the parent's core
2990 	 *	 (they just use the spinlock on the core).
2991 	 */
2992 	if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
2993 		hammer2_chain_unlock(chain);
2994 		hammer2_chain_drop(chain);
2995 		chain = NULL;
2996 
2997 		key = next_key;
2998 		if (key == 0) {
2999 			error |= HAMMER2_ERROR_EOF;
3000 			goto done;
3001 		}
3002 		goto again;
3003 	}
3004 
3005 done:
3006 	/*
3007 	 * All done, return the bref or NULL, supply chain if necessary.
3008 	 */
3009 	if (chain)
3010 		*chainp = chain;
3011 	return (error);
3012 }
3013 
3014 /*
3015  * Create and return a new hammer2 system memory structure of the specified
3016  * key, type and size and insert it under (*parentp).  This is a full
3017  * insertion, based on the supplied key/keybits, and may involve creating
3018  * indirect blocks and moving other chains around via delete/duplicate.
3019  *
3020  * THE CALLER MUST HAVE ALREADY PROPERLY SEEKED (*parentp) TO THE INSERTION
3021  * POINT SANS ANY REQUIRED INDIRECT BLOCK CREATIONS DUE TO THE ARRAY BEING
3022  * FULL.  This typically means that the caller is creating the chain after
3023  * doing a hammer2_chain_lookup().
3024  *
3025  * (*parentp) must be exclusive locked and may be replaced on return
3026  * depending on how much work the function had to do.
3027  *
3028  * (*parentp) must not be errored or this function will assert.
3029  *
3030  * (*chainp) usually starts out NULL and returns the newly created chain,
3031  * but if the caller desires the caller may allocate a disconnected chain
3032  * and pass it in instead.
3033  *
3034  * This function should NOT be used to insert INDIRECT blocks.  It is
3035  * typically used to create/insert inodes and data blocks.
3036  *
3037  * Caller must pass-in an exclusively locked parent the new chain is to
3038  * be inserted under, and optionally pass-in a disconnected, exclusively
3039  * locked chain to insert (else we create a new chain).  The function will
3040  * adjust (*parentp) as necessary, create or connect the chain, and
3041  * return an exclusively locked chain in *chainp.
3042  *
3043  * When creating a PFSROOT inode under the super-root, pmp is typically NULL
3044  * and will be reassigned.
3045  *
 * NOTE: returns HAMMER2_ERROR_* flags
3047  */
3048 int
3049 hammer2_chain_create(hammer2_chain_t **parentp, hammer2_chain_t **chainp,
3050 		     hammer2_pfs_t *pmp, int methods,
3051 		     hammer2_key_t key, int keybits, int type, size_t bytes,
3052 		     hammer2_tid_t mtid, hammer2_off_t dedup_off, int flags)
3053 {
3054 	hammer2_dev_t *hmp;
3055 	hammer2_chain_t *chain;
3056 	hammer2_chain_t *parent;
3057 	hammer2_blockref_t *base;
3058 	hammer2_blockref_t dummy;
3059 	int allocated = 0;
3060 	int error = 0;
3061 	int count;
3062 	int maxloops = 300000;
3063 
3064 	/*
3065 	 * Topology may be crossing a PFS boundary.
3066 	 */
3067 	parent = *parentp;
3068 	KKASSERT(hammer2_mtx_owned(&parent->lock));
3069 	KKASSERT(parent->error == 0);
3070 	hmp = parent->hmp;
3071 	chain = *chainp;
3072 
3073 	if (chain == NULL) {
3074 		/*
3075 		 * First allocate media space and construct the dummy bref,
3076 		 * then allocate the in-memory chain structure.  Set the
3077 		 * INITIAL flag for fresh chains which do not have embedded
3078 		 * data.
3079 		 *
3080 		 * XXX for now set the check mode of the child based on
3081 		 *     the parent or, if the parent is an inode, the
3082 		 *     specification in the inode.
3083 		 */
3084 		bzero(&dummy, sizeof(dummy));
3085 		dummy.type = type;
3086 		dummy.key = key;
3087 		dummy.keybits = keybits;
3088 		dummy.data_off = hammer2_getradix(bytes);
3089 
3090 		/*
3091 		 * Inherit methods from parent by default.  Primarily used
3092 		 * for BREF_TYPE_DATA.  Non-data types *must* be set to
3093 		 * a non-NONE check algorithm.
3094 		 */
3095 		if (methods == -1)
3096 			dummy.methods = parent->bref.methods;
3097 		else
3098 			dummy.methods = (uint8_t)methods;
3099 
3100 		if (type != HAMMER2_BREF_TYPE_DATA &&
3101 		    HAMMER2_DEC_CHECK(dummy.methods) == HAMMER2_CHECK_NONE) {
3102 			dummy.methods |=
3103 				HAMMER2_ENC_CHECK(HAMMER2_CHECK_DEFAULT);
3104 		}
3105 
3106 		chain = hammer2_chain_alloc(hmp, pmp, &dummy);
3107 
3108 		/*
3109 		 * Lock the chain manually, chain_lock will load the chain
3110 		 * which we do NOT want to do.  (note: chain->refs is set
3111 		 * to 1 by chain_alloc() for us, but lockcnt is not).
3112 		 */
3113 		chain->lockcnt = 1;
3114 		hammer2_mtx_ex(&chain->lock);
3115 		allocated = 1;
3116 		++curthread->td_tracker;
3117 
3118 		/*
3119 		 * Set INITIAL to optimize I/O.  The flag will generally be
3120 		 * processed when we call hammer2_chain_modify().
3121 		 *
3122 		 * Recalculate bytes to reflect the actual media block
3123 		 * allocation.  Handle special case radix 0 == 0 bytes.
3124 		 */
3125 		bytes = (size_t)(chain->bref.data_off & HAMMER2_OFF_MASK_RADIX);
3126 		if (bytes)
3127 			bytes = (hammer2_off_t)1 << bytes;
3128 		chain->bytes = bytes;
3129 
3130 		switch(type) {
3131 		case HAMMER2_BREF_TYPE_VOLUME:
3132 		case HAMMER2_BREF_TYPE_FREEMAP:
3133 			panic("hammer2_chain_create: called with volume type");
3134 			break;
3135 		case HAMMER2_BREF_TYPE_INDIRECT:
3136 			panic("hammer2_chain_create: cannot be used to"
3137 			      "create indirect block");
3138 			break;
3139 		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
3140 			panic("hammer2_chain_create: cannot be used to"
3141 			      "create freemap root or node");
3142 			break;
3143 		case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
3144 			KKASSERT(bytes == sizeof(chain->data->bmdata));
3145 			/* fall through */
3146 		case HAMMER2_BREF_TYPE_DIRENT:
3147 		case HAMMER2_BREF_TYPE_INODE:
3148 		case HAMMER2_BREF_TYPE_DATA:
3149 		default:
3150 			/*
3151 			 * leave chain->data NULL, set INITIAL
3152 			 */
3153 			KKASSERT(chain->data == NULL);
3154 			atomic_set_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
3155 			break;
3156 		}
3157 	} else {
3158 		/*
3159 		 * We are reattaching a previously deleted chain, possibly
3160 		 * under a new parent and possibly with a new key/keybits.
3161 		 * The chain does not have to be in a modified state.  The
3162 		 * UPDATE flag will be set later on in this routine.
3163 		 *
3164 		 * Do NOT mess with the current state of the INITIAL flag.
3165 		 */
3166 		chain->bref.key = key;
3167 		chain->bref.keybits = keybits;
3168 		if (chain->flags & HAMMER2_CHAIN_DELETED)
3169 			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DELETED);
3170 		KKASSERT(chain->parent == NULL);
3171 	}
3172 	if (flags & HAMMER2_INSERT_PFSROOT)
3173 		chain->bref.flags |= HAMMER2_BREF_FLAG_PFSROOT;
3174 	else
3175 		chain->bref.flags &= ~HAMMER2_BREF_FLAG_PFSROOT;
3176 
3177 	/*
3178 	 * Calculate how many entries we have in the blockref array and
3179 	 * determine if an indirect block is required.
3180 	 */
3181 again:
3182 	if (--maxloops == 0)
3183 		panic("hammer2_chain_create: maxloops");
3184 
3185 	switch(parent->bref.type) {
3186 	case HAMMER2_BREF_TYPE_INODE:
3187 		if ((parent->data->ipdata.meta.op_flags &
3188 		     HAMMER2_OPFLAG_DIRECTDATA) != 0) {
3189 			kprintf("hammer2: parent set for direct-data! "
3190 				"pkey=%016jx ckey=%016jx\n",
3191 				parent->bref.key,
3192 				chain->bref.key);
3193 	        }
3194 		KKASSERT((parent->data->ipdata.meta.op_flags &
3195 			  HAMMER2_OPFLAG_DIRECTDATA) == 0);
3196 		KKASSERT(parent->data != NULL);
3197 		base = &parent->data->ipdata.u.blockset.blockref[0];
3198 		count = HAMMER2_SET_COUNT;
3199 		break;
3200 	case HAMMER2_BREF_TYPE_INDIRECT:
3201 	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
3202 		if (parent->flags & HAMMER2_CHAIN_INITIAL)
3203 			base = NULL;
3204 		else
3205 			base = &parent->data->npdata[0];
3206 		count = parent->bytes / sizeof(hammer2_blockref_t);
3207 		break;
3208 	case HAMMER2_BREF_TYPE_VOLUME:
3209 		KKASSERT(parent->data != NULL);
3210 		base = &parent->data->voldata.sroot_blockset.blockref[0];
3211 		count = HAMMER2_SET_COUNT;
3212 		break;
3213 	case HAMMER2_BREF_TYPE_FREEMAP:
3214 		KKASSERT(parent->data != NULL);
3215 		base = &parent->data->blkset.blockref[0];
3216 		count = HAMMER2_SET_COUNT;
3217 		break;
3218 	default:
3219 		panic("hammer2_chain_create: unrecognized blockref type: %d",
3220 		      parent->bref.type);
3221 		base = NULL;
3222 		count = 0;
3223 		break;
3224 	}
3225 
3226 	/*
3227 	 * Make sure we've counted the brefs
3228 	 */
3229 	if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0)
3230 		hammer2_chain_countbrefs(parent, base, count);
3231 
3232 	KASSERT(parent->core.live_count >= 0 &&
3233 		parent->core.live_count <= count,
3234 		("bad live_count %d/%d (%02x, %d)",
3235 			parent->core.live_count, count,
3236 			parent->bref.type, parent->bytes));
3237 
3238 	/*
3239 	 * If no free blockref could be found we must create an indirect
3240 	 * block and move a number of blockrefs into it.  With the parent
3241 	 * locked we can safely lock each child in order to delete+duplicate
3242 	 * it without causing a deadlock.
3243 	 *
3244 	 * This may return the new indirect block or the old parent depending
3245 	 * on where the key falls.  NULL is returned on error.
3246 	 */
3247 	if (parent->core.live_count == count) {
3248 		hammer2_chain_t *nparent;
3249 
3250 		KKASSERT((flags & HAMMER2_INSERT_SAMEPARENT) == 0);
3251 
3252 		nparent = hammer2_chain_create_indirect(parent, key, keybits,
3253 							mtid, type, &error);
3254 		if (nparent == NULL) {
3255 			if (allocated)
3256 				hammer2_chain_drop(chain);
3257 			chain = NULL;
3258 			goto done;
3259 		}
3260 		if (parent != nparent) {
3261 			hammer2_chain_unlock(parent);
3262 			hammer2_chain_drop(parent);
3263 			parent = *parentp = nparent;
3264 		}
3265 		goto again;
3266 	}
3267 
3268 	if (chain->flags & HAMMER2_CHAIN_DELETED)
3269 		kprintf("Inserting deleted chain @%016jx\n",
3270 			chain->bref.key);
3271 
3272 	/*
3273 	 * Link the chain into its parent.
3274 	 */
3275 	if (chain->parent != NULL)
3276 		panic("hammer2: hammer2_chain_create: chain already connected");
3277 	KKASSERT(chain->parent == NULL);
3278 	KKASSERT(parent->core.live_count < count);
3279 	hammer2_chain_insert(parent, chain,
3280 			     HAMMER2_CHAIN_INSERT_SPIN |
3281 			     HAMMER2_CHAIN_INSERT_LIVE,
3282 			     0);
3283 
3284 	if (allocated) {
3285 		/*
3286 		 * Mark the newly created chain modified.  This will cause
3287 		 * UPDATE to be set and process the INITIAL flag.
3288 		 *
3289 		 * Device buffers are not instantiated for DATA elements
3290 		 * as these are handled by logical buffers.
3291 		 *
3292 		 * Indirect and freemap node indirect blocks are handled
3293 		 * by hammer2_chain_create_indirect() and not by this
3294 		 * function.
3295 		 *
3296 		 * Data for all other bref types is expected to be
3297 		 * instantiated (INODE, LEAF).
3298 		 */
3299 		switch(chain->bref.type) {
3300 		case HAMMER2_BREF_TYPE_DATA:
3301 		case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
3302 		case HAMMER2_BREF_TYPE_DIRENT:
3303 		case HAMMER2_BREF_TYPE_INODE:
3304 			error = hammer2_chain_modify(chain, mtid, dedup_off,
3305 						     HAMMER2_MODIFY_OPTDATA);
3306 			break;
3307 		default:
3308 			/*
3309 			 * Remaining types are not supported by this function.
3310 			 * In particular, INDIRECT and LEAF_NODE types are
3311 			 * handled by create_indirect().
3312 			 */
3313 			panic("hammer2_chain_create: bad type: %d",
3314 			      chain->bref.type);
3315 			/* NOT REACHED */
3316 			break;
3317 		}
3318 	} else {
3319 		/*
3320 		 * When reconnecting a chain we must set UPDATE and
3321 		 * setflush so the flush recognizes that it must update
3322 		 * the bref in the parent.
3323 		 */
3324 		if ((chain->flags & HAMMER2_CHAIN_UPDATE) == 0)
3325 			atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
3326 	}
3327 
3328 	/*
3329 	 * We must setflush(parent) to ensure that it recurses through to
3330 	 * chain.  setflush(chain) might not work because ONFLUSH is possibly
3331 	 * already set in the chain (so it won't recurse up to set it in the
3332 	 * parent).
3333 	 */
3334 	hammer2_chain_setflush(parent);
3335 
3336 done:
3337 	*chainp = chain;
3338 
3339 	return (error);
3340 }
3341 
3342 /*
3343  * Move the chain from its old parent to a new parent.  The chain must have
3344  * already been deleted or already disconnected (or never associated) with
3345  * a parent.  The chain is reassociated with the new parent and the deleted
3346  * flag will be cleared (no longer deleted).  The chain's modification state
3347  * is not altered.
3348  *
3349  * THE CALLER MUST HAVE ALREADY PROPERLY SEEKED (parent) TO THE INSERTION
3350  * POINT SANS ANY REQUIRED INDIRECT BLOCK CREATIONS DUE TO THE ARRAY BEING
3351  * FULL.  This typically means that the caller is creating the chain after
3352  * doing a hammer2_chain_lookup().
3353  *
3354  * A non-NULL bref is typically passed when key and keybits must be overridden.
3355  * Note that hammer2_cluster_duplicate() *ONLY* uses the key and keybits fields
3356  * from a passed-in bref and uses the old chain's bref for everything else.
3357  *
3358  * Neither (parent) or (chain) can be errored.
3359  *
3360  * If (parent) is non-NULL then the chain is inserted under the parent.
3361  *
3362  * If (parent) is NULL then the newly duplicated chain is not inserted
3363  * anywhere, similar to if it had just been chain_alloc()'d (suitable for
3364  * passing into hammer2_chain_create() after this function returns).
3365  *
3366  * WARNING! This function calls create which means it can insert indirect
3367  *	    blocks.  This can cause other unrelated chains in the parent to
3368  *	    be moved to a newly inserted indirect block in addition to the
3369  *	    specific chain.
3370  */
3371 void
3372 hammer2_chain_rename(hammer2_blockref_t *bref,
3373 		     hammer2_chain_t **parentp, hammer2_chain_t *chain,
3374 		     hammer2_tid_t mtid, int flags)
3375 {
3376 	hammer2_dev_t *hmp;
3377 	hammer2_chain_t *parent;
3378 	size_t bytes;
3379 
3380 	/*
3381 	 * WARNING!  We should never resolve DATA to device buffers
3382 	 *	     (XXX allow it if the caller did?), and since
3383 	 *	     we currently do not have the logical buffer cache
3384 	 *	     buffer in-hand to fix its cached physical offset
3385 	 *	     we also force the modify code to not COW it. XXX
3386 	 */
3387 	hmp = chain->hmp;
3388 	KKASSERT(chain->parent == NULL);
3389 	KKASSERT(chain->error == 0);
3390 
3391 	/*
3392 	 * Now create a duplicate of the chain structure, associating
3393 	 * it with the same core, making it the same size, pointing it
3394 	 * to the same bref (the same media block).
3395 	 *
3396 	 * NOTE: Handle special radix == 0 case (means 0 bytes).
3397 	 */
3398 	if (bref == NULL)
3399 		bref = &chain->bref;
3400 	bytes = (size_t)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
3401 	if (bytes)
3402 		bytes = (hammer2_off_t)1 << bytes;
3403 
3404 	/*
3405 	 * If parent is not NULL the duplicated chain will be entered under
3406 	 * the parent and the UPDATE bit set to tell flush to update
3407 	 * the blockref.
3408 	 *
3409 	 * We must setflush(parent) to ensure that it recurses through to
3410 	 * chain.  setflush(chain) might not work because ONFLUSH is possibly
3411 	 * already set in the chain (so it won't recurse up to set it in the
3412 	 * parent).
3413 	 *
3414 	 * Having both chains locked is extremely important for atomicy.
3415 	 */
3416 	if (parentp && (parent = *parentp) != NULL) {
3417 		KKASSERT(hammer2_mtx_owned(&parent->lock));
3418 		KKASSERT(parent->refs > 0);
3419 		KKASSERT(parent->error == 0);
3420 
3421 		hammer2_chain_create(parentp, &chain,
3422 				     chain->pmp, HAMMER2_METH_DEFAULT,
3423 				     bref->key, bref->keybits, bref->type,
3424 				     chain->bytes, mtid, 0, flags);
3425 		KKASSERT(chain->flags & HAMMER2_CHAIN_UPDATE);
3426 		hammer2_chain_setflush(*parentp);
3427 	}
3428 }
3429 
3430 /*
3431  * Helper function for deleting chains.
3432  *
3433  * The chain is removed from the live view (the RBTREE) as well as the parent's
3434  * blockmap.  Both chain and its parent must be locked.
3435  *
3436  * parent may not be errored.  chain can be errored.
3437  */
3438 static int
3439 _hammer2_chain_delete_helper(hammer2_chain_t *parent, hammer2_chain_t *chain,
3440 			     hammer2_tid_t mtid, int flags)
3441 {
3442 	hammer2_dev_t *hmp;
3443 	int error = 0;
3444 
3445 	KKASSERT((chain->flags & (HAMMER2_CHAIN_DELETED |
3446 				  HAMMER2_CHAIN_FICTITIOUS)) == 0);
3447 	KKASSERT(chain->parent == parent);
3448 	hmp = chain->hmp;
3449 
3450 	if (chain->flags & HAMMER2_CHAIN_BMAPPED) {
3451 		/*
3452 		 * Chain is blockmapped, so there must be a parent.
3453 		 * Atomically remove the chain from the parent and remove
3454 		 * the blockmap entry.  The parent must be set modified
3455 		 * to remove the blockmap entry.
3456 		 */
3457 		hammer2_blockref_t *base;
3458 		int count;
3459 
3460 		KKASSERT(parent != NULL);
3461 		KKASSERT(parent->error == 0);
3462 		KKASSERT((parent->flags & HAMMER2_CHAIN_INITIAL) == 0);
3463 		error = hammer2_chain_modify(parent, mtid, 0, 0);
3464 		if (error)
3465 			goto done;
3466 
3467 		/*
3468 		 * Calculate blockmap pointer
3469 		 */
3470 		KKASSERT(chain->flags & HAMMER2_CHAIN_ONRBTREE);
3471 		hammer2_spin_ex(&chain->core.spin);
3472 		hammer2_spin_ex(&parent->core.spin);
3473 
3474 		atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
3475 		atomic_add_int(&parent->core.live_count, -1);
3476 		++parent->core.generation;
3477 		RB_REMOVE(hammer2_chain_tree, &parent->core.rbtree, chain);
3478 		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
3479 		--parent->core.chain_count;
3480 		chain->parent = NULL;
3481 
3482 		switch(parent->bref.type) {
3483 		case HAMMER2_BREF_TYPE_INODE:
3484 			/*
3485 			 * Access the inode's block array.  However, there
3486 			 * is no block array if the inode is flagged
3487 			 * DIRECTDATA.
3488 			 */
3489 			if (parent->data &&
3490 			    (parent->data->ipdata.meta.op_flags &
3491 			     HAMMER2_OPFLAG_DIRECTDATA) == 0) {
3492 				base =
3493 				   &parent->data->ipdata.u.blockset.blockref[0];
3494 			} else {
3495 				base = NULL;
3496 			}
3497 			count = HAMMER2_SET_COUNT;
3498 			break;
3499 		case HAMMER2_BREF_TYPE_INDIRECT:
3500 		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
3501 			if (parent->data)
3502 				base = &parent->data->npdata[0];
3503 			else
3504 				base = NULL;
3505 			count = parent->bytes / sizeof(hammer2_blockref_t);
3506 			break;
3507 		case HAMMER2_BREF_TYPE_VOLUME:
3508 			base = &parent->data->voldata.
3509 					sroot_blockset.blockref[0];
3510 			count = HAMMER2_SET_COUNT;
3511 			break;
3512 		case HAMMER2_BREF_TYPE_FREEMAP:
3513 			base = &parent->data->blkset.blockref[0];
3514 			count = HAMMER2_SET_COUNT;
3515 			break;
3516 		default:
3517 			base = NULL;
3518 			count = 0;
3519 			panic("hammer2_flush_pass2: "
3520 			      "unrecognized blockref type: %d",
3521 			      parent->bref.type);
3522 		}
3523 
3524 		/*
3525 		 * delete blockmapped chain from its parent.
3526 		 *
3527 		 * The parent is not affected by any statistics in chain
3528 		 * which are pending synchronization.  That is, there is
3529 		 * nothing to undo in the parent since they have not yet
3530 		 * been incorporated into the parent.
3531 		 *
3532 		 * The parent is affected by statistics stored in inodes.
3533 		 * Those have already been synchronized, so they must be
3534 		 * undone.  XXX split update possible w/delete in middle?
3535 		 */
3536 		if (base) {
3537 			hammer2_base_delete(parent, base, count, chain);
3538 		}
3539 		hammer2_spin_unex(&parent->core.spin);
3540 		hammer2_spin_unex(&chain->core.spin);
3541 	} else if (chain->flags & HAMMER2_CHAIN_ONRBTREE) {
3542 		/*
3543 		 * Chain is not blockmapped but a parent is present.
3544 		 * Atomically remove the chain from the parent.  There is
3545 		 * no blockmap entry to remove.
3546 		 *
3547 		 * Because chain was associated with a parent but not
3548 		 * synchronized, the chain's *_count_up fields contain
3549 		 * inode adjustment statistics which must be undone.
3550 		 */
3551 		hammer2_spin_ex(&chain->core.spin);
3552 		hammer2_spin_ex(&parent->core.spin);
3553 		atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
3554 		atomic_add_int(&parent->core.live_count, -1);
3555 		++parent->core.generation;
3556 		RB_REMOVE(hammer2_chain_tree, &parent->core.rbtree, chain);
3557 		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
3558 		--parent->core.chain_count;
3559 		chain->parent = NULL;
3560 		hammer2_spin_unex(&parent->core.spin);
3561 		hammer2_spin_unex(&chain->core.spin);
3562 	} else {
3563 		/*
3564 		 * Chain is not blockmapped and has no parent.  This
3565 		 * is a degenerate case.
3566 		 */
3567 		atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
3568 	}
3569 done:
3570 	return error;
3571 }
3572 
3573 /*
3574  * Create an indirect block that covers one or more of the elements in the
3575  * current parent.  Either returns the existing parent with no locking or
3576  * ref changes or returns the new indirect block locked and referenced
3577  * and leaving the original parent lock/ref intact as well.
3578  *
3579  * If an error occurs, NULL is returned and *errorp is set to the H2 error.
3580  *
3581  * The returned chain depends on where the specified key falls.
3582  *
 * The key/keybits for the indirect mode only needs to follow four rules:
3584  *
3585  * (1) That all elements underneath it fit within its key space and
3586  *
3587  * (2) That all elements outside it are outside its key space.
3588  *
3589  * (3) When creating the new indirect block any elements in the current
3590  *     parent that fit within the new indirect block's keyspace must be
3591  *     moved into the new indirect block.
3592  *
3593  * (4) The keyspace chosen for the inserted indirect block CAN cover a wider
 *     keyspace than the current parent, but lookup/iteration rules will
3595  *     ensure (and must ensure) that rule (2) for all parents leading up
3596  *     to the nearest inode or the root volume header is adhered to.  This
3597  *     is accomplished by always recursing through matching keyspaces in
3598  *     the hammer2_chain_lookup() and hammer2_chain_next() API.
3599  *
3600  * The current implementation calculates the current worst-case keyspace by
3601  * iterating the current parent and then divides it into two halves, choosing
3602  * whichever half has the most elements (not necessarily the half containing
3603  * the requested key).
3604  *
3605  * We can also opt to use the half with the least number of elements.  This
3606  * causes lower-numbered keys (aka logical file offsets) to recurse through
3607  * fewer indirect blocks and higher-numbered keys to recurse through more.
3608  * This also has the risk of not moving enough elements to the new indirect
3609  * block and being forced to create several indirect blocks before the element
3610  * can be inserted.
3611  *
3612  * Must be called with an exclusively locked parent.
3613  *
3614  * NOTE: *errorp set to HAMMER_ERROR_* flags
3615  */
3616 static int hammer2_chain_indkey_freemap(hammer2_chain_t *parent,
3617 				hammer2_key_t *keyp, int keybits,
3618 				hammer2_blockref_t *base, int count);
3619 static int hammer2_chain_indkey_file(hammer2_chain_t *parent,
3620 				hammer2_key_t *keyp, int keybits,
3621 				hammer2_blockref_t *base, int count,
3622 				int ncount);
3623 static int hammer2_chain_indkey_dir(hammer2_chain_t *parent,
3624 				hammer2_key_t *keyp, int keybits,
3625 				hammer2_blockref_t *base, int count,
3626 				int ncount);
3627 static
3628 hammer2_chain_t *
3629 hammer2_chain_create_indirect(hammer2_chain_t *parent,
3630 			      hammer2_key_t create_key, int create_bits,
3631 			      hammer2_tid_t mtid, int for_type, int *errorp)
3632 {
3633 	hammer2_dev_t *hmp;
3634 	hammer2_blockref_t *base;
3635 	hammer2_blockref_t *bref;
3636 	hammer2_blockref_t bcopy;
3637 	hammer2_chain_t *chain;
3638 	hammer2_chain_t *ichain;
3639 	hammer2_chain_t dummy;
3640 	hammer2_key_t key = create_key;
3641 	hammer2_key_t key_beg;
3642 	hammer2_key_t key_end;
3643 	hammer2_key_t key_next;
3644 	int keybits = create_bits;
3645 	int count;
3646 	int ncount;
3647 	int nbytes;
3648 	int loops;
3649 	int error;
3650 	int reason;
3651 	int generation;
3652 	int maxloops = 300000;
3653 
3654 	/*
3655 	 * Calculate the base blockref pointer or NULL if the chain
3656 	 * is known to be empty.  We need to calculate the array count
3657 	 * for RB lookups either way.
3658 	 */
3659 	hmp = parent->hmp;
3660 	KKASSERT(hammer2_mtx_owned(&parent->lock));
3661 
3662 	/*
3663 	 * Pre-modify the parent now to avoid having to deal with error
3664 	 * processing if we tried to later (in the middle of our loop).
3665 	 */
3666 	*errorp = hammer2_chain_modify(parent, mtid, 0, 0);
3667 	if (*errorp) {
3668 		kprintf("hammer2_create_indirect: error %08x %s\n",
3669 			*errorp, hammer2_error_str(*errorp));
3670 		return NULL;
3671 	}
3672 
3673 	/*hammer2_chain_modify(&parent, HAMMER2_MODIFY_OPTDATA);*/
3674 	base = hammer2_chain_base_and_count(parent, &count);
3675 
3676 	/*
3677 	 * dummy used in later chain allocation (no longer used for lookups).
3678 	 */
3679 	bzero(&dummy, sizeof(dummy));
3680 
3681 	/*
3682 	 * How big should our new indirect block be?  It has to be at least
3683 	 * as large as its parent for splits to work properly.
3684 	 *
3685 	 * The freemap uses a specific indirect block size.  The number of
3686 	 * levels are built dynamically and ultimately depend on the size
3687 	 * volume.  Because freemap blocks are taken from the reserved areas
3688 	 * of the volume our goal is efficiency (fewer levels) and not so
3689 	 * much to save disk space.
3690 	 *
3691 	 * The first indirect block level for a directory usually uses
3692 	 * HAMMER2_IND_BYTES_MIN (4KB = 32 directory entries).  Due to
3693 	 * the hash mechanism, this typically gives us a nominal
3694 	 * 32 * 4 entries with one level of indirection.
3695 	 *
3696 	 * We use HAMMER2_IND_BYTES_NOM (16KB = 128 blockrefs) for FILE
3697 	 * indirect blocks.  The initial 4 entries in the inode gives us
3698 	 * 256KB.  Up to 4 indirect blocks gives us 32MB.  Three levels
3699 	 * of indirection gives us 137GB, and so forth.  H2 can support
3700 	 * huge file sizes but they are not typical, so we try to stick
3701 	 * with compactness and do not use a larger indirect block size.
3702 	 *
3703 	 * We could use 64KB (PBUFSIZE), giving us 512 blockrefs, but
3704 	 * due to the way indirect blocks are created this usually winds
3705 	 * up being extremely inefficient for small files.  Even though
3706 	 * 16KB requires more levels of indirection for very large files,
3707 	 * the 16KB records can be ganged together into 64KB DIOs.
3708 	 */
3709 	if (for_type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
3710 	    for_type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
3711 		nbytes = HAMMER2_FREEMAP_LEVELN_PSIZE;
3712 	} else if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) {
3713 		if (parent->data->ipdata.meta.type ==
3714 		    HAMMER2_OBJTYPE_DIRECTORY)
3715 			nbytes = HAMMER2_IND_BYTES_MIN;	/* 4KB = 32 entries */
3716 		else
3717 			nbytes = HAMMER2_IND_BYTES_NOM;	/* 16KB = ~8MB file */
3718 
3719 	} else {
3720 		nbytes = HAMMER2_IND_BYTES_NOM;
3721 	}
3722 	if (nbytes < count * sizeof(hammer2_blockref_t)) {
3723 		KKASSERT(for_type != HAMMER2_BREF_TYPE_FREEMAP_NODE &&
3724 			 for_type != HAMMER2_BREF_TYPE_FREEMAP_LEAF);
3725 		nbytes = count * sizeof(hammer2_blockref_t);
3726 	}
3727 	ncount = nbytes / sizeof(hammer2_blockref_t);
3728 
3729 	/*
3730 	 * When creating an indirect block for a freemap node or leaf
3731 	 * the key/keybits must be fitted to static radix levels because
3732 	 * particular radix levels use particular reserved blocks in the
3733 	 * related zone.
3734 	 *
3735 	 * This routine calculates the key/radix of the indirect block
3736 	 * we need to create, and whether it is on the high-side or the
3737 	 * low-side.
3738 	 */
3739 	switch(for_type) {
3740 	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
3741 	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
3742 		keybits = hammer2_chain_indkey_freemap(parent, &key, keybits,
3743 						       base, count);
3744 		break;
3745 	case HAMMER2_BREF_TYPE_DATA:
3746 		keybits = hammer2_chain_indkey_file(parent, &key, keybits,
3747 						    base, count, ncount);
3748 		break;
3749 	case HAMMER2_BREF_TYPE_DIRENT:
3750 	case HAMMER2_BREF_TYPE_INODE:
3751 		keybits = hammer2_chain_indkey_dir(parent, &key, keybits,
3752 						   base, count, ncount);
3753 		break;
3754 	default:
3755 		panic("illegal indirect block for bref type %d", for_type);
3756 		break;
3757 	}
3758 
3759 	/*
3760 	 * Normalize the key for the radix being represented, keeping the
3761 	 * high bits and throwing away the low bits.
3762 	 */
3763 	key &= ~(((hammer2_key_t)1 << keybits) - 1);
3764 
3765 	/*
3766 	 * Ok, create our new indirect block
3767 	 */
3768 	if (for_type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
3769 	    for_type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
3770 		dummy.bref.type = HAMMER2_BREF_TYPE_FREEMAP_NODE;
3771 	} else {
3772 		dummy.bref.type = HAMMER2_BREF_TYPE_INDIRECT;
3773 	}
3774 	dummy.bref.key = key;
3775 	dummy.bref.keybits = keybits;
3776 	dummy.bref.data_off = hammer2_getradix(nbytes);
3777 	dummy.bref.methods =
3778 		HAMMER2_ENC_CHECK(HAMMER2_DEC_CHECK(parent->bref.methods)) |
3779 		HAMMER2_ENC_COMP(HAMMER2_COMP_NONE);
3780 
3781 	ichain = hammer2_chain_alloc(hmp, parent->pmp, &dummy.bref);
3782 	atomic_set_int(&ichain->flags, HAMMER2_CHAIN_INITIAL);
3783 	hammer2_chain_lock(ichain, HAMMER2_RESOLVE_MAYBE);
3784 	/* ichain has one ref at this point */
3785 
3786 	/*
3787 	 * We have to mark it modified to allocate its block, but use
3788 	 * OPTDATA to allow it to remain in the INITIAL state.  Otherwise
3789 	 * it won't be acted upon by the flush code.
3790 	 */
3791 	*errorp = hammer2_chain_modify(ichain, mtid, 0, HAMMER2_MODIFY_OPTDATA);
3792 	if (*errorp) {
3793 		kprintf("hammer2_alloc_indirect: error %08x %s\n",
3794 			*errorp, hammer2_error_str(*errorp));
3795 		hammer2_chain_unlock(ichain);
3796 		hammer2_chain_drop(ichain);
3797 		return NULL;
3798 	}
3799 
3800 	/*
3801 	 * Iterate the original parent and move the matching brefs into
3802 	 * the new indirect block.
3803 	 *
3804 	 * XXX handle flushes.
3805 	 */
3806 	key_beg = 0;
3807 	key_end = HAMMER2_KEY_MAX;
3808 	key_next = 0;	/* avoid gcc warnings */
3809 	hammer2_spin_ex(&parent->core.spin);
3810 	loops = 0;
3811 	reason = 0;
3812 
3813 	for (;;) {
3814 		/*
3815 		 * Parent may have been modified, relocating its block array.
3816 		 * Reload the base pointer.
3817 		 */
3818 		base = hammer2_chain_base_and_count(parent, &count);
3819 
3820 		if (++loops > 100000) {
3821 		    hammer2_spin_unex(&parent->core.spin);
3822 		    panic("excessive loops r=%d p=%p base/count %p:%d %016jx\n",
3823 			  reason, parent, base, count, key_next);
3824 		}
3825 
3826 		/*
3827 		 * NOTE: spinlock stays intact, returned chain (if not NULL)
3828 		 *	 is not referenced or locked which means that we
3829 		 *	 cannot safely check its flagged / deletion status
3830 		 *	 until we lock it.
3831 		 */
3832 		chain = hammer2_combined_find(parent, base, count,
3833 					      &key_next,
3834 					      key_beg, key_end,
3835 					      &bref);
3836 		generation = parent->core.generation;
3837 		if (bref == NULL)
3838 			break;
3839 		key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);
3840 
3841 		/*
3842 		 * Skip keys that are not within the key/radix of the new
3843 		 * indirect block.  They stay in the parent.
3844 		 */
3845 		if ((~(((hammer2_key_t)1 << keybits) - 1) &
3846 		    (key ^ bref->key)) != 0) {
3847 			goto next_key_spinlocked;
3848 		}
3849 
3850 		/*
3851 		 * Load the new indirect block by acquiring the related
3852 		 * chains (potentially from media as it might not be
3853 		 * in-memory).  Then move it to the new parent (ichain).
3854 		 *
3855 		 * chain is referenced but not locked.  We must lock the
3856 		 * chain to obtain definitive state.
3857 		 */
3858 		if (chain) {
3859 			/*
3860 			 * Use chain already present in the RBTREE
3861 			 */
3862 			hammer2_chain_ref(chain);
3863 			hammer2_spin_unex(&parent->core.spin);
3864 			hammer2_chain_lock(chain, HAMMER2_RESOLVE_NEVER);
3865 		} else {
3866 			/*
3867 			 * Get chain for blockref element.  _get returns NULL
3868 			 * on insertion race.
3869 			 */
3870 			bcopy = *bref;
3871 			hammer2_spin_unex(&parent->core.spin);
3872 			chain = hammer2_chain_get(parent, generation, &bcopy);
3873 			if (chain == NULL) {
3874 				reason = 1;
3875 				hammer2_spin_ex(&parent->core.spin);
3876 				continue;
3877 			}
3878 			hammer2_chain_lock(chain, HAMMER2_RESOLVE_NEVER);
3879 			if (bcmp(&bcopy, bref, sizeof(bcopy))) {
3880 				reason = 2;
3881 				hammer2_chain_unlock(chain);
3882 				hammer2_chain_drop(chain);
3883 				hammer2_spin_ex(&parent->core.spin);
3884 				continue;
3885 			}
3886 		}
3887 
3888 		/*
3889 		 * This is always live so if the chain has been deleted
3890 		 * we raced someone and we have to retry.
3891 		 *
3892 		 * NOTE: Lookups can race delete-duplicate because
3893 		 *	 delete-duplicate does not lock the parent's core
3894 		 *	 (they just use the spinlock on the core).
3895 		 *
3896 		 *	 (note reversed logic for this one)
3897 		 */
3898 		if (chain->parent != parent ||
3899 		    (chain->flags & HAMMER2_CHAIN_DELETED)) {
3900 			hammer2_chain_unlock(chain);
3901 			hammer2_chain_drop(chain);
3902 			kprintf("hammer2_chain_create_indirect "
3903 				"RETRY (%p,%p)->%p %08x\n",
3904 				parent, chain->parent, chain, chain->flags);
3905 			hammer2_spin_ex(&parent->core.spin);
3906 			continue;
3907 		}
3908 
3909 		/*
3910 		 * Shift the chain to the indirect block.
3911 		 *
3912 		 * WARNING! No reason for us to load chain data, pass NOSTATS
3913 		 *	    to prevent delete/insert from trying to access
3914 		 *	    inode stats (and thus asserting if there is no
3915 		 *	    chain->data loaded).
3916 		 *
3917 		 * WARNING! The (parent, chain) deletion may modify the parent
3918 		 *	    and invalidate the base pointer.
3919 		 *
3920 		 * WARNING! Parent must already be marked modified, so we
3921 		 *	    can assume that chain_delete always suceeds.
3922 		 */
3923 		error = hammer2_chain_delete(parent, chain, mtid, 0);
3924 		KKASSERT(error == 0);
3925 		hammer2_chain_rename(NULL, &ichain, chain, mtid, 0);
3926 		hammer2_chain_unlock(chain);
3927 		hammer2_chain_drop(chain);
3928 		KKASSERT(parent->refs > 0);
3929 		chain = NULL;
3930 		base = NULL;	/* safety */
3931 		hammer2_spin_ex(&parent->core.spin);
3932 next_key_spinlocked:
3933 		if (--maxloops == 0)
3934 			panic("hammer2_chain_create_indirect: maxloops");
3935 		reason = 4;
3936 		if (key_next == 0 || key_next > key_end)
3937 			break;
3938 		key_beg = key_next;
3939 		/* loop */
3940 	}
3941 	hammer2_spin_unex(&parent->core.spin);
3942 
3943 	/*
3944 	 * Insert the new indirect block into the parent now that we've
3945 	 * cleared out some entries in the parent.  We calculated a good
3946 	 * insertion index in the loop above (ichain->index).
3947 	 *
3948 	 * We don't have to set UPDATE here because we mark ichain
3949 	 * modified down below (so the normal modified -> flush -> set-moved
3950 	 * sequence applies).
3951 	 *
3952 	 * The insertion shouldn't race as this is a completely new block
3953 	 * and the parent is locked.
3954 	 */
3955 	base = NULL;	/* safety, parent modify may change address */
3956 	KKASSERT((ichain->flags & HAMMER2_CHAIN_ONRBTREE) == 0);
3957 	KKASSERT(parent->core.live_count < count);
3958 	hammer2_chain_insert(parent, ichain,
3959 			     HAMMER2_CHAIN_INSERT_SPIN |
3960 			     HAMMER2_CHAIN_INSERT_LIVE,
3961 			     0);
3962 
3963 	/*
	 * Make sure flushes propagate after our manual insertion.
3965 	 */
3966 	hammer2_chain_setflush(ichain);
3967 	hammer2_chain_setflush(parent);
3968 
3969 	/*
3970 	 * Figure out what to return.
3971 	 */
3972 	if (~(((hammer2_key_t)1 << keybits) - 1) &
3973 		   (create_key ^ key)) {
3974 		/*
3975 		 * Key being created is outside the key range,
3976 		 * return the original parent.
3977 		 */
3978 		hammer2_chain_unlock(ichain);
3979 		hammer2_chain_drop(ichain);
3980 	} else {
3981 		/*
3982 		 * Otherwise its in the range, return the new parent.
3983 		 * (leave both the new and old parent locked).
3984 		 */
3985 		parent = ichain;
3986 	}
3987 
3988 	return(parent);
3989 }
3990 
3991 /*
3992  * Do maintenance on an indirect chain.  Both parent and chain are locked.
3993  *
3994  * Returns non-zero if (chain) is deleted, either due to being empty or
3995  * because its children were safely moved into the parent.
3996  */
int
hammer2_chain_indirect_maintenance(hammer2_chain_t *parent,
				   hammer2_chain_t *chain)
{
	hammer2_blockref_t *chain_base;
	hammer2_blockref_t *base;
	hammer2_blockref_t *bref;
	hammer2_blockref_t bcopy;
	hammer2_key_t key_next;
	hammer2_key_t key_beg;
	hammer2_key_t key_end;
	hammer2_chain_t *sub;
	int chain_count;
	int count;
	int error;
	int generation;

	/*
	 * Make sure we have an accurate live_count.  Counting is skipped
	 * when the chain is INITIAL (no media data to scan yet) or has
	 * already been counted (COUNTEDBREFS).
	 */
	if ((chain->flags & (HAMMER2_CHAIN_INITIAL |
			     HAMMER2_CHAIN_COUNTEDBREFS)) == 0) {
		base = &chain->data->npdata[0];
		count = chain->bytes / sizeof(hammer2_blockref_t);
		hammer2_chain_countbrefs(chain, base, count);
	}

	/*
	 * If the indirect block is empty we can delete it.
	 * (ignore deletion error)
	 */
	if (chain->core.live_count == 0 && RB_EMPTY(&chain->core.rbtree)) {
		hammer2_chain_delete(parent, chain,
				     chain->bref.modify_tid,
				     HAMMER2_DELETE_PERMANENT);
		return 1;
	}

	base = hammer2_chain_base_and_count(parent, &count);

	if ((parent->flags & (HAMMER2_CHAIN_INITIAL |
			     HAMMER2_CHAIN_COUNTEDBREFS)) == 0) {
		hammer2_chain_countbrefs(parent, base, count);
	}

	/*
	 * Determine if we can collapse chain into parent, calculate
	 * hysteresis for chain emptiness.
	 *
	 * First test: parent must have room for chain's children once
	 * chain's own blockref is removed (hence the -1).  Second test:
	 * only collapse chains which are at most 3/4 full, as hysteresis
	 * against repeated collapse/split cycles.
	 */
	if (parent->core.live_count + chain->core.live_count - 1 > count)
		return 0;
	chain_count = chain->bytes / sizeof(hammer2_blockref_t);
	if (chain->core.live_count > chain_count * 3 / 4)
		return 0;

	/*
	 * Ok, theoretically we can collapse chain's contents into
	 * parent.  chain is locked, but any in-memory children of chain
	 * are not.  For this to work, we must be able to dispose of any
	 * in-memory children of chain.
	 *
	 * For now require that there are no in-memory children of chain.
	 *
	 * WARNING! Both chain and parent must remain locked across this
	 *	    entire operation.
	 */

	/*
	 * Parent must be marked modified.  Don't try to collapse it if we
	 * can't mark it modified.  Once modified, destroy chain to make room
	 * and to get rid of what will be a conflicting key (this is included
	 * in the calculation above).  Finally, move the children of chain
	 * into chain's parent.
	 *
	 * This order creates an accounting problem for bref.embed.stats
	 * because we destroy chain before we remove its children.  Any
	 * elements whose blockref is already synchronized will be counted
	 * twice.  To deal with the problem we clean out chain's stats prior
	 * to deleting it.
	 */
	error = hammer2_chain_modify(parent, 0, 0, 0);
	if (error) {
		krateprintf(&krate_h2me, "hammer2: indirect_maint: %s\n",
			    hammer2_error_str(error));
		return 0;
	}
	error = hammer2_chain_modify(chain, chain->bref.modify_tid, 0, 0);
	if (error) {
		krateprintf(&krate_h2me, "hammer2: indirect_maint: %s\n",
			    hammer2_error_str(error));
		return 0;
	}

	/* zero stats prior to deletion to avoid double-count (see above) */
	chain->bref.embed.stats.inode_count = 0;
	chain->bref.embed.stats.data_count = 0;
	error = hammer2_chain_delete(parent, chain,
				     chain->bref.modify_tid,
				     HAMMER2_DELETE_PERMANENT);
	KKASSERT(error == 0);	/* parent already modified, cannot fail */

	/*
	 * The combined_find call requires core.spin to be held.  One would
	 * think there wouldn't be any conflicts since we hold chain
	 * exclusively locked, but the caching mechanism for 0-ref children
	 * does not require a chain lock.
	 */
	hammer2_spin_ex(&chain->core.spin);

	key_next = 0;
	key_beg = 0;
	key_end = HAMMER2_KEY_MAX;
	for (;;) {
		/*
		 * Reload base/count each pass; prior iterations may have
		 * modified chain and relocated its block array.
		 */
		chain_base = &chain->data->npdata[0];
		chain_count = chain->bytes / sizeof(hammer2_blockref_t);
		sub = hammer2_combined_find(chain, chain_base, chain_count,
					    &key_next,
					    key_beg, key_end,
					    &bref);
		generation = chain->core.generation;
		if (bref == NULL)
			break;
		key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);

		if (sub) {
			/*
			 * In-memory child.  Ref it, drop the spinlock to
			 * obtain the chain lock, then recheck that it did
			 * not race a deletion or reparenting.  Retry on
			 * any race.
			 */
			hammer2_chain_ref(sub);
			hammer2_spin_unex(&chain->core.spin);
			hammer2_chain_lock(sub, HAMMER2_RESOLVE_NEVER);
			if (sub->parent != chain ||
			    (sub->flags & HAMMER2_CHAIN_DELETED)) {
				hammer2_chain_unlock(sub);
				hammer2_chain_drop(sub);
				hammer2_spin_ex(&chain->core.spin);
				continue;
			}
		} else {
			/*
			 * Media blockref only.  Instantiate a chain for it.
			 * hammer2_chain_get() returns NULL on an insertion
			 * race; the bcmp detects the blockref changing
			 * underneath us.  Retry in either case.
			 */
			bcopy = *bref;
			hammer2_spin_unex(&chain->core.spin);
			sub = hammer2_chain_get(chain, generation, &bcopy);
			if (sub == NULL) {
				hammer2_spin_ex(&chain->core.spin);
				continue;
			}
			hammer2_chain_lock(sub, HAMMER2_RESOLVE_NEVER);
			if (bcmp(&bcopy, bref, sizeof(bcopy)) != 0) {
				hammer2_chain_unlock(sub);
				hammer2_chain_drop(sub);
				hammer2_spin_ex(&chain->core.spin);
				continue;
			}
		}

		/* move the child from chain into chain's parent */
		error = hammer2_chain_delete(chain, sub,
					     sub->bref.modify_tid, 0);
		KKASSERT(error == 0);
		hammer2_chain_rename(NULL, &parent, sub,
				     sub->bref.modify_tid,
				     HAMMER2_INSERT_SAMEPARENT);
		hammer2_chain_unlock(sub);
		hammer2_chain_drop(sub);
		hammer2_spin_ex(&chain->core.spin);

		if (key_next == 0)
			break;
		key_beg = key_next;
	}
	hammer2_spin_unex(&chain->core.spin);

	return 1;
}
4165 
4166 /*
4167  * Freemap indirect blocks
4168  *
4169  * Calculate the keybits and highside/lowside of the freemap node the
4170  * caller is creating.
4171  *
4172  * This routine will specify the next higher-level freemap key/radix
4173  * representing the lowest-ordered set.  By doing so, eventually all
4174  * low-ordered sets will be moved one level down.
4175  *
4176  * We have to be careful here because the freemap reserves a limited
4177  * number of blocks for a limited number of levels.  So we can't just
4178  * push indiscriminately.
4179  */
4180 int
4181 hammer2_chain_indkey_freemap(hammer2_chain_t *parent, hammer2_key_t *keyp,
4182 			     int keybits, hammer2_blockref_t *base, int count)
4183 {
4184 	hammer2_chain_t *chain;
4185 	hammer2_blockref_t *bref;
4186 	hammer2_key_t key;
4187 	hammer2_key_t key_beg;
4188 	hammer2_key_t key_end;
4189 	hammer2_key_t key_next;
4190 	int locount;
4191 	int hicount;
4192 	int maxloops = 300000;
4193 
4194 	key = *keyp;
4195 	locount = 0;
4196 	hicount = 0;
4197 	keybits = 64;
4198 
4199 	/*
4200 	 * Calculate the range of keys in the array being careful to skip
4201 	 * slots which are overridden with a deletion.
4202 	 */
4203 	key_beg = 0;
4204 	key_end = HAMMER2_KEY_MAX;
4205 	hammer2_spin_ex(&parent->core.spin);
4206 
4207 	for (;;) {
4208 		if (--maxloops == 0) {
4209 			panic("indkey_freemap shit %p %p:%d\n",
4210 			      parent, base, count);
4211 		}
4212 		chain = hammer2_combined_find(parent, base, count,
4213 					      &key_next,
4214 					      key_beg, key_end,
4215 					      &bref);
4216 
4217 		/*
4218 		 * Exhausted search
4219 		 */
4220 		if (bref == NULL)
4221 			break;
4222 
4223 		/*
4224 		 * Skip deleted chains.
4225 		 */
4226 		if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
4227 			if (key_next == 0 || key_next > key_end)
4228 				break;
4229 			key_beg = key_next;
4230 			continue;
4231 		}
4232 
4233 		/*
4234 		 * Use the full live (not deleted) element for the scan
4235 		 * iteration.  HAMMER2 does not allow partial replacements.
4236 		 *
4237 		 * XXX should be built into hammer2_combined_find().
4238 		 */
4239 		key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);
4240 
4241 		if (keybits > bref->keybits) {
4242 			key = bref->key;
4243 			keybits = bref->keybits;
4244 		} else if (keybits == bref->keybits && bref->key < key) {
4245 			key = bref->key;
4246 		}
4247 		if (key_next == 0)
4248 			break;
4249 		key_beg = key_next;
4250 	}
4251 	hammer2_spin_unex(&parent->core.spin);
4252 
4253 	/*
4254 	 * Return the keybits for a higher-level FREEMAP_NODE covering
4255 	 * this node.
4256 	 */
4257 	switch(keybits) {
4258 	case HAMMER2_FREEMAP_LEVEL0_RADIX:
4259 		keybits = HAMMER2_FREEMAP_LEVEL1_RADIX;
4260 		break;
4261 	case HAMMER2_FREEMAP_LEVEL1_RADIX:
4262 		keybits = HAMMER2_FREEMAP_LEVEL2_RADIX;
4263 		break;
4264 	case HAMMER2_FREEMAP_LEVEL2_RADIX:
4265 		keybits = HAMMER2_FREEMAP_LEVEL3_RADIX;
4266 		break;
4267 	case HAMMER2_FREEMAP_LEVEL3_RADIX:
4268 		keybits = HAMMER2_FREEMAP_LEVEL4_RADIX;
4269 		break;
4270 	case HAMMER2_FREEMAP_LEVEL4_RADIX:
4271 		keybits = HAMMER2_FREEMAP_LEVEL5_RADIX;
4272 		break;
4273 	case HAMMER2_FREEMAP_LEVEL5_RADIX:
4274 		panic("hammer2_chain_indkey_freemap: level too high");
4275 		break;
4276 	default:
4277 		panic("hammer2_chain_indkey_freemap: bad radix");
4278 		break;
4279 	}
4280 	*keyp = key;
4281 
4282 	return (keybits);
4283 }
4284 
4285 /*
4286  * File indirect blocks
4287  *
4288  * Calculate the key/keybits for the indirect block to create by scanning
4289  * existing keys.  The key being created is also passed in *keyp and can be
4290  * inside or outside the indirect block.  Regardless, the indirect block
4291  * must hold at least two keys in order to guarantee sufficient space.
4292  *
 * We use a modified version of the freemap's fixed radix tree, but tailored
4294  * for file data.  Basically we configure an indirect block encompassing the
4295  * smallest key.
4296  */
4297 static int
4298 hammer2_chain_indkey_file(hammer2_chain_t *parent, hammer2_key_t *keyp,
4299 			    int keybits, hammer2_blockref_t *base, int count,
4300 			    int ncount)
4301 {
4302 	hammer2_chain_t *chain;
4303 	hammer2_blockref_t *bref;
4304 	hammer2_key_t key;
4305 	hammer2_key_t key_beg;
4306 	hammer2_key_t key_end;
4307 	hammer2_key_t key_next;
4308 	int nradix;
4309 	int locount;
4310 	int hicount;
4311 	int maxloops = 300000;
4312 
4313 	key = *keyp;
4314 	locount = 0;
4315 	hicount = 0;
4316 	keybits = 64;
4317 
4318 	/*
4319 	 * Calculate the range of keys in the array being careful to skip
4320 	 * slots which are overridden with a deletion.
4321 	 *
4322 	 * Locate the smallest key.
4323 	 */
4324 	key_beg = 0;
4325 	key_end = HAMMER2_KEY_MAX;
4326 	hammer2_spin_ex(&parent->core.spin);
4327 
4328 	for (;;) {
4329 		if (--maxloops == 0) {
4330 			panic("indkey_freemap shit %p %p:%d\n",
4331 			      parent, base, count);
4332 		}
4333 		chain = hammer2_combined_find(parent, base, count,
4334 					      &key_next,
4335 					      key_beg, key_end,
4336 					      &bref);
4337 
4338 		/*
4339 		 * Exhausted search
4340 		 */
4341 		if (bref == NULL)
4342 			break;
4343 
4344 		/*
4345 		 * Skip deleted chains.
4346 		 */
4347 		if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
4348 			if (key_next == 0 || key_next > key_end)
4349 				break;
4350 			key_beg = key_next;
4351 			continue;
4352 		}
4353 
4354 		/*
4355 		 * Use the full live (not deleted) element for the scan
4356 		 * iteration.  HAMMER2 does not allow partial replacements.
4357 		 *
4358 		 * XXX should be built into hammer2_combined_find().
4359 		 */
4360 		key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);
4361 
4362 		if (keybits > bref->keybits) {
4363 			key = bref->key;
4364 			keybits = bref->keybits;
4365 		} else if (keybits == bref->keybits && bref->key < key) {
4366 			key = bref->key;
4367 		}
4368 		if (key_next == 0)
4369 			break;
4370 		key_beg = key_next;
4371 	}
4372 	hammer2_spin_unex(&parent->core.spin);
4373 
4374 	/*
4375 	 * Calculate the static keybits for a higher-level indirect block
4376 	 * that contains the key.
4377 	 */
4378 	*keyp = key;
4379 
4380 	switch(ncount) {
4381 	case HAMMER2_IND_BYTES_MIN / sizeof(hammer2_blockref_t):
4382 		nradix = HAMMER2_IND_RADIX_MIN - HAMMER2_BLOCKREF_RADIX;
4383 		break;
4384 	case HAMMER2_IND_BYTES_NOM / sizeof(hammer2_blockref_t):
4385 		nradix = HAMMER2_IND_RADIX_NOM - HAMMER2_BLOCKREF_RADIX;
4386 		break;
4387 	case HAMMER2_IND_BYTES_MAX / sizeof(hammer2_blockref_t):
4388 		nradix = HAMMER2_IND_RADIX_MAX - HAMMER2_BLOCKREF_RADIX;
4389 		break;
4390 	default:
4391 		panic("bad ncount %d\n", ncount);
4392 		nradix = 0;
4393 		break;
4394 	}
4395 
4396 	/*
4397 	 * The largest radix that can be returned for an indirect block is
4398 	 * 63 bits.  (The largest practical indirect block radix is actually
4399 	 * 62 bits because the top-level inode or volume root contains four
4400 	 * entries, but allow 63 to be returned).
4401 	 */
4402 	if (nradix >= 64)
4403 		nradix = 63;
4404 
4405 	return keybits + nradix;
4406 }
4407 
4408 #if 1
4409 
4410 /*
4411  * Directory indirect blocks.
4412  *
4413  * Covers both the inode index (directory of inodes), and directory contents
4414  * (filenames hardlinked to inodes).
4415  *
4416  * Because directory keys are hashed we generally try to cut the space in
 * half.  We accommodate the inode index (which tends to have linearly
4418  * increasing inode numbers) by ensuring that the keyspace is at least large
4419  * enough to fill up the indirect block being created.
4420  */
4421 static int
4422 hammer2_chain_indkey_dir(hammer2_chain_t *parent, hammer2_key_t *keyp,
4423 			 int keybits, hammer2_blockref_t *base, int count,
4424 			 int ncount)
4425 {
4426 	hammer2_blockref_t *bref;
4427 	hammer2_chain_t	*chain;
4428 	hammer2_key_t key_beg;
4429 	hammer2_key_t key_end;
4430 	hammer2_key_t key_next;
4431 	hammer2_key_t key;
4432 	int nkeybits;
4433 	int locount;
4434 	int hicount;
4435 	int maxloops = 300000;
4436 
4437 	/*
4438 	 * NOTE: We can't take a shortcut here anymore for inodes because
4439 	 *	 the root directory can contain a mix of inodes and directory
4440 	 *	 entries (we used to just return 63 if parent->bref.type was
4441 	 *	 HAMMER2_BREF_TYPE_INODE.
4442 	 */
4443 	key = *keyp;
4444 	locount = 0;
4445 	hicount = 0;
4446 
4447 	/*
4448 	 * Calculate the range of keys in the array being careful to skip
4449 	 * slots which are overridden with a deletion.
4450 	 */
4451 	key_beg = 0;
4452 	key_end = HAMMER2_KEY_MAX;
4453 	hammer2_spin_ex(&parent->core.spin);
4454 
4455 	for (;;) {
4456 		if (--maxloops == 0) {
4457 			panic("indkey_freemap shit %p %p:%d\n",
4458 			      parent, base, count);
4459 		}
4460 		chain = hammer2_combined_find(parent, base, count,
4461 					      &key_next,
4462 					      key_beg, key_end,
4463 					      &bref);
4464 
4465 		/*
4466 		 * Exhausted search
4467 		 */
4468 		if (bref == NULL)
4469 			break;
4470 
4471 		/*
4472 		 * Deleted object
4473 		 */
4474 		if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
4475 			if (key_next == 0 || key_next > key_end)
4476 				break;
4477 			key_beg = key_next;
4478 			continue;
4479 		}
4480 
4481 		/*
4482 		 * Use the full live (not deleted) element for the scan
4483 		 * iteration.  HAMMER2 does not allow partial replacements.
4484 		 *
4485 		 * XXX should be built into hammer2_combined_find().
4486 		 */
4487 		key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);
4488 
4489 		/*
4490 		 * Expand our calculated key range (key, keybits) to fit
4491 		 * the scanned key.  nkeybits represents the full range
4492 		 * that we will later cut in half (two halves @ nkeybits - 1).
4493 		 */
4494 		nkeybits = keybits;
4495 		if (nkeybits < bref->keybits) {
4496 			if (bref->keybits > 64) {
4497 				kprintf("bad bref chain %p bref %p\n",
4498 					chain, bref);
4499 				Debugger("fubar");
4500 			}
4501 			nkeybits = bref->keybits;
4502 		}
4503 		while (nkeybits < 64 &&
4504 		       (~(((hammer2_key_t)1 << nkeybits) - 1) &
4505 		        (key ^ bref->key)) != 0) {
4506 			++nkeybits;
4507 		}
4508 
4509 		/*
4510 		 * If the new key range is larger we have to determine
4511 		 * which side of the new key range the existing keys fall
4512 		 * under by checking the high bit, then collapsing the
4513 		 * locount into the hicount or vise-versa.
4514 		 */
4515 		if (keybits != nkeybits) {
4516 			if (((hammer2_key_t)1 << (nkeybits - 1)) & key) {
4517 				hicount += locount;
4518 				locount = 0;
4519 			} else {
4520 				locount += hicount;
4521 				hicount = 0;
4522 			}
4523 			keybits = nkeybits;
4524 		}
4525 
4526 		/*
4527 		 * The newly scanned key will be in the lower half or the
4528 		 * upper half of the (new) key range.
4529 		 */
4530 		if (((hammer2_key_t)1 << (nkeybits - 1)) & bref->key)
4531 			++hicount;
4532 		else
4533 			++locount;
4534 
4535 		if (key_next == 0)
4536 			break;
4537 		key_beg = key_next;
4538 	}
4539 	hammer2_spin_unex(&parent->core.spin);
4540 	bref = NULL;	/* now invalid (safety) */
4541 
4542 	/*
4543 	 * Adjust keybits to represent half of the full range calculated
4544 	 * above (radix 63 max) for our new indirect block.
4545 	 */
4546 	--keybits;
4547 
4548 	/*
4549 	 * Expand keybits to hold at least ncount elements.  ncount will be
4550 	 * a power of 2.  This is to try to completely fill leaf nodes (at
4551 	 * least for keys which are not hashes).
4552 	 *
4553 	 * We aren't counting 'in' or 'out', we are counting 'high side'
4554 	 * and 'low side' based on the bit at (1LL << keybits).  We want
4555 	 * everything to be inside in these cases so shift it all to
4556 	 * the low or high side depending on the new high bit.
4557 	 */
4558 	while (((hammer2_key_t)1 << keybits) < ncount) {
4559 		++keybits;
4560 		if (key & ((hammer2_key_t)1 << keybits)) {
4561 			hicount += locount;
4562 			locount = 0;
4563 		} else {
4564 			locount += hicount;
4565 			hicount = 0;
4566 		}
4567 	}
4568 
4569 	if (hicount > locount)
4570 		key |= (hammer2_key_t)1 << keybits;
4571 	else
4572 		key &= ~(hammer2_key_t)1 << keybits;
4573 
4574 	*keyp = key;
4575 
4576 	return (keybits);
4577 }
4578 
4579 #else
4580 
4581 /*
4582  * Directory indirect blocks.
4583  *
4584  * Covers both the inode index (directory of inodes), and directory contents
4585  * (filenames hardlinked to inodes).
4586  *
4587  * Because directory keys are hashed we generally try to cut the space in
 * half.  We accommodate the inode index (which tends to have linearly
4589  * increasing inode numbers) by ensuring that the keyspace is at least large
4590  * enough to fill up the indirect block being created.
4591  */
static int
hammer2_chain_indkey_dir(hammer2_chain_t *parent, hammer2_key_t *keyp,
			 int keybits, hammer2_blockref_t *base, int count,
			 int ncount)
{
	hammer2_blockref_t *bref;
	hammer2_chain_t	*chain;
	hammer2_key_t key_beg;
	hammer2_key_t key_end;
	hammer2_key_t key_next;
	hammer2_key_t key;
	int nkeybits;
	int locount;
	int hicount;
	int maxloops = 300000;

	/*
	 * NOTE: This is the compiled-out (#else) variant of
	 *	 hammer2_chain_indkey_dir, retained for reference.  It
	 *	 differs from the active version only in the inode
	 *	 shortcut below.
	 *
	 * Shortcut if the parent is the inode.  In this situation the
	 * parent has 4+1 directory entries and we are creating an indirect
	 * block capable of holding many more.
	 */
	if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) {
		return 63;
	}

	key = *keyp;
	locount = 0;
	hicount = 0;

	/*
	 * Calculate the range of keys in the array being careful to skip
	 * slots which are overridden with a deletion.
	 */
	key_beg = 0;
	key_end = HAMMER2_KEY_MAX;
	hammer2_spin_ex(&parent->core.spin);

	for (;;) {
		if (--maxloops == 0) {
			/* XXX message names the freemap variant (copy/paste) */
			panic("indkey_freemap shit %p %p:%d\n",
			      parent, base, count);
		}
		chain = hammer2_combined_find(parent, base, count,
					      &key_next,
					      key_beg, key_end,
					      &bref);

		/*
		 * Exhausted search
		 */
		if (bref == NULL)
			break;

		/*
		 * Deleted object
		 */
		if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
			if (key_next == 0 || key_next > key_end)
				break;
			key_beg = key_next;
			continue;
		}

		/*
		 * Use the full live (not deleted) element for the scan
		 * iteration.  HAMMER2 does not allow partial replacements.
		 *
		 * XXX should be built into hammer2_combined_find().
		 */
		key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);

		/*
		 * Expand our calculated key range (key, keybits) to fit
		 * the scanned key.  nkeybits represents the full range
		 * that we will later cut in half (two halves @ nkeybits - 1).
		 */
		nkeybits = keybits;
		if (nkeybits < bref->keybits) {
			if (bref->keybits > 64) {
				kprintf("bad bref chain %p bref %p\n",
					chain, bref);
				Debugger("fubar");
			}
			nkeybits = bref->keybits;
		}
		while (nkeybits < 64 &&
		       (~(((hammer2_key_t)1 << nkeybits) - 1) &
		        (key ^ bref->key)) != 0) {
			++nkeybits;
		}

		/*
		 * If the new key range is larger we have to determine
		 * which side of the new key range the existing keys fall
		 * under by checking the high bit, then collapsing the
		 * locount into the hicount or vise-versa.
		 */
		if (keybits != nkeybits) {
			if (((hammer2_key_t)1 << (nkeybits - 1)) & key) {
				hicount += locount;
				locount = 0;
			} else {
				locount += hicount;
				hicount = 0;
			}
			keybits = nkeybits;
		}

		/*
		 * The newly scanned key will be in the lower half or the
		 * upper half of the (new) key range.
		 */
		if (((hammer2_key_t)1 << (nkeybits - 1)) & bref->key)
			++hicount;
		else
			++locount;

		if (key_next == 0)
			break;
		key_beg = key_next;
	}
	hammer2_spin_unex(&parent->core.spin);
	bref = NULL;	/* now invalid (safety) */

	/*
	 * Adjust keybits to represent half of the full range calculated
	 * above (radix 63 max) for our new indirect block.
	 */
	--keybits;

	/*
	 * Expand keybits to hold at least ncount elements.  ncount will be
	 * a power of 2.  This is to try to completely fill leaf nodes (at
	 * least for keys which are not hashes).
	 *
	 * We aren't counting 'in' or 'out', we are counting 'high side'
	 * and 'low side' based on the bit at (1LL << keybits).  We want
	 * everything to be inside in these cases so shift it all to
	 * the low or high side depending on the new high bit.
	 */
	while (((hammer2_key_t)1 << keybits) < ncount) {
		++keybits;
		if (key & ((hammer2_key_t)1 << keybits)) {
			hicount += locount;
			locount = 0;
		} else {
			locount += hicount;
			hicount = 0;
		}
	}

	/* place the new indirect block on the majority side */
	if (hicount > locount)
		key |= (hammer2_key_t)1 << keybits;
	else
		key &= ~(hammer2_key_t)1 << keybits;

	*keyp = key;

	return (keybits);
}
4752 
4753 #endif
4754 
4755 /*
4756  * Sets CHAIN_DELETED and remove the chain's blockref from the parent if
4757  * it exists.
4758  *
4759  * Both parent and chain must be locked exclusively.
4760  *
4761  * This function will modify the parent if the blockref requires removal
4762  * from the parent's block table.
4763  *
4764  * This function is NOT recursive.  Any entity already pushed into the
4765  * chain (such as an inode) may still need visibility into its contents,
4766  * as well as the ability to read and modify the contents.  For example,
4767  * for an unlinked file which is still open.
4768  *
4769  * Also note that the flusher is responsible for cleaning up empty
4770  * indirect blocks.
4771  */
4772 int
4773 hammer2_chain_delete(hammer2_chain_t *parent, hammer2_chain_t *chain,
4774 		     hammer2_tid_t mtid, int flags)
4775 {
4776 	int error = 0;
4777 
4778 	KKASSERT(hammer2_mtx_owned(&chain->lock));
4779 
4780 	/*
4781 	 * Nothing to do if already marked.
4782 	 *
4783 	 * We need the spinlock on the core whos RBTREE contains chain
4784 	 * to protect against races.
4785 	 */
4786 	if ((chain->flags & HAMMER2_CHAIN_DELETED) == 0) {
4787 		KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0 &&
4788 			 chain->parent == parent);
4789 		error = _hammer2_chain_delete_helper(parent, chain,
4790 						     mtid, flags);
4791 	}
4792 
4793 	/*
4794 	 * Permanent deletions mark the chain as destroyed.
4795 	 */
4796 	if (error == 0) {
4797 		if (flags & HAMMER2_DELETE_PERMANENT)
4798 			atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY);
4799 		hammer2_chain_setflush(chain);
4800 	}
4801 
4802 	return error;
4803 }
4804 
4805 /*
4806  * Returns the index of the nearest element in the blockref array >= elm.
4807  * Returns (count) if no element could be found.
4808  *
4809  * Sets *key_nextp to the next key for loop purposes but does not modify
4810  * it if the next key would be higher than the current value of *key_nextp.
 * Note that *key_nextp can overflow to 0, which should be tested by the
4812  * caller.
4813  *
4814  * WARNING!  Must be called with parent's spinlock held.  Spinlock remains
4815  *	     held through the operation.
4816  */
static int
hammer2_base_find(hammer2_chain_t *parent,
		  hammer2_blockref_t *base, int count,
		  hammer2_key_t *key_nextp,
		  hammer2_key_t key_beg, hammer2_key_t key_end)
{
	hammer2_blockref_t *scan;
	hammer2_key_t scan_end;
	int i;
	int limit;

	/*
	 * Require that the live chain's brefs have already been counted
	 * (CHAIN_COUNTEDBREFS), so core.live_zero is valid and the scan
	 * can be bounded by it below.
	 */
        KKASSERT(parent->flags & HAMMER2_CHAIN_COUNTEDBREFS);

	/*
	 * Degenerate case: empty or missing block array.
	 */
	if (count == 0 || base == NULL)
		return(count);

	/*
	 * Sequential optimization using parent->cache_index.  This is
	 * the most likely scenario.
	 *
	 * We can avoid trailing empty entries on live chains, otherwise
	 * we might have to check the whole block array.  live_zero is
	 * the index just past the last live element.
	 */
	i = parent->cache_index;	/* SMP RACE OK */
	cpu_ccfence();
	limit = parent->core.live_zero;
	if (i >= limit)
		i = limit - 1;
	if (i < 0)
		i = 0;
	KKASSERT(i < count);

	/*
	 * Search backwards from the cached index until we reach an
	 * element whose key does not exceed key_beg.  Empty slots
	 * (type == 0) are stepped over.
	 */
	scan = &base[i];
	while (i > 0 && (scan->type == 0 || scan->key > key_beg)) {
		--scan;
		--i;
	}
	parent->cache_index = i;

	/*
	 * Search forwards, stop when we find a scan element which
	 * encloses the key or until we know that there are no further
	 * elements.
	 */
	while (i < count) {
		if (scan->type != 0) {
			scan_end = scan->key +
				   ((hammer2_key_t)1 << scan->keybits) - 1;
			if (scan->key > key_beg || scan_end >= key_beg)
				break;
		}
		if (i >= limit)
			return (count);		/* no live elements remain */
		++scan;
		++i;
	}
	if (i != count) {
		parent->cache_index = i;
		if (i >= limit) {
			i = count;
		} else {
			/*
			 * Set *key_nextp to the key just past the matched
			 * element, but only if that lowers the caller's
			 * current value (a current value of 0 means it
			 * previously overflowed).
			 */
			scan_end = scan->key +
				   ((hammer2_key_t)1 << scan->keybits);
			if (scan_end && (*key_nextp > scan_end ||
					 *key_nextp == 0)) {
				*key_nextp = scan_end;
			}
		}
	}
	return (i);
}
4898 
4899 /*
4900  * Do a combined search and return the next match either from the blockref
4901  * array or from the in-memory chain.  Sets *bresp to the returned bref in
4902  * both cases, or sets it to NULL if the search exhausted.  Only returns
4903  * a non-NULL chain if the search matched from the in-memory chain.
4904  *
 * When no in-memory chain matches, the result is returned solely as a
 * non-NULL bref in *bresp and the returned chain will be NULL.
4907  *
4908  *
4909  * The returned chain is not locked or referenced.  Use the returned bref
4910  * to determine if the search exhausted or not.  Iterate if the base find
4911  * is chosen but matches a deleted chain.
4912  *
4913  * WARNING!  Must be called with parent's spinlock held.  Spinlock remains
4914  *	     held through the operation.
4915  */
hammer2_chain_t *
hammer2_combined_find(hammer2_chain_t *parent,
		      hammer2_blockref_t *base, int count,
		      hammer2_key_t *key_nextp,
		      hammer2_key_t key_beg, hammer2_key_t key_end,
		      hammer2_blockref_t **bresp)
{
	hammer2_blockref_t *bref;
	hammer2_chain_t *chain;
	int i;

	/*
	 * Lookup in block array and in rbtree.  Prime *key_nextp to one
	 * past the requested range; either search may pull it back.
	 */
	*key_nextp = key_end + 1;
	i = hammer2_base_find(parent, base, count, key_nextp,
			      key_beg, key_end);
	chain = hammer2_chain_find(parent, key_nextp, key_beg, key_end);

	/*
	 * Neither matched
	 */
	if (i == count && chain == NULL) {
		*bresp = NULL;
		return(NULL);
	}

	/*
	 * Only chain matched.
	 */
	if (i == count) {
		bref = &chain->bref;
		goto found;
	}

	/*
	 * Only blockref matched.
	 */
	if (chain == NULL) {
		bref = &base[i];
		goto found;
	}

	/*
	 * Both in-memory and blockref matched, select the nearer element.
	 *
	 * If both are flush with the left-hand side or both are the
	 * same distance away, select the chain.  In this situation the
	 * chain must have been loaded from the matching blockmap.
	 */
	if ((chain->bref.key <= key_beg && base[i].key <= key_beg) ||
	    chain->bref.key == base[i].key) {
		KKASSERT(chain->bref.key == base[i].key);
		bref = &chain->bref;
		goto found;
	}

	/*
	 * Select the nearer key.  When the blockref side wins, NULL out
	 * chain so the caller can tell the match came from the media
	 * block array.
	 */
	if (chain->bref.key < base[i].key) {
		bref = &chain->bref;
	} else {
		bref = &base[i];
		chain = NULL;
	}

	/*
	 * If the bref is out of bounds we've exhausted our search.
	 */
found:
	if (bref->key > key_end) {
		*bresp = NULL;
		chain = NULL;
	} else {
		*bresp = bref;
	}
	return(chain);
}
4995 
4996 /*
4997  * Locate the specified block array element and delete it.  The element
4998  * must exist.
4999  *
5000  * The spin lock on the related chain must be held.
5001  *
5002  * NOTE: live_count was adjusted when the chain was deleted, so it does not
5003  *	 need to be adjusted when we commit the media change.
5004  */
void
hammer2_base_delete(hammer2_chain_t *parent,
		    hammer2_blockref_t *base, int count,
		    hammer2_chain_t *chain)
{
	hammer2_blockref_t *elm = &chain->bref;
	hammer2_blockref_t *scan;
	hammer2_key_t key_next;
	int i;

	/*
	 * Delete element.  Expect the element to exist.
	 *
	 * XXX see caller, flush code not yet sophisticated enough to prevent
	 *     re-flushed in some cases.
	 */
	key_next = 0; /* max range */
	i = hammer2_base_find(parent, base, count, &key_next,
			      elm->key, elm->key);
	scan = &base[i];
	if (i == count || scan->type == 0 ||
	    scan->key != elm->key ||
	    ((chain->flags & HAMMER2_CHAIN_BMAPUPD) == 0 &&
	     scan->keybits != elm->keybits)) {
		hammer2_spin_unex(&parent->core.spin);
		panic("delete base %p element not found at %d/%d elm %p\n",
		      base, i, count, elm);
		return;
	}

	/*
	 * Update stats and zero the entry.
	 *
	 * NOTE: Handle radix == 0 (0 bytes) case.
	 */
	if ((int)(scan->data_off & HAMMER2_OFF_MASK_RADIX)) {
		parent->bref.embed.stats.data_count -= (hammer2_off_t)1 <<
				(int)(scan->data_off & HAMMER2_OFF_MASK_RADIX);
	}
	switch(scan->type) {
	case HAMMER2_BREF_TYPE_INODE:
		--parent->bref.embed.stats.inode_count;
		/* fall through */
	case HAMMER2_BREF_TYPE_DATA:
		/*
		 * leaf_count saturates at LEAF_MAX and is then no longer
		 * exact; flag the chain instead of decrementing.
		 */
		if (parent->bref.leaf_count == HAMMER2_BLOCKREF_LEAF_MAX) {
			atomic_set_int(&chain->flags,
				       HAMMER2_CHAIN_HINT_LEAF_COUNT);
		} else {
			if (parent->bref.leaf_count)
				--parent->bref.leaf_count;
		}
		/* fall through */
	case HAMMER2_BREF_TYPE_INDIRECT:
		/*
		 * Indirect-type elements carry aggregate stats which must
		 * be subtracted from the parent (DATA brefs do not).
		 */
		if (scan->type != HAMMER2_BREF_TYPE_DATA) {
			parent->bref.embed.stats.data_count -=
				scan->embed.stats.data_count;
			parent->bref.embed.stats.inode_count -=
				scan->embed.stats.inode_count;
		}
		if (scan->type == HAMMER2_BREF_TYPE_INODE)
			break;
		if (parent->bref.leaf_count == HAMMER2_BLOCKREF_LEAF_MAX) {
			atomic_set_int(&chain->flags,
				       HAMMER2_CHAIN_HINT_LEAF_COUNT);
		} else {
			if (parent->bref.leaf_count <= scan->leaf_count)
				parent->bref.leaf_count = 0;
			else
				parent->bref.leaf_count -= scan->leaf_count;
		}
		break;
	case HAMMER2_BREF_TYPE_DIRENT:
		if (parent->bref.leaf_count == HAMMER2_BLOCKREF_LEAF_MAX) {
			atomic_set_int(&chain->flags,
				       HAMMER2_CHAIN_HINT_LEAF_COUNT);
		} else {
			if (parent->bref.leaf_count)
				--parent->bref.leaf_count;
		}
		/* fall through (default is a no-op) */
	default:
		break;
	}

	bzero(scan, sizeof(*scan));

	/*
	 * We can only optimize parent->core.live_zero for live chains.
	 * If the deleted element was the last live one, pull live_zero
	 * back over any trailing empty slots.
	 */
	if (parent->core.live_zero == i + 1) {
		while (--i >= 0 && base[i].type == 0)
			;
		parent->core.live_zero = i + 1;
	}

	/*
	 * Clear appropriate blockmap flags in chain.
	 */
	atomic_clear_int(&chain->flags, HAMMER2_CHAIN_BMAPPED |
					HAMMER2_CHAIN_BMAPUPD);
}
5105 
5106 /*
5107  * Insert the specified element.  The block array must not already have the
5108  * element and must have space available for the insertion.
5109  *
5110  * The spin lock on the related chain must be held.
5111  *
5112  * NOTE: live_count was adjusted when the chain was deleted, so it does not
5113  *	 need to be adjusted when we commit the media change.
5114  */
void
hammer2_base_insert(hammer2_chain_t *parent,
		    hammer2_blockref_t *base, int count,
		    hammer2_chain_t *chain, hammer2_blockref_t *elm)
{
	hammer2_key_t key_next;
	hammer2_key_t xkey;
	int i;
	int j;
	int k;
	int l;
	int u = 1;	/* records which insertion path ran (debug panic) */

	/*
	 * Insert new element.  Expect the element to not already exist
	 * unless we are replacing it.
	 *
	 * XXX see caller, flush code not yet sophisticated enough to prevent
	 *     re-flushed in some cases.
	 */
	key_next = 0; /* max range */
	i = hammer2_base_find(parent, base, count, &key_next,
			      elm->key, elm->key);

	/*
	 * Sanity check the insertion point.  (i == count) means
	 * insertion at the end.
	 */
	KKASSERT(i >= 0 && i <= count);

	/*
	 * Set appropriate blockmap flags in chain (if not NULL)
	 */
	if (chain)
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_BMAPPED);

	/*
	 * Update the parent's aggregate statistics.  A radix of 0 in
	 * data_off indicates a 0-byte element (nothing to count).
	 */
	if ((int)(elm->data_off & HAMMER2_OFF_MASK_RADIX)) {
		parent->bref.embed.stats.data_count += (hammer2_off_t)1 <<
				(int)(elm->data_off & HAMMER2_OFF_MASK_RADIX);
	}
	switch(elm->type) {
	case HAMMER2_BREF_TYPE_INODE:
		++parent->bref.embed.stats.inode_count;
		/* fall through */
	case HAMMER2_BREF_TYPE_DATA:
		/* leaf_count saturates at HAMMER2_BLOCKREF_LEAF_MAX */
		if (parent->bref.leaf_count != HAMMER2_BLOCKREF_LEAF_MAX)
			++parent->bref.leaf_count;
		/* fall through */
	case HAMMER2_BREF_TYPE_INDIRECT:
		/*
		 * Indirect-type elements carry aggregate stats which are
		 * added into the parent (DATA brefs do not).
		 */
		if (elm->type != HAMMER2_BREF_TYPE_DATA) {
			parent->bref.embed.stats.data_count +=
				elm->embed.stats.data_count;
			parent->bref.embed.stats.inode_count +=
				elm->embed.stats.inode_count;
		}
		if (elm->type == HAMMER2_BREF_TYPE_INODE)
			break;
		if (parent->bref.leaf_count + elm->leaf_count <
		    HAMMER2_BLOCKREF_LEAF_MAX) {
			parent->bref.leaf_count += elm->leaf_count;
		} else {
			parent->bref.leaf_count = HAMMER2_BLOCKREF_LEAF_MAX;
		}
		break;
	case HAMMER2_BREF_TYPE_DIRENT:
		if (parent->bref.leaf_count != HAMMER2_BLOCKREF_LEAF_MAX)
			++parent->bref.leaf_count;
		break;
	default:
		break;
	}


	/*
	 * We can only optimize parent->core.live_zero for live chains.
	 * Appending past the last live element is the cheap common
	 * ordered-insert case.
	 */
	if (i == count && parent->core.live_zero < count) {
		i = parent->core.live_zero++;
		base[i] = *elm;
		return;
	}

	/*
	 * xkey is the last key covered by elm.  The new element must not
	 * overlap or sort after the element at the insertion point.
	 */
	xkey = elm->key + ((hammer2_key_t)1 << elm->keybits) - 1;
	if (i != count && (base[i].key < elm->key || xkey >= base[i].key)) {
		hammer2_spin_unex(&parent->core.spin);
		panic("insert base %p overlapping elements at %d elm %p\n",
		      base, i, elm);
	}

	/*
	 * Try to find an empty slot before or after, shifting the
	 * intervening elements to open a hole at the insert point.
	 */
	j = i;
	k = i;
	while (j > 0 || k < count) {
		--j;
		if (j >= 0 && base[j].type == 0) {
			/* empty slot before i: shift [j+1, i) down one */
			if (j == i - 1) {
				base[j] = *elm;
			} else {
				bcopy(&base[j+1], &base[j],
				      (i - j - 1) * sizeof(*base));
				base[i - 1] = *elm;
			}
			goto validate;
		}
		++k;
		if (k < count && base[k].type == 0) {
			/* empty slot after i: shift [i, k) up one */
			bcopy(&base[i], &base[i+1],
			      (k - i) * sizeof(hammer2_blockref_t));
			base[i] = *elm;

			/*
			 * We can only update parent->core.live_zero for live
			 * chains.
			 */
			if (parent->core.live_zero <= k)
				parent->core.live_zero = k + 1;
			u = 2;
			goto validate;
		}
	}
	panic("hammer2_base_insert: no room!");

	/*
	 * Debugging: verify the array is still sorted with no
	 * overlapping key ranges after the insertion.
	 */
validate:
	key_next = 0;
	for (l = 0; l < count; ++l) {
		if (base[l].type) {
			key_next = base[l].key +
				   ((hammer2_key_t)1 << base[l].keybits) - 1;
			break;
		}
	}
	while (++l < count) {
		if (base[l].type) {
			if (base[l].key <= key_next)
				panic("base_insert %d %d,%d,%d fail %p:%d", u, i, j, k, base, l);
			key_next = base[l].key +
				   ((hammer2_key_t)1 << base[l].keybits) - 1;

		}
	}

}
5265 
5266 #if 0
5267 
5268 /*
5269  * Sort the blockref array for the chain.  Used by the flush code to
5270  * sort the blockref[] array.
5271  *
5272  * The chain must be exclusively locked AND spin-locked.
5273  */
5274 typedef hammer2_blockref_t *hammer2_blockref_p;
5275 
5276 static
5277 int
5278 hammer2_base_sort_callback(const void *v1, const void *v2)
5279 {
5280 	hammer2_blockref_p bref1 = *(const hammer2_blockref_p *)v1;
5281 	hammer2_blockref_p bref2 = *(const hammer2_blockref_p *)v2;
5282 
5283 	/*
5284 	 * Make sure empty elements are placed at the end of the array
5285 	 */
5286 	if (bref1->type == 0) {
5287 		if (bref2->type == 0)
5288 			return(0);
5289 		return(1);
5290 	} else if (bref2->type == 0) {
5291 		return(-1);
5292 	}
5293 
5294 	/*
5295 	 * Sort by key
5296 	 */
5297 	if (bref1->key < bref2->key)
5298 		return(-1);
5299 	if (bref1->key > bref2->key)
5300 		return(1);
5301 	return(0);
5302 }
5303 
void
hammer2_base_sort(hammer2_chain_t *chain)
{
	hammer2_blockref_t *base;
	int count;

	/*
	 * Locate the blockref array embedded in this chain's media data;
	 * the layout depends on the bref type.
	 */
	switch(chain->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		/*
		 * Special shortcut for embedded data returns the inode
		 * itself.  Callers must detect this condition and access
		 * the embedded data (the strategy code does this for us).
		 *
		 * This is only applicable to regular files and softlinks.
		 */
		if (chain->data->ipdata.meta.op_flags &
		    HAMMER2_OPFLAG_DIRECTDATA) {
			return;
		}
		base = &chain->data->ipdata.u.blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_INDIRECT:
		/*
		 * Optimize indirect blocks in the INITIAL state to avoid
		 * I/O.
		 */
		KKASSERT((chain->flags & HAMMER2_CHAIN_INITIAL) == 0);
		base = &chain->data->npdata[0];
		count = chain->bytes / sizeof(hammer2_blockref_t);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		base = &chain->data->voldata.sroot_blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP:
		base = &chain->data->blkset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	default:
		/*
		 * NOTE(review): these messages say "hammer2_chain_lookup"
		 * but this is hammer2_base_sort -- apparently copy/pasted
		 * from the lookup error path (code is under #if 0).
		 */
		kprintf("hammer2_chain_lookup: unrecognized "
			"blockref(A) type: %d",
		        chain->bref.type);
		while (1)
			tsleep(&base, 0, "dead", 0);
		panic("hammer2_chain_lookup: unrecognized "
		      "blockref(A) type: %d",
		      chain->bref.type);
		base = NULL;	/* safety */
		count = 0;	/* safety */
	}
	kqsort(base, count, sizeof(*base), hammer2_base_sort_callback);
}
5358 
5359 #endif
5360 
5361 /*
5362  * Chain memory management
5363  */
void
hammer2_chain_wait(hammer2_chain_t *chain)
{
	/*
	 * Passive one-tick back-off on a busy chain.  This is a plain
	 * timed sleep (no interlock); callers are expected to retry.
	 */
	tsleep(chain, 0, "chnflw", 1);
}
5369 
const hammer2_media_data_t *
hammer2_chain_rdata(hammer2_chain_t *chain)
{
	/* Read-only view of the chain's media data; must be resolved */
	KKASSERT(chain->data != NULL);
	return (chain->data);
}
5376 
hammer2_media_data_t *
hammer2_chain_wdata(hammer2_chain_t *chain)
{
	/* Writable view of the chain's media data; must be resolved */
	KKASSERT(chain->data != NULL);
	return (chain->data);
}
5383 
5384 /*
5385  * Set the check data for a chain.  This can be a heavy-weight operation
5386  * and typically only runs on-flush.  For file data check data is calculated
5387  * when the logical buffers are flushed.
5388  */
void
hammer2_chain_setcheck(hammer2_chain_t *chain, void *bdata)
{
	/* data is being (re)written, it can no longer be all-zeros */
	chain->bref.flags &= ~HAMMER2_BREF_FLAG_ZERO;

	switch(HAMMER2_DEC_CHECK(chain->bref.methods)) {
	case HAMMER2_CHECK_NONE:
		break;
	case HAMMER2_CHECK_DISABLED:
		break;
	case HAMMER2_CHECK_ISCSI32:
		chain->bref.check.iscsi32.value =
			hammer2_icrc32(bdata, chain->bytes);
		break;
	case HAMMER2_CHECK_XXHASH64:
		chain->bref.check.xxhash64.value =
			XXH64(bdata, chain->bytes, XXH_HAMMER2_SEED);
		break;
	case HAMMER2_CHECK_SHA192:
		{
			SHA256_CTX hash_ctx;
			union {
				uint8_t digest[SHA256_DIGEST_LENGTH];
				uint64_t digest64[SHA256_DIGEST_LENGTH/8];
			} u;

			/*
			 * 192-bit check code: fold the 256-bit SHA digest
			 * by xor'ing the fourth quadword into the third,
			 * then store the leading bytes that fit sha192.
			 */
			SHA256_Init(&hash_ctx);
			SHA256_Update(&hash_ctx, bdata, chain->bytes);
			SHA256_Final(u.digest, &hash_ctx);
			u.digest64[2] ^= u.digest64[3];
			bcopy(u.digest,
			      chain->bref.check.sha192.data,
			      sizeof(chain->bref.check.sha192.data));
		}
		break;
	case HAMMER2_CHECK_FREEMAP:
		chain->bref.check.freemap.icrc32 =
			hammer2_icrc32(bdata, chain->bytes);
		break;
	default:
		kprintf("hammer2_chain_setcheck: unknown check type %02x\n",
			chain->bref.methods);
		break;
	}
}
5434 
5435 int
5436 hammer2_chain_testcheck(hammer2_chain_t *chain, void *bdata)
5437 {
5438 	uint32_t check32;
5439 	uint64_t check64;
5440 	int r;
5441 
5442 	if (chain->bref.flags & HAMMER2_BREF_FLAG_ZERO)
5443 		return 1;
5444 
5445 	switch(HAMMER2_DEC_CHECK(chain->bref.methods)) {
5446 	case HAMMER2_CHECK_NONE:
5447 		r = 1;
5448 		break;
5449 	case HAMMER2_CHECK_DISABLED:
5450 		r = 1;
5451 		break;
5452 	case HAMMER2_CHECK_ISCSI32:
5453 		check32 = hammer2_icrc32(bdata, chain->bytes);
5454 		r = (chain->bref.check.iscsi32.value == check32);
5455 		if (r == 0) {
5456 			kprintf("chain %016jx.%02x meth=%02x CHECK FAIL "
5457 				"(flags=%08x, bref/data %08x/%08x)\n",
5458 				chain->bref.data_off,
5459 				chain->bref.type,
5460 				chain->bref.methods,
5461 				chain->flags,
5462 				chain->bref.check.iscsi32.value,
5463 				check32);
5464 		}
5465 		hammer2_check_icrc32 += chain->bytes;
5466 		break;
5467 	case HAMMER2_CHECK_XXHASH64:
5468 		check64 = XXH64(bdata, chain->bytes, XXH_HAMMER2_SEED);
5469 		r = (chain->bref.check.xxhash64.value == check64);
5470 		if (r == 0) {
5471 			kprintf("chain %016jx.%02x key=%016jx "
5472 				"meth=%02x CHECK FAIL "
5473 				"(flags=%08x, bref/data %016jx/%016jx)\n",
5474 				chain->bref.data_off,
5475 				chain->bref.type,
5476 				chain->bref.key,
5477 				chain->bref.methods,
5478 				chain->flags,
5479 				chain->bref.check.xxhash64.value,
5480 				check64);
5481 		}
5482 		hammer2_check_xxhash64 += chain->bytes;
5483 		break;
5484 	case HAMMER2_CHECK_SHA192:
5485 		{
5486 			SHA256_CTX hash_ctx;
5487 			union {
5488 				uint8_t digest[SHA256_DIGEST_LENGTH];
5489 				uint64_t digest64[SHA256_DIGEST_LENGTH/8];
5490 			} u;
5491 
5492 			SHA256_Init(&hash_ctx);
5493 			SHA256_Update(&hash_ctx, bdata, chain->bytes);
5494 			SHA256_Final(u.digest, &hash_ctx);
5495 			u.digest64[2] ^= u.digest64[3];
5496 			if (bcmp(u.digest,
5497 				 chain->bref.check.sha192.data,
5498 			         sizeof(chain->bref.check.sha192.data)) == 0) {
5499 				r = 1;
5500 			} else {
5501 				r = 0;
5502 				kprintf("chain %016jx.%02x meth=%02x "
5503 					"CHECK FAIL\n",
5504 					chain->bref.data_off,
5505 					chain->bref.type,
5506 					chain->bref.methods);
5507 			}
5508 		}
5509 		break;
5510 	case HAMMER2_CHECK_FREEMAP:
5511 		r = (chain->bref.check.freemap.icrc32 ==
5512 		     hammer2_icrc32(bdata, chain->bytes));
5513 		if (r == 0) {
5514 			kprintf("chain %016jx.%02x meth=%02x "
5515 				"CHECK FAIL\n",
5516 				chain->bref.data_off,
5517 				chain->bref.type,
5518 				chain->bref.methods);
5519 			kprintf("freemap.icrc %08x icrc32 %08x (%d)\n",
5520 				chain->bref.check.freemap.icrc32,
5521 				hammer2_icrc32(bdata, chain->bytes),
5522 					       chain->bytes);
5523 			if (chain->dio)
5524 				kprintf("dio %p buf %016jx,%d bdata %p/%p\n",
5525 					chain->dio, chain->dio->bp->b_loffset,
5526 					chain->dio->bp->b_bufsize, bdata,
5527 					chain->dio->bp->b_data);
5528 		}
5529 
5530 		break;
5531 	default:
5532 		kprintf("hammer2_chain_setcheck: unknown check type %02x\n",
5533 			chain->bref.methods);
5534 		r = 1;
5535 		break;
5536 	}
5537 	return r;
5538 }
5539 
5540 /*
5541  * Acquire the chain and parent representing the specified inode for the
5542  * device at the specified cluster index.
5543  *
5544  * The flags passed in are LOOKUP flags, not RESOLVE flags.
5545  *
5546  * If we are unable to locate the hardlink, INVAL is returned and *chainp
5547  * will be NULL.  *parentp may still be set error or not, or NULL if the
5548  * parent itself could not be resolved.
5549  *
5550  * Caller must pass-in a valid or NULL *parentp or *chainp.  The passed-in
5551  * *parentp and *chainp will be unlocked if not NULL.
5552  */
5553 int
5554 hammer2_chain_inode_find(hammer2_pfs_t *pmp, hammer2_key_t inum,
5555 			 int clindex, int flags,
5556 			 hammer2_chain_t **parentp, hammer2_chain_t **chainp)
5557 {
5558 	hammer2_chain_t *parent;
5559 	hammer2_chain_t *rchain;
5560 	hammer2_key_t key_dummy;
5561 	int resolve_flags;
5562 	int error;
5563 
5564 	resolve_flags = (flags & HAMMER2_LOOKUP_SHARED) ?
5565 			HAMMER2_RESOLVE_SHARED : 0;
5566 
5567 	/*
5568 	 * Caller expects us to replace these.
5569 	 */
5570 	if (*chainp) {
5571 		hammer2_chain_unlock(*chainp);
5572 		hammer2_chain_drop(*chainp);
5573 		*chainp = NULL;
5574 	}
5575 	if (*parentp) {
5576 		hammer2_chain_unlock(*parentp);
5577 		hammer2_chain_drop(*parentp);
5578 		*parentp = NULL;
5579 	}
5580 
5581 	/*
5582 	 * Inodes hang off of the iroot (bit 63 is clear, differentiating
5583 	 * inodes from root directory entries in the key lookup).
5584 	 */
5585 	parent = hammer2_inode_chain(pmp->iroot, clindex, resolve_flags);
5586 	rchain = NULL;
5587 	if (parent) {
5588 		rchain = hammer2_chain_lookup(&parent, &key_dummy,
5589 					      inum, inum,
5590 					      &error, flags);
5591 	} else {
5592 		error = HAMMER2_ERROR_EIO;
5593 	}
5594 	*parentp = parent;
5595 	*chainp = rchain;
5596 
5597 	return error;
5598 }
5599 
5600 /*
5601  * Used by the bulkscan code to snapshot the synchronized storage for
5602  * a volume, allowing it to be scanned concurrently against normal
5603  * operation.
5604  */
hammer2_chain_t *
hammer2_chain_bulksnap(hammer2_dev_t *hmp)
{
	hammer2_chain_t *copy;

	/*
	 * Create a throw-away chain modeled on the volume chain and
	 * give it a private, zeroed copy of the volume data sized to
	 * voldata only.
	 */
	copy = hammer2_chain_alloc(hmp, hmp->spmp, &hmp->vchain.bref);
	copy->data = kmalloc(sizeof(copy->data->voldata),
			     hmp->mchain,
			     M_WAITOK | M_ZERO);
	/* snapshot the synchronized volume data under the voldata lock */
	hammer2_voldata_lock(hmp);
	copy->data->voldata = hmp->volsync;
	hammer2_voldata_unlock(hmp);

	return copy;
}
5620 
void
hammer2_chain_bulkdrop(hammer2_chain_t *copy)
{
	/*
	 * Release a snapshot chain created by hammer2_chain_bulksnap().
	 * The data buffer was kmalloc'd privately by bulksnap, so it is
	 * freed here before the chain itself is dropped.
	 */
	KKASSERT(copy->bref.type == HAMMER2_BREF_TYPE_VOLUME);
	KKASSERT(copy->data);
	kfree(copy->data, copy->hmp->mchain);
	copy->data = NULL;
	atomic_add_long(&hammer2_chain_allocs, -1);
	hammer2_chain_drop(copy);
}
5631 
5632 /*
5633  * Returns non-zero if the chain (INODE or DIRENT) matches the
5634  * filename.
5635  */
5636 int
5637 hammer2_chain_dirent_test(hammer2_chain_t *chain, const char *name,
5638 			  size_t name_len)
5639 {
5640 	const hammer2_inode_data_t *ripdata;
5641 	const hammer2_dirent_head_t *den;
5642 
5643 	if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
5644 		ripdata = &chain->data->ipdata;
5645 		if (ripdata->meta.name_len == name_len &&
5646 		    bcmp(ripdata->filename, name, name_len) == 0) {
5647 			return 1;
5648 		}
5649 	}
5650 	if (chain->bref.type == HAMMER2_BREF_TYPE_DIRENT &&
5651 	   chain->bref.embed.dirent.namlen == name_len) {
5652 		den = &chain->bref.embed.dirent;
5653 		if (name_len > sizeof(chain->bref.check.buf) &&
5654 		    bcmp(chain->data->buf, name, name_len) == 0) {
5655 			return 1;
5656 		}
5657 		if (name_len <= sizeof(chain->bref.check.buf) &&
5658 		    bcmp(chain->bref.check.buf, name, name_len) == 0) {
5659 			return 1;
5660 		}
5661 	}
5662 	return 0;
5663 }
5664