xref: /dragonfly/sys/vfs/hammer2/hammer2_chain.c (revision 31c7ac8b)
1 /*
2  * Copyright (c) 2011-2014 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in
16  *    the documentation and/or other materials provided with the
17  *    distribution.
18  * 3. Neither the name of The DragonFly Project nor the names of its
19  *    contributors may be used to endorse or promote products derived
20  *    from this software without specific, prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
26  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 /*
36  * This subsystem implements most of the core support functions for
37  * the hammer2_chain structure.
38  *
39  * Chains are the in-memory version on media objects (volume header, inodes,
40  * indirect blocks, data blocks, etc).  Chains represent a portion of the
41  * HAMMER2 topology.
42  *
43  * A chain is topologically stable once it has been inserted into the
44  * in-memory topology.  Modifications which copy, move, or resize the chain
45  * are handled via the DELETE-DUPLICATE mechanic where the original chain
46  * stays intact but is marked deleted and a new chain is allocated which
47  * shares the old chain's children.
48  *
49  * This sharing is handled via the hammer2_chain_core structure.
50  *
51  * The DELETE-DUPLICATE mechanism allows the same topological level to contain
52  * many overloadings.  However, our RBTREE mechanics require that there be
53  * no overlaps so we accomplish the overloading by moving conflicting chains
54  * with smaller or equal radii into a sub-RBTREE under the chain being
55  * overloaded.
56  *
57  * DELETE-DUPLICATE is also used when a modification to a chain crosses a
58  * flush synchronization boundary, allowing the flush code to continue flushing
59  * the older version of the topology and not be disrupted by new frontend
60  * operations.
61  *
62  *				LIVE VS FLUSH VIEW
63  *
64  * All lookup and iterate operations and most modifications are done on the
65  * live view.  During flushes lookups are not normally done and modifications
66  * may be run on the flush view.  However, flushes often needs to allocate
67  * blocks and the freemap_alloc/free code issues lookups.  This code is
68  * special cased to use the live view when called from a flush.
69  *
70  * General chain lookup/iteration functions are NOT aware of the flush view,
71  * they only know about live views.
72  */
73 #include <sys/cdefs.h>
74 #include <sys/param.h>
75 #include <sys/systm.h>
76 #include <sys/types.h>
77 #include <sys/lock.h>
78 #include <sys/kern_syscall.h>
79 #include <sys/uuid.h>
80 
81 #include "hammer2.h"
82 
83 static int hammer2_indirect_optimize;	/* XXX SYSCTL */
84 
85 static hammer2_chain_t *hammer2_chain_create_indirect(
86 		hammer2_trans_t *trans, hammer2_chain_t *parent,
87 		hammer2_key_t key, int keybits, int for_type, int *errorp);
88 static void hammer2_chain_drop_data(hammer2_chain_t *chain, int lastdrop);
89 static void adjreadcounter(hammer2_blockref_t *bref, size_t bytes);
90 static hammer2_chain_t *hammer2_combined_find(
91 		hammer2_chain_t *parent,
92 		hammer2_blockref_t *base, int count,
93 		int *cache_indexp, hammer2_key_t *key_nextp,
94 		hammer2_key_t key_beg, hammer2_key_t key_end,
95 		hammer2_blockref_t **bresp);
96 
97 /*
98  * Basic RBTree for chains (core->rbtree and core->dbtree).  Chains cannot
99  * overlap in the RB trees.  Deleted chains are moved from rbtree to either
100  * dbtree or to dbq.
101  *
102  * Chains in delete-duplicate sequences can always iterate through core_entry
103  * to locate the live version of the chain.
104  */
105 RB_GENERATE(hammer2_chain_tree, hammer2_chain, rbnode, hammer2_chain_cmp);
106 
107 int
108 hammer2_chain_cmp(hammer2_chain_t *chain1, hammer2_chain_t *chain2)
109 {
110 	hammer2_key_t c1_beg;
111 	hammer2_key_t c1_end;
112 	hammer2_key_t c2_beg;
113 	hammer2_key_t c2_end;
114 
115 	/*
116 	 * Compare chains.  Overlaps are not supposed to happen and catch
117 	 * any software issues early we count overlaps as a match.
118 	 */
119 	c1_beg = chain1->bref.key;
120 	c1_end = c1_beg + ((hammer2_key_t)1 << chain1->bref.keybits) - 1;
121 	c2_beg = chain2->bref.key;
122 	c2_end = c2_beg + ((hammer2_key_t)1 << chain2->bref.keybits) - 1;
123 
124 	if (c1_end < c2_beg)	/* fully to the left */
125 		return(-1);
126 	if (c1_beg > c2_end)	/* fully to the right */
127 		return(1);
128 	return(0);		/* overlap (must not cross edge boundary) */
129 }
130 
131 static __inline
132 int
133 hammer2_isclusterable(hammer2_chain_t *chain)
134 {
135 	if (hammer2_cluster_enable) {
136 		if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
137 		    chain->bref.type == HAMMER2_BREF_TYPE_INODE ||
138 		    chain->bref.type == HAMMER2_BREF_TYPE_DATA) {
139 			return(1);
140 		}
141 	}
142 	return(0);
143 }
144 
145 /*
146  * Recursively set update_hi starting at chain up through to the root.
147  *
148  * This controls top-down visibility for flushes.  The child has just one
149  * 'above' core, but the core itself can be multi-homed with parents iterated
150  * via core->ownerq.  The last parent is the 'live' parent (all others had to
151  * have been delete-duplicated).  We always propagate upward through the live
152  * parent.
153  *
154  * This function is not used during a flush (except when the flush is
155  * allocating which requires the live tree).  The flush keeps track of its
156  * recursion itself.
157  *
158  * XXX SMP races
159  */
160 void
161 hammer2_chain_setsubmod(hammer2_trans_t *trans, hammer2_chain_t *chain)
162 {
163 	hammer2_chain_core_t *above;
164 
165 	if (chain->update_hi < trans->sync_tid)
166 		chain->update_hi = trans->sync_tid;
167 
168 	while ((above = chain->above) != NULL) {
169 		spin_lock(&above->cst.spin);
170 		chain = TAILQ_LAST(&above->ownerq, h2_core_list);
171 		if (chain->update_hi < trans->sync_tid)
172 			chain->update_hi = trans->sync_tid;
173 		spin_unlock(&above->cst.spin);
174 	}
175 }
176 
177 /*
178  * Allocate a new disconnected chain element representing the specified
179  * bref.  chain->refs is set to 1 and the passed bref is copied to
180  * chain->bref.  chain->bytes is derived from the bref.
181  *
182  * chain->core is NOT allocated and the media data and bp pointers are left
183  * NULL.  The caller must call chain_core_alloc() to allocate or associate
184  * a core with the chain.
185  *
186  * NOTE: Returns a referenced but unlocked (because there is no core) chain.
187  */
188 hammer2_chain_t *
189 hammer2_chain_alloc(hammer2_mount_t *hmp, hammer2_pfsmount_t *pmp,
190 		    hammer2_trans_t *trans, hammer2_blockref_t *bref)
191 {
192 	hammer2_chain_t *chain;
193 	u_int bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
194 
195 	/*
196 	 * Construct the appropriate system structure.
197 	 */
198 	switch(bref->type) {
199 	case HAMMER2_BREF_TYPE_INODE:
200 	case HAMMER2_BREF_TYPE_INDIRECT:
201 	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
202 	case HAMMER2_BREF_TYPE_DATA:
203 	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
204 		/*
205 		 * Chain's are really only associated with the hmp but we
206 		 * maintain a pmp association for per-mount memory tracking
207 		 * purposes.  The pmp can be NULL.
208 		 */
209 		chain = kmalloc(sizeof(*chain), hmp->mchain, M_WAITOK | M_ZERO);
210 		if (pmp)
211 			chain->pmp = pmp;
212 		break;
213 	case HAMMER2_BREF_TYPE_VOLUME:
214 	case HAMMER2_BREF_TYPE_FREEMAP:
215 		chain = NULL;
216 		panic("hammer2_chain_alloc volume type illegal for op");
217 	default:
218 		chain = NULL;
219 		panic("hammer2_chain_alloc: unrecognized blockref type: %d",
220 		      bref->type);
221 	}
222 
223 	chain->hmp = hmp;
224 	chain->bref = *bref;
225 	chain->bytes = bytes;
226 	chain->refs = 1;
227 	chain->flags = HAMMER2_CHAIN_ALLOCATED;
228 	chain->delete_tid = HAMMER2_MAX_TID;
229 
230 	/*
231 	 * Set modify_tid if a transaction is creating the inode.
232 	 * Enforce update_lo = 0 so nearby transactions do not think
233 	 * it has been flushed when it hasn't.
234 	 *
235 	 * NOTE: When loading a chain from backing store or creating a
236 	 *	 snapshot, trans will be NULL and the caller is responsible
237 	 *	 for setting these fields.
238 	 */
239 	if (trans) {
240 		chain->modify_tid = trans->sync_tid;
241 		chain->update_lo = 0;
242 	}
243 
244 	return (chain);
245 }
246 
247 /*
248  * Associate an existing core with the chain or allocate a new core.
249  *
250  * The core is not locked.  No additional refs on the chain are made.
251  * (trans) must not be NULL if (core) is not NULL.
252  *
253  * When chains are delete-duplicated during flushes we insert nchain on
254  * the ownerq after ochain instead of at the end in order to give the
255  * drop code visibility in the correct order, otherwise drops can be missed.
256  */
257 void
258 hammer2_chain_core_alloc(hammer2_trans_t *trans,
259 			 hammer2_chain_t *nchain, hammer2_chain_t *ochain)
260 {
261 	hammer2_chain_core_t *core;
262 
263 	KKASSERT(nchain->core == NULL);
264 
265 	if (ochain == NULL) {
266 		/*
267 		 * Fresh core under nchain (no multi-homing of ochain's
268 		 * sub-tree).
269 		 */
270 		core = kmalloc(sizeof(*core), nchain->hmp->mchain,
271 			       M_WAITOK | M_ZERO);
272 		TAILQ_INIT(&core->ownerq);
273 		TAILQ_INIT(&core->dbq);
274 		RB_INIT(&core->rbtree);	/* live chains */
275 		RB_INIT(&core->dbtree);	/* deleted original (bmapped) chains */
276 		core->sharecnt = 1;
277 		core->good = 0x1234;
278 		nchain->core = core;
279 		ccms_cst_init(&core->cst, nchain);
280 		TAILQ_INSERT_TAIL(&core->ownerq, nchain, core_entry);
281 	} else {
282 		/*
283 		 * Propagate the PFSROOT flag which we set on all subdirs
284 		 * under the super-root.
285 		 */
286 		atomic_set_int(&nchain->flags,
287 			       ochain->flags & HAMMER2_CHAIN_PFSROOT);
288 
289 		/*
290 		 * Duplicating ochain -> nchain.  Set the DUPLICATED flag on
291 		 * ochain if nchain is not a snapshot.
292 		 *
293 		 * It is possible for the DUPLICATED flag to already be
294 		 * set when called via a flush operation because flush
295 		 * operations may have to work on elements with delete_tid's
296 		 * beyond the flush sync_tid.  In this situation we must
297 		 * ensure that nchain is placed just after ochain in the
298 		 * ownerq and that the DUPLICATED flag is set on nchain so
299 		 * 'live' operations skip past it to the correct chain.
300 		 *
301 		 * The flusher understands the blockref synchronization state
302 		 * for any stale chains by observing bref.mirror_tid, which
303 		 * delete-duplicate replicates.
304 		 *
305 		 * WARNING! However, the case is disallowed when the flusher
306 		 *	    is allocating freemap space because this entails
307 		 *	    more than just adjusting a block table.
308 		 */
309 		if (ochain->flags & HAMMER2_CHAIN_DUPLICATED) {
310 			KKASSERT((trans->flags &
311 				  (HAMMER2_TRANS_ISFLUSH |
312 				   HAMMER2_TRANS_ISALLOCATING)) ==
313 			         HAMMER2_TRANS_ISFLUSH);
314 			atomic_set_int(&nchain->flags,
315 				       HAMMER2_CHAIN_DUPLICATED);
316 		}
317 		if ((nchain->flags & HAMMER2_CHAIN_SNAPSHOT) == 0) {
318 			atomic_set_int(&ochain->flags,
319 				       HAMMER2_CHAIN_DUPLICATED);
320 		}
321 		core = ochain->core;
322 		atomic_add_int(&core->sharecnt, 1);
323 
324 		spin_lock(&core->cst.spin);
325 		nchain->core = core;
326 
327 		/*
328 		 * Maintain ordering for refactor test so we don't skip over
329 		 * a snapshot.  Also, during flushes, delete-duplications
330 		 * for block-table updates can occur on ochains already
331 		 * deleted (delete-duplicated by a later transaction), or
332 		 * on forward-indexed ochains.  We must properly insert
333 		 * nchain relative to ochain.
334 		 */
335 		if (trans && trans->sync_tid < ochain->modify_tid) {
336 			TAILQ_INSERT_BEFORE(ochain, nchain, core_entry);
337 		} else {
338 			TAILQ_INSERT_AFTER(&core->ownerq, ochain,
339 					   nchain, core_entry);
340 		}
341 		spin_unlock(&core->cst.spin);
342 	}
343 }
344 
345 /*
346  * Add a reference to a chain element, preventing its destruction.
347  */
348 void
349 hammer2_chain_ref(hammer2_chain_t *chain)
350 {
351 	atomic_add_int(&chain->refs, 1);
352 }
353 
354 /*
355  * Insert the chain in the core rbtree.
356  *
357  * Normal insertions are placed in the live rbtree.  Insertion of a deleted
358  * chain is a special case used by the flush code that is placed on the
359  * unstaged deleted list to avoid confusing the live view.
360  */
361 #define HAMMER2_CHAIN_INSERT_SPIN	0x0001
362 #define HAMMER2_CHAIN_INSERT_LIVE	0x0002
363 #define HAMMER2_CHAIN_INSERT_RACE	0x0004
364 
365 static
366 int
367 hammer2_chain_insert(hammer2_chain_core_t *above,
368 		     hammer2_chain_t *ochain, hammer2_chain_t *nchain,
369 		     int flags, int generation)
370 {
371 	hammer2_chain_t *xchain;
372 	int error = 0;
373 
374 	if (flags & HAMMER2_CHAIN_INSERT_SPIN)
375 		spin_lock(&above->cst.spin);
376 
377 	/*
378 	 * Interlocked by spinlock, check for race
379 	 */
380 	if ((flags & HAMMER2_CHAIN_INSERT_RACE) &&
381 	    above->generation != generation) {
382 		error = EAGAIN;
383 		goto failed;
384 	}
385 
386 	/*
387 	 * Insert nchain
388 	 */
389 	if (nchain->flags & HAMMER2_CHAIN_DELETED) {
390 		if (ochain && (ochain->flags & HAMMER2_CHAIN_BMAPPED)) {
391 			if (ochain->flags & HAMMER2_CHAIN_ONDBTREE) {
392 				RB_REMOVE(hammer2_chain_tree,
393 					  &above->dbtree, ochain);
394 				atomic_clear_int(&ochain->flags,
395 						 HAMMER2_CHAIN_ONDBTREE);
396 				TAILQ_INSERT_TAIL(&above->dbq,
397 						  ochain, db_entry);
398 				atomic_set_int(&ochain->flags,
399 						HAMMER2_CHAIN_ONDBQ);
400 			}
401 			/* clear BMAPPED (DBTREE, sometimes RBTREE) */
402 			atomic_clear_int(&ochain->flags, HAMMER2_CHAIN_BMAPPED);
403 
404 			xchain = RB_INSERT(hammer2_chain_tree,
405 					   &above->dbtree, nchain);
406 			KKASSERT(xchain == NULL);
407 			atomic_set_int(&nchain->flags,
408 				       HAMMER2_CHAIN_ONDBTREE |
409 				       HAMMER2_CHAIN_BMAPPED);
410 		} else {
411 			TAILQ_INSERT_TAIL(&above->dbq, nchain, db_entry);
412 			atomic_set_int(&nchain->flags, HAMMER2_CHAIN_ONDBQ);
413 		}
414 	} else {
415 		xchain = RB_INSERT(hammer2_chain_tree, &above->rbtree, nchain);
416 		KASSERT(xchain == NULL,
417 			("hammer2_chain_insert: collision %p", nchain));
418 		atomic_set_int(&nchain->flags, HAMMER2_CHAIN_ONRBTREE);
419 	}
420 
421 	nchain->above = above;
422 	++above->chain_count;
423 	++above->generation;
424 
425 	/*
426 	 * We have to keep track of the effective live-view blockref count
427 	 * so the create code knows when to push an indirect block.
428 	 */
429 	if (flags & HAMMER2_CHAIN_INSERT_LIVE)
430 		atomic_add_int(&above->live_count, 1);
431 failed:
432 	if (flags & HAMMER2_CHAIN_INSERT_SPIN)
433 		spin_unlock(&above->cst.spin);
434 	return error;
435 }
436 
437 /*
438  * Drop the caller's reference to the chain.  When the ref count drops to
439  * zero this function will try to disassociate the chain from its parent and
440  * deallocate it, then recursely drop the parent using the implied ref
441  * from the chain's chain->parent.
442  */
443 static hammer2_chain_t *hammer2_chain_lastdrop(hammer2_chain_t *chain,
444 					       struct h2_core_list *delayq);
445 
446 void
447 hammer2_chain_drop(hammer2_chain_t *chain)
448 {
449 	struct h2_core_list delayq;
450 	hammer2_chain_t *scan;
451 	u_int refs;
452 	u_int need = 0;
453 
454 	if (hammer2_debug & 0x200000)
455 		Debugger("drop");
456 
457 	if (chain->flags & HAMMER2_CHAIN_FLUSH_CREATE)
458 		++need;
459 	if (chain->flags & HAMMER2_CHAIN_FLUSH_DELETE)
460 		++need;
461 	if (chain->flags & HAMMER2_CHAIN_MODIFIED)
462 		++need;
463 	KKASSERT(chain->refs > need);
464 
465 	TAILQ_INIT(&delayq);
466 
467 	while (chain) {
468 		refs = chain->refs;
469 		cpu_ccfence();
470 		KKASSERT(refs > 0);
471 
472 		if (refs == 1) {
473 			chain = hammer2_chain_lastdrop(chain, &delayq);
474 		} else {
475 			if (atomic_cmpset_int(&chain->refs, refs, refs - 1))
476 				break;
477 			/* retry the same chain */
478 		}
479 
480 		/*
481 		 * When we've exhausted lastdrop chaining pull off of delayq.
482 		 * chains on delayq are dead but are used to placehold other
483 		 * chains which we added a ref to for the purpose of dropping.
484 		 */
485 		if (chain == NULL) {
486 			hammer2_mount_t *hmp;
487 
488 			if ((scan = TAILQ_FIRST(&delayq)) != NULL) {
489 				chain = (void *)scan->data;
490 				TAILQ_REMOVE(&delayq, scan, core_entry);
491 				scan->flags &= ~HAMMER2_CHAIN_ALLOCATED;
492 				hmp = scan->hmp;
493 				scan->hmp = NULL;
494 				kfree(scan, hmp->mchain);
495 			}
496 		}
497 	}
498 }
499 
500 /*
501  * Safe handling of the 1->0 transition on chain.  Returns a chain for
502  * recursive drop or NULL, possibly returning the same chain if the atomic
503  * op fails.
504  *
505  * Whem two chains need to be recursively dropped we use the chain
506  * we would otherwise free to placehold the additional chain.  It's a bit
507  * convoluted but we can't just recurse without potentially blowing out
508  * the kernel stack.
509  *
510  * The chain cannot be freed if it has a non-empty core (children) or
511  * it is not at the head of ownerq.
512  *
513  * The cst spinlock is allowed nest child-to-parent (not parent-to-child).
514  */
515 static
516 hammer2_chain_t *
517 hammer2_chain_lastdrop(hammer2_chain_t *chain, struct h2_core_list *delayq)
518 {
519 	hammer2_pfsmount_t *pmp;
520 	hammer2_mount_t *hmp;
521 	hammer2_chain_core_t *above;
522 	hammer2_chain_core_t *core;
523 	hammer2_chain_t *rdrop1;
524 	hammer2_chain_t *rdrop2;
525 
526 	/*
527 	 * Spinlock the core and check to see if it is empty.  If it is
528 	 * not empty we leave chain intact with refs == 0.  The elements
529 	 * in core->rbtree are associated with other chains contemporary
530 	 * with ours but not with our chain directly.
531 	 */
532 	if ((core = chain->core) != NULL) {
533 		spin_lock(&core->cst.spin);
534 
535 		/*
536 		 * We can't free non-stale chains with children until we are
537 		 * able to free the children because there might be a flush
538 		 * dependency.  Flushes of stale children (which should also
539 		 * have their deleted flag set) short-cut recursive flush
540 		 * dependencies and can be freed here.  Any flushes which run
541 		 * through stale children due to the flush synchronization
542 		 * point should have a FLUSH_* bit set in the chain and not
543 		 * reach lastdrop at this time.
544 		 *
545 		 * NOTE: We return (chain) on failure to retry.
546 		 */
547 		if (core->chain_count &&
548 		    (chain->flags & HAMMER2_CHAIN_DUPLICATED) == 0) {
549 			if (atomic_cmpset_int(&chain->refs, 1, 0))
550 				chain = NULL;	/* success */
551 			spin_unlock(&core->cst.spin);
552 			return(chain);
553 		}
554 		/* no chains left under us */
555 
556 		/*
557 		 * Various parts of the code might be holding a ref on a
558 		 * stale chain as a placemarker which must be iterated to
559 		 * locate a later non-stale (live) chain.  We must be sure
560 		 * NOT to free the later non-stale chain (which might have
561 		 * no refs).  Otherwise mass confusion may result.
562 		 *
563 		 * The DUPLICATED flag tells us whether the chain is stale
564 		 * or not, so the rule is that any chain whos DUPLICATED flag
565 		 * is NOT set must also be at the head of the ownerq.
566 		 *
567 		 * Note that the DELETED flag is not involved.  That is, a
568 		 * live chain can represent a deletion that has not yet been
569 		 * flushed (or still has refs).
570 		 */
571 #if 0
572 		if (TAILQ_NEXT(chain, core_entry) == NULL &&
573 		    TAILQ_FIRST(&core->ownerq) != chain) {
574 #endif
575 		if ((chain->flags & HAMMER2_CHAIN_DUPLICATED) == 0 &&
576 		    TAILQ_FIRST(&core->ownerq) != chain) {
577 			if (atomic_cmpset_int(&chain->refs, 1, 0))
578 				chain = NULL;	/* success */
579 			spin_unlock(&core->cst.spin);
580 			return(chain);
581 		}
582 	}
583 
584 	/*
585 	 * chain->core has no children left so no accessors can get to our
586 	 * chain from there.  Now we have to lock the above core to interlock
587 	 * remaining possible accessors that might bump chain's refs before
588 	 * we can safely drop chain's refs with intent to free the chain.
589 	 */
590 	hmp = chain->hmp;
591 	pmp = chain->pmp;	/* can be NULL */
592 	rdrop1 = NULL;
593 	rdrop2 = NULL;
594 
595 	/*
596 	 * Spinlock the parent and try to drop the last ref on chain.
597 	 * On success remove chain from its parent, otherwise return NULL.
598 	 *
599 	 * (normal core locks are top-down recursive but we define core
600 	 *  spinlocks as bottom-up recursive, so this is safe).
601 	 */
602 	if ((above = chain->above) != NULL) {
603 		spin_lock(&above->cst.spin);
604 		if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) {
605 			/* 1->0 transition failed */
606 			spin_unlock(&above->cst.spin);
607 			if (core)
608 				spin_unlock(&core->cst.spin);
609 			return(chain);	/* retry */
610 		}
611 
612 		/*
613 		 * 1->0 transition successful, remove chain from its
614 		 * above core.
615 		 */
616 		switch (chain->flags & (HAMMER2_CHAIN_ONRBTREE |
617 					HAMMER2_CHAIN_ONDBTREE |
618 					HAMMER2_CHAIN_ONDBQ)) {
619 		case HAMMER2_CHAIN_ONRBTREE:
620 			RB_REMOVE(hammer2_chain_tree, &above->rbtree, chain);
621 			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
622 			break;
623 		case HAMMER2_CHAIN_ONDBTREE:
624 			RB_REMOVE(hammer2_chain_tree, &above->dbtree, chain);
625 			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONDBTREE);
626 			break;
627 		case HAMMER2_CHAIN_ONDBQ:
628 			TAILQ_REMOVE(&above->dbq, chain, db_entry);
629 			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONDBQ);
630 			break;
631 		default:
632 			panic("hammer2_chain_lastdrop: chain %p badflags %08x",
633 			      chain, chain->flags);
634 			break;
635 		}
636 
637 		--above->chain_count;
638 		chain->above = NULL;
639 
640 		/*
641 		 * If our chain was the last chain in the parent's core the
642 		 * core is now empty and its parents might now be droppable.
643 		 * Try to drop the first multi-homed parent by gaining a
644 		 * ref on it here and then dropping it below.
645 		 */
646 		if (above->chain_count == 0) {
647 			rdrop1 = TAILQ_FIRST(&above->ownerq);
648 			if (rdrop1 &&
649 			    atomic_cmpset_int(&rdrop1->refs, 0, 1) == 0) {
650 				rdrop1 = NULL;
651 			}
652 		}
653 		spin_unlock(&above->cst.spin);
654 		above = NULL;	/* safety */
655 	}
656 
657 	/*
658 	 * Successful 1->0 transition and the chain can be destroyed now.
659 	 *
660 	 * We still have the core spinlock (if core is non-NULL), and core's
661 	 * chain_count is 0.  The above spinlock is gone.
662 	 *
663 	 * Remove chain from ownerq.  Once core has no more owners (and no
664 	 * children which is already the case) we can destroy core.
665 	 *
666 	 * If core has more owners we may be able to continue a bottom-up
667 	 * drop with our next sibling.
668 	 */
669 	if (core) {
670 		chain->core = NULL;
671 
672 		TAILQ_REMOVE(&core->ownerq, chain, core_entry);
673 		rdrop2 = TAILQ_FIRST(&core->ownerq);
674 		if (rdrop2 && atomic_cmpset_int(&rdrop2->refs, 0, 1) == 0)
675 			rdrop2 = NULL;
676 		spin_unlock(&core->cst.spin);
677 
678 		/*
679 		 * We can do the final 1->0 transition with an atomic op
680 		 * after releasing core's spinlock.
681 		 */
682 		if (atomic_fetchadd_int(&core->sharecnt, -1) == 1) {
683 			/*
684 			 * On the 1->0 transition of core we can destroy
685 			 * it.
686 			 */
687 			KKASSERT(TAILQ_EMPTY(&core->ownerq));
688 			KKASSERT(RB_EMPTY(&core->rbtree) &&
689 				 RB_EMPTY(&core->dbtree) &&
690 				 TAILQ_EMPTY(&core->dbq) &&
691 				 core->chain_count == 0);
692 			KKASSERT(core->cst.count == 0);
693 			KKASSERT(core->cst.upgrade == 0);
694 			core->good = 0x5678;
695 			kfree(core, hmp->mchain);
696 		}
697 		core = NULL;	/* safety */
698 	}
699 
700 	/*
701 	 * All spin locks are gone, finish freeing stuff.
702 	 */
703 	KKASSERT((chain->flags & (HAMMER2_CHAIN_FLUSH_CREATE |
704 				  HAMMER2_CHAIN_FLUSH_DELETE |
705 				  HAMMER2_CHAIN_MODIFIED)) == 0);
706 	hammer2_chain_drop_data(chain, 1);
707 
708 	KKASSERT(chain->dio == NULL);
709 
710 	/*
711 	 * Once chain resources are gone we can use the now dead chain
712 	 * structure to placehold what might otherwise require a recursive
713 	 * drop, because we have potentially two things to drop and can only
714 	 * return one directly.
715 	 */
716 	if (rdrop1 && rdrop2) {
717 		KKASSERT(chain->flags & HAMMER2_CHAIN_ALLOCATED);
718 		chain->data = (void *)rdrop1;
719 		TAILQ_INSERT_TAIL(delayq, chain, core_entry);
720 		rdrop1 = NULL;
721 	} else if (chain->flags & HAMMER2_CHAIN_ALLOCATED) {
722 		chain->flags &= ~HAMMER2_CHAIN_ALLOCATED;
723 		chain->hmp = NULL;
724 		kfree(chain, hmp->mchain);
725 	}
726 
727 	/*
728 	 * Either or both can be NULL.  We already handled the case where
729 	 * both might not have been NULL.
730 	 */
731 	if (rdrop1)
732 		return(rdrop1);
733 	else
734 		return(rdrop2);
735 }
736 
737 /*
738  * On either last lock release or last drop
739  */
740 static void
741 hammer2_chain_drop_data(hammer2_chain_t *chain, int lastdrop)
742 {
743 	/*hammer2_mount_t *hmp = chain->hmp;*/
744 
745 	switch(chain->bref.type) {
746 	case HAMMER2_BREF_TYPE_VOLUME:
747 	case HAMMER2_BREF_TYPE_FREEMAP:
748 		if (lastdrop)
749 			chain->data = NULL;
750 		break;
751 	default:
752 		KKASSERT(chain->data == NULL);
753 		break;
754 	}
755 }
756 
757 /*
758  * Ref and lock a chain element, acquiring its data with I/O if necessary,
759  * and specify how you would like the data to be resolved.
760  *
761  * Returns 0 on success or an error code if the data could not be acquired.
762  * The chain element is locked on return regardless of whether an error
763  * occurred or not.
764  *
765  * The lock is allowed to recurse, multiple locking ops will aggregate
766  * the requested resolve types.  Once data is assigned it will not be
767  * removed until the last unlock.
768  *
769  * HAMMER2_RESOLVE_NEVER - Do not resolve the data element.
770  *			   (typically used to avoid device/logical buffer
771  *			    aliasing for data)
772  *
773  * HAMMER2_RESOLVE_MAYBE - Do not resolve data elements for chains in
774  *			   the INITIAL-create state (indirect blocks only).
775  *
776  *			   Do not resolve data elements for DATA chains.
777  *			   (typically used to avoid device/logical buffer
778  *			    aliasing for data)
779  *
780  * HAMMER2_RESOLVE_ALWAYS- Always resolve the data element.
781  *
782  * HAMMER2_RESOLVE_SHARED- (flag) The chain is locked shared, otherwise
783  *			   it will be locked exclusive.
784  *
785  * NOTE: Embedded elements (volume header, inodes) are always resolved
786  *	 regardless.
787  *
788  * NOTE: Specifying HAMMER2_RESOLVE_ALWAYS on a newly-created non-embedded
789  *	 element will instantiate and zero its buffer, and flush it on
790  *	 release.
791  *
792  * NOTE: (data) elements are normally locked RESOLVE_NEVER or RESOLVE_MAYBE
793  *	 so as not to instantiate a device buffer, which could alias against
794  *	 a logical file buffer.  However, if ALWAYS is specified the
795  *	 device buffer will be instantiated anyway.
796  *
797  * WARNING! If data must be fetched a shared lock will temporarily be
798  *	    upgraded to exclusive.  However, a deadlock can occur if
799  *	    the caller owns more than one shared lock.
800  */
801 int
802 hammer2_chain_lock(hammer2_chain_t *chain, int how)
803 {
804 	hammer2_mount_t *hmp;
805 	hammer2_chain_core_t *core;
806 	hammer2_blockref_t *bref;
807 	ccms_state_t ostate;
808 	char *bdata;
809 	int error;
810 
811 	/*
812 	 * Ref and lock the element.  Recursive locks are allowed.
813 	 */
814 	if ((how & HAMMER2_RESOLVE_NOREF) == 0)
815 		hammer2_chain_ref(chain);
816 	atomic_add_int(&chain->lockcnt, 1);
817 
818 	hmp = chain->hmp;
819 	KKASSERT(hmp != NULL);
820 
821 	/*
822 	 * Get the appropriate lock.
823 	 */
824 	core = chain->core;
825 	if (how & HAMMER2_RESOLVE_SHARED)
826 		ccms_thread_lock(&core->cst, CCMS_STATE_SHARED);
827 	else
828 		ccms_thread_lock(&core->cst, CCMS_STATE_EXCLUSIVE);
829 
830 	/*
831 	 * If we already have a valid data pointer no further action is
832 	 * necessary.
833 	 */
834 	if (chain->data)
835 		return (0);
836 
837 	/*
838 	 * Do we have to resolve the data?
839 	 */
840 	switch(how & HAMMER2_RESOLVE_MASK) {
841 	case HAMMER2_RESOLVE_NEVER:
842 		return(0);
843 	case HAMMER2_RESOLVE_MAYBE:
844 		if (chain->flags & HAMMER2_CHAIN_INITIAL)
845 			return(0);
846 		if (chain->bref.type == HAMMER2_BREF_TYPE_DATA)
847 			return(0);
848 #if 0
849 		if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE)
850 			return(0);
851 #endif
852 		if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF)
853 			return(0);
854 		/* fall through */
855 	case HAMMER2_RESOLVE_ALWAYS:
856 		break;
857 	}
858 
859 	/*
860 	 * Upgrade to an exclusive lock so we can safely manipulate the
861 	 * buffer cache.  If another thread got to it before us we
862 	 * can just return.
863 	 */
864 	ostate = ccms_thread_lock_upgrade(&core->cst);
865 	if (chain->data) {
866 		ccms_thread_lock_downgrade(&core->cst, ostate);
867 		return (0);
868 	}
869 
870 	/*
871 	 * We must resolve to a device buffer, either by issuing I/O or
872 	 * by creating a zero-fill element.  We do not mark the buffer
873 	 * dirty when creating a zero-fill element (the hammer2_chain_modify()
874 	 * API must still be used to do that).
875 	 *
876 	 * The device buffer is variable-sized in powers of 2 down
877 	 * to HAMMER2_MIN_ALLOC (typically 1K).  A 64K physical storage
878 	 * chunk always contains buffers of the same size. (XXX)
879 	 *
880 	 * The minimum physical IO size may be larger than the variable
881 	 * block size.
882 	 */
883 	bref = &chain->bref;
884 
885 	/*
886 	 * The getblk() optimization can only be used on newly created
887 	 * elements if the physical block size matches the request.
888 	 */
889 	if (chain->flags & HAMMER2_CHAIN_INITIAL) {
890 		error = hammer2_io_new(hmp, bref->data_off, chain->bytes,
891 					&chain->dio);
892 	} else {
893 		error = hammer2_io_bread(hmp, bref->data_off, chain->bytes,
894 					 &chain->dio);
895 		adjreadcounter(&chain->bref, chain->bytes);
896 	}
897 
898 	if (error) {
899 		kprintf("hammer2_chain_lock: I/O error %016jx: %d\n",
900 			(intmax_t)bref->data_off, error);
901 		hammer2_io_bqrelse(&chain->dio);
902 		ccms_thread_lock_downgrade(&core->cst, ostate);
903 		return (error);
904 	}
905 
906 #if 0
907 	/*
908 	 * No need for this, always require that hammer2_chain_modify()
909 	 * be called before any modifying operations.
910 	 */
911 	if ((chain->flags & HAMMER2_CHAIN_MODIFIED) &&
912 	    !hammer2_io_isdirty(chain->dio)) {
913 		hammer2_io_setdirty(chain->dio);
914 	}
915 #endif
916 
917 	/*
918 	 * We can clear the INITIAL state now, we've resolved the buffer
919 	 * to zeros and marked it dirty with hammer2_io_new().
920 	 */
921 	bdata = hammer2_io_data(chain->dio, chain->bref.data_off);
922 	if (chain->flags & HAMMER2_CHAIN_INITIAL) {
923 		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
924 	}
925 
926 	/*
927 	 * Setup the data pointer, either pointing it to an embedded data
928 	 * structure and copying the data from the buffer, or pointing it
929 	 * into the buffer.
930 	 *
931 	 * The buffer is not retained when copying to an embedded data
932 	 * structure in order to avoid potential deadlocks or recursions
933 	 * on the same physical buffer.
934 	 */
935 	switch (bref->type) {
936 	case HAMMER2_BREF_TYPE_VOLUME:
937 	case HAMMER2_BREF_TYPE_FREEMAP:
938 		/*
939 		 * Copy data from bp to embedded buffer
940 		 */
941 		panic("hammer2_chain_lock: called on unresolved volume header");
942 		break;
943 	case HAMMER2_BREF_TYPE_INODE:
944 	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
945 	case HAMMER2_BREF_TYPE_INDIRECT:
946 	case HAMMER2_BREF_TYPE_DATA:
947 	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
948 	default:
949 		/*
950 		 * Point data at the device buffer and leave dio intact.
951 		 */
952 		chain->data = (void *)bdata;
953 		break;
954 	}
955 	ccms_thread_lock_downgrade(&core->cst, ostate);
956 	return (0);
957 }
958 
959 /*
960  * This basically calls hammer2_io_breadcb() but does some pre-processing
961  * of the chain first to handle certain cases.
962  */
963 void
964 hammer2_chain_load_async(hammer2_chain_t *chain,
965 			 void (*callback)(hammer2_io_t *dio,
966 					  hammer2_chain_t *chain,
967 					  void *arg_p, off_t arg_o),
968 			 void *arg_p, off_t arg_o)
969 {
970 	hammer2_mount_t *hmp;
971 	struct hammer2_io *dio;
972 	hammer2_blockref_t *bref;
973 	int error;
974 
975 	if (chain->data) {
976 		callback(NULL, chain, arg_p, arg_o);
977 		return;
978 	}
979 
980 	/*
981 	 * We must resolve to a device buffer, either by issuing I/O or
982 	 * by creating a zero-fill element.  We do not mark the buffer
983 	 * dirty when creating a zero-fill element (the hammer2_chain_modify()
984 	 * API must still be used to do that).
985 	 *
986 	 * The device buffer is variable-sized in powers of 2 down
987 	 * to HAMMER2_MIN_ALLOC (typically 1K).  A 64K physical storage
988 	 * chunk always contains buffers of the same size. (XXX)
989 	 *
990 	 * The minimum physical IO size may be larger than the variable
991 	 * block size.
992 	 */
993 	bref = &chain->bref;
994 	hmp = chain->hmp;
995 
996 	/*
997 	 * The getblk() optimization can only be used on newly created
998 	 * elements if the physical block size matches the request.
999 	 */
1000 	if ((chain->flags & HAMMER2_CHAIN_INITIAL) &&
1001 	    chain->bytes == hammer2_devblksize(chain->bytes)) {
1002 		error = hammer2_io_new(hmp, bref->data_off, chain->bytes, &dio);
1003 		KKASSERT(error == 0);
1004 		callback(dio, chain, arg_p, arg_o);
1005 		return;
1006 	}
1007 
1008 	/*
1009 	 * Otherwise issue a read
1010 	 */
1011 	adjreadcounter(&chain->bref, chain->bytes);
1012 	hammer2_io_breadcb(hmp, bref->data_off, chain->bytes,
1013 			   callback, chain, arg_p, arg_o);
1014 }
1015 
1016 /*
1017  * Unlock and deref a chain element.
1018  *
1019  * On the last lock release any non-embedded data (chain->dio) will be
1020  * retired.
1021  */
1022 void
1023 hammer2_chain_unlock(hammer2_chain_t *chain)
1024 {
1025 	hammer2_chain_core_t *core = chain->core;
1026 	ccms_state_t ostate;
1027 	long *counterp;
1028 	u_int lockcnt;
1029 
1030 	/*
1031 	 * The core->cst lock can be shared across several chains so we
1032 	 * need to track the per-chain lockcnt separately.
1033 	 *
1034 	 * If multiple locks are present (or being attempted) on this
1035 	 * particular chain we can just unlock, drop refs, and return.
1036 	 *
1037 	 * Otherwise fall-through on the 1->0 transition.
1038 	 */
1039 	for (;;) {
1040 		lockcnt = chain->lockcnt;
1041 		KKASSERT(lockcnt > 0);
1042 		cpu_ccfence();
1043 		if (lockcnt > 1) {
1044 			if (atomic_cmpset_int(&chain->lockcnt,
1045 					      lockcnt, lockcnt - 1)) {
1046 				ccms_thread_unlock(&core->cst);
1047 				hammer2_chain_drop(chain);
1048 				return;
1049 			}
1050 		} else {
1051 			if (atomic_cmpset_int(&chain->lockcnt, 1, 0))
1052 				break;
1053 		}
1054 		/* retry */
1055 	}
1056 
1057 	/*
1058 	 * On the 1->0 transition we upgrade the core lock (if necessary)
1059 	 * to exclusive for terminal processing.  If after upgrading we find
1060 	 * that lockcnt is non-zero, another thread is racing us and will
1061 	 * handle the unload for us later on, so just cleanup and return
1062 	 * leaving the data/io intact
1063 	 *
1064 	 * Otherwise if lockcnt is still 0 it is possible for it to become
1065 	 * non-zero and race, but since we hold the core->cst lock
1066 	 * exclusively all that will happen is that the chain will be
1067 	 * reloaded after we unload it.
1068 	 */
1069 	ostate = ccms_thread_lock_upgrade(&core->cst);
1070 	if (chain->lockcnt) {
1071 		ccms_thread_unlock_upgraded(&core->cst, ostate);
1072 		hammer2_chain_drop(chain);
1073 		return;
1074 	}
1075 
1076 	/*
1077 	 * Shortcut the case if the data is embedded or not resolved.
1078 	 *
1079 	 * Do NOT NULL out chain->data (e.g. inode data), it might be
1080 	 * dirty.
1081 	 */
1082 	if (chain->dio == NULL) {
1083 		if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0)
1084 			hammer2_chain_drop_data(chain, 0);
1085 		ccms_thread_unlock_upgraded(&core->cst, ostate);
1086 		hammer2_chain_drop(chain);
1087 		return;
1088 	}
1089 
1090 	/*
1091 	 * Statistics
1092 	 */
1093 	if (hammer2_io_isdirty(chain->dio) == 0) {
1094 		;
1095 	} else if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
1096 		switch(chain->bref.type) {
1097 		case HAMMER2_BREF_TYPE_DATA:
1098 			counterp = &hammer2_ioa_file_write;
1099 			break;
1100 		case HAMMER2_BREF_TYPE_INODE:
1101 			counterp = &hammer2_ioa_meta_write;
1102 			break;
1103 		case HAMMER2_BREF_TYPE_INDIRECT:
1104 			counterp = &hammer2_ioa_indr_write;
1105 			break;
1106 		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1107 		case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
1108 			counterp = &hammer2_ioa_fmap_write;
1109 			break;
1110 		default:
1111 			counterp = &hammer2_ioa_volu_write;
1112 			break;
1113 		}
1114 		*counterp += chain->bytes;
1115 	} else {
1116 		switch(chain->bref.type) {
1117 		case HAMMER2_BREF_TYPE_DATA:
1118 			counterp = &hammer2_iod_file_write;
1119 			break;
1120 		case HAMMER2_BREF_TYPE_INODE:
1121 			counterp = &hammer2_iod_meta_write;
1122 			break;
1123 		case HAMMER2_BREF_TYPE_INDIRECT:
1124 			counterp = &hammer2_iod_indr_write;
1125 			break;
1126 		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1127 		case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
1128 			counterp = &hammer2_iod_fmap_write;
1129 			break;
1130 		default:
1131 			counterp = &hammer2_iod_volu_write;
1132 			break;
1133 		}
1134 		*counterp += chain->bytes;
1135 	}
1136 
1137 	/*
1138 	 * Clean out the dio.
1139 	 *
1140 	 * If a device buffer was used for data be sure to destroy the
1141 	 * buffer when we are done to avoid aliases (XXX what about the
1142 	 * underlying VM pages?).
1143 	 *
1144 	 * NOTE: Freemap leaf's use reserved blocks and thus no aliasing
1145 	 *	 is possible.
1146 	 *
1147 	 * NOTE: The isdirty check tracks whether we have to bdwrite() the
1148 	 *	 buffer or not.  The buffer might already be dirty.  The
1149 	 *	 flag is re-set when chain_modify() is called, even if
1150 	 *	 MODIFIED is already set, allowing the OS to retire the
1151 	 *	 buffer independent of a hammer2 flush.
1152 	 */
1153 	chain->data = NULL;
1154 	if ((chain->flags & HAMMER2_CHAIN_IOFLUSH) &&
1155 	    hammer2_io_isdirty(chain->dio)) {
1156 		hammer2_io_bawrite(&chain->dio);
1157 	} else {
1158 		hammer2_io_bqrelse(&chain->dio);
1159 	}
1160 	ccms_thread_unlock_upgraded(&core->cst, ostate);
1161 	hammer2_chain_drop(chain);
1162 }
1163 
1164 /*
1165  * This counts the number of live blockrefs in a block array and
1166  * also calculates the point at which all remaining blockrefs are empty.
1167  * This routine can only be called on a live chain (DUPLICATED flag not set).
1168  *
1169  * NOTE: Flag is not set until after the count is complete, allowing
1170  *	 callers to test the flag without holding the spinlock.
1171  *
1172  * NOTE: If base is NULL the related chain is still in the INITIAL
1173  *	 state and there are no blockrefs to count.
1174  *
1175  * NOTE: live_count may already have some counts accumulated due to
1176  *	 creation and deletion and could even be initially negative.
1177  */
1178 void
1179 hammer2_chain_countbrefs(hammer2_chain_t *chain,
1180 			 hammer2_blockref_t *base, int count)
1181 {
1182 	hammer2_chain_core_t *core = chain->core;
1183 
1184 	KKASSERT((chain->flags & HAMMER2_CHAIN_DUPLICATED) == 0);
1185 
1186 	spin_lock(&core->cst.spin);
1187         if ((core->flags & HAMMER2_CORE_COUNTEDBREFS) == 0) {
1188 		if (base) {
1189 			while (--count >= 0) {
1190 				if (base[count].type)
1191 					break;
1192 			}
1193 			core->live_zero = count + 1;
1194 			while (count >= 0) {
1195 				if (base[count].type)
1196 					atomic_add_int(&core->live_count, 1);
1197 				--count;
1198 			}
1199 		} else {
1200 			core->live_zero = 0;
1201 		}
1202 		/* else do not modify live_count */
1203 		atomic_set_int(&core->flags, HAMMER2_CORE_COUNTEDBREFS);
1204 	}
1205 	spin_unlock(&core->cst.spin);
1206 }
1207 
1208 /*
1209  * Resize the chain's physical storage allocation in-place.  This may
1210  * replace the passed-in chain with a new chain.
1211  *
1212  * Chains can be resized smaller without reallocating the storage.
1213  * Resizing larger will reallocate the storage.
1214  *
1215  * Must be passed an exclusively locked parent and chain, returns a new
1216  * exclusively locked chain at the same index and unlocks the old chain.
1217  * Flushes the buffer if necessary.
1218  *
1219  * This function is mostly used with DATA blocks locked RESOLVE_NEVER in order
1220  * to avoid instantiating a device buffer that conflicts with the vnode
1221  * data buffer.  That is, the passed-in bp is a logical buffer, whereas
1222  * any chain-oriented bp would be a device buffer.
1223  *
1224  * XXX return error if cannot resize.
1225  */
1226 void
1227 hammer2_chain_resize(hammer2_trans_t *trans, hammer2_inode_t *ip,
1228 		     hammer2_chain_t *parent, hammer2_chain_t **chainp,
1229 		     int nradix, int flags)
1230 {
1231 	hammer2_mount_t *hmp;
1232 	hammer2_chain_t *chain;
1233 	size_t obytes;
1234 	size_t nbytes;
1235 
1236 	chain = *chainp;
1237 	hmp = chain->hmp;
1238 
1239 	/*
1240 	 * Only data and indirect blocks can be resized for now.
1241 	 * (The volu root, inodes, and freemap elements use a fixed size).
1242 	 */
1243 	KKASSERT(chain != &hmp->vchain);
1244 	KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA ||
1245 		 chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT);
1246 
1247 	/*
1248 	 * Nothing to do if the element is already the proper size
1249 	 */
1250 	obytes = chain->bytes;
1251 	nbytes = 1U << nradix;
1252 	if (obytes == nbytes)
1253 		return;
1254 
1255 	/*
1256 	 * Delete the old chain and duplicate it at the same (parent, index),
1257 	 * returning a new chain.  This allows the old chain to still be
1258 	 * used by the flush code.  The new chain will be returned in a
1259 	 * modified state.
1260 	 *
1261 	 * The parent does not have to be locked for the delete/duplicate call,
1262 	 * but is in this particular code path.
1263 	 *
1264 	 * NOTE: If we are not crossing a synchronization point the
1265 	 *	 duplication code will simply reuse the existing chain
1266 	 *	 structure.
1267 	 */
1268 	hammer2_chain_delete_duplicate(trans, &chain, 0);
1269 
1270 	/*
1271 	 * Relocate the block, even if making it smaller (because different
1272 	 * block sizes may be in different regions).
1273 	 *
1274 	 * (data blocks only, we aren't copying the storage here).
1275 	 */
1276 	hammer2_freemap_alloc(trans, chain, nbytes);
1277 	chain->bytes = nbytes;
1278 	atomic_clear_int(&chain->flags, HAMMER2_CHAIN_FORCECOW);
1279 	/*ip->delta_dcount += (ssize_t)(nbytes - obytes);*/ /* XXX atomic */
1280 
1281 	/*
1282 	 * For now just support it on DATA chains (and not on indirect
1283 	 * blocks).
1284 	 */
1285 	KKASSERT(chain->dio == NULL);
1286 
1287 	*chainp = chain;
1288 }
1289 
1290 /*
1291  * Set a chain modified, making it read-write and duplicating it if necessary.
1292  * This function will assign a new physical block to the chain if necessary
1293  *
1294  * Duplication of already-modified chains is possible when the modification
1295  * crosses a flush synchronization boundary.
1296  *
1297  * Non-data blocks - The chain should be locked to at least the RESOLVE_MAYBE
1298  *		     level or the COW operation will not work.
1299  *
1300  * Data blocks	   - The chain is usually locked RESOLVE_NEVER so as not to
1301  *		     run the data through the device buffers.
1302  *
1303  * This function may return a different chain than was passed, in which case
1304  * the old chain will be unlocked and the new chain will be locked.
1305  *
1306  * ip->chain may be adjusted by hammer2_chain_modify_ip().
1307  */
1308 hammer2_inode_data_t *
1309 hammer2_chain_modify_ip(hammer2_trans_t *trans, hammer2_inode_t *ip,
1310 			hammer2_chain_t **chainp, int flags)
1311 {
1312 	atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED);
1313 	hammer2_chain_modify(trans, chainp, flags);
1314 	if (ip->chain != *chainp)
1315 		hammer2_inode_repoint(ip, NULL, *chainp);
1316 	if (ip->vp)
1317 		vsetisdirty(ip->vp);
1318 	return(&ip->chain->data->ipdata);
1319 }
1320 
1321 void
1322 hammer2_chain_modify(hammer2_trans_t *trans, hammer2_chain_t **chainp,
1323 		     int flags)
1324 {
1325 	hammer2_mount_t *hmp;
1326 	hammer2_chain_t *chain;
1327 	hammer2_io_t *dio;
1328 	int error;
1329 	int wasinitial;
1330 	char *bdata;
1331 
1332 	chain = *chainp;
1333 	hmp = chain->hmp;
1334 
1335 	KKASSERT(chain->bref.mirror_tid != trans->sync_tid ||
1336 		 (chain->flags & HAMMER2_CHAIN_MODIFIED));
1337 
1338 	/*
1339 	 * data is not optional for freemap chains (we must always be sure
1340 	 * to copy the data on COW storage allocations).
1341 	 */
1342 	if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
1343 	    chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
1344 		KKASSERT((chain->flags & HAMMER2_CHAIN_INITIAL) ||
1345 			 (flags & HAMMER2_MODIFY_OPTDATA) == 0);
1346 	}
1347 
1348 	/*
1349 	 * Determine if a delete-duplicate is needed.
1350 	 *
1351 	 * (a) Modify_tid is part of a prior flush
1352 	 * (b) Transaction is concurrent with a flush (has higher tid)
1353 	 * (c) and chain is not in the initial state (freshly created)
1354 	 * (d) and caller didn't request an in-place modification.
1355 	 *
1356 	 * The freemap and volume header special chains are never D-Dd.
1357 	 */
1358 	if (chain->modify_tid != trans->sync_tid &&	   /* cross boundary */
1359 	    (flags & HAMMER2_MODIFY_INPLACE) == 0) {	   /* from d-d */
1360 		if (chain != &hmp->fchain && chain != &hmp->vchain) {
1361 			KKASSERT((flags & HAMMER2_MODIFY_ASSERTNOCOPY) == 0);
1362 			hammer2_chain_delete_duplicate(trans, chainp, 0);
1363 			chain = *chainp;
1364 		}
1365 	}
1366 
1367 	/*
1368 	 * Data must be resolved if already assigned unless explicitly
1369 	 * flagged otherwise.
1370 	 */
1371 	if (chain->data == NULL && (flags & HAMMER2_MODIFY_OPTDATA) == 0 &&
1372 	    (chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX)) {
1373 		hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
1374 		hammer2_chain_unlock(chain);
1375 	}
1376 
1377 	/*
1378 	 * Otherwise do initial-chain handling.  Set MODIFIED to indicate
1379 	 * that the chain has been modified.  Set FLUSH_CREATE to flush
1380 	 * the new blockref (the D-D set FLUSH_DELETE on the old chain to
1381 	 * delete the old blockref).
1382 	 */
1383 	if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0) {
1384 		atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
1385 		hammer2_chain_ref(chain);
1386 		hammer2_chain_memory_inc(chain->pmp);
1387 	}
1388 	if ((chain->flags & HAMMER2_CHAIN_FLUSH_CREATE) == 0) {
1389 		atomic_set_int(&chain->flags, HAMMER2_CHAIN_FLUSH_CREATE);
1390 		hammer2_chain_ref(chain);
1391 	}
1392 
1393 	/*
1394 	 * The modification or re-modification requires an allocation and
1395 	 * possible COW.
1396 	 *
1397 	 * We normally always allocate new storage here.  If storage exists
1398 	 * and MODIFY_NOREALLOC is passed in, we do not allocate new storage.
1399 	 */
1400 	if (chain != &hmp->vchain && chain != &hmp->fchain) {
1401 		if ((chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX) == 0 ||
1402 		     ((flags & HAMMER2_MODIFY_NOREALLOC) == 0 &&
1403 		      chain->modify_tid != trans->sync_tid)
1404 		) {
1405 			hammer2_freemap_alloc(trans, chain, chain->bytes);
1406 			/* XXX failed allocation */
1407 		} else if (chain->flags & HAMMER2_CHAIN_FORCECOW) {
1408 			hammer2_freemap_alloc(trans, chain, chain->bytes);
1409 			/* XXX failed allocation */
1410 		}
1411 		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_FORCECOW);
1412 	}
1413 
1414 	/*
1415 	 * Update modify_tid.  XXX special-case vchain/fchain because they
1416 	 * are always modified in-place.  Otherwise the chain being modified
1417 	 * must not be part of a future transaction.
1418 	 */
1419 	if (chain == &hmp->vchain || chain == &hmp->fchain) {
1420 		if (chain->modify_tid <= trans->sync_tid)
1421 			chain->modify_tid = trans->sync_tid;
1422 	} else {
1423 		KKASSERT(chain->modify_tid <= trans->sync_tid);
1424 		chain->modify_tid = trans->sync_tid;
1425 	}
1426 
1427 	if ((flags & HAMMER2_MODIFY_NO_MODIFY_TID) == 0)
1428 		chain->bref.modify_tid = trans->sync_tid;
1429 
1430 	/*
1431 	 * Do not COW BREF_TYPE_DATA when OPTDATA is set.  This is because
1432 	 * data modifications are done via the logical buffer cache so COWing
1433 	 * it here would result in unnecessary extra copies (and possibly extra
1434 	 * block reallocations).  The INITIAL flag remains unchanged in this
1435 	 * situation.
1436 	 *
1437 	 * (This is a bit of a hack).
1438 	 */
1439 	if (chain->bref.type == HAMMER2_BREF_TYPE_DATA &&
1440 	    (flags & HAMMER2_MODIFY_OPTDATA)) {
1441 		goto skip2;
1442 	}
1443 
1444 	/*
1445 	 * Clearing the INITIAL flag (for indirect blocks) indicates that
1446 	 * we've processed the uninitialized storage allocation.
1447 	 *
1448 	 * If this flag is already clear we are likely in a copy-on-write
1449 	 * situation but we have to be sure NOT to bzero the storage if
1450 	 * no data is present.
1451 	 */
1452 	if (chain->flags & HAMMER2_CHAIN_INITIAL) {
1453 		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
1454 		wasinitial = 1;
1455 	} else {
1456 		wasinitial = 0;
1457 	}
1458 
1459 	/*
1460 	 * Instantiate data buffer and possibly execute COW operation
1461 	 */
1462 	switch(chain->bref.type) {
1463 	case HAMMER2_BREF_TYPE_VOLUME:
1464 	case HAMMER2_BREF_TYPE_FREEMAP:
1465 		/*
1466 		 * The data is embedded, no copy-on-write operation is
1467 		 * needed.
1468 		 */
1469 		KKASSERT(chain->dio == NULL);
1470 		break;
1471 	case HAMMER2_BREF_TYPE_INODE:
1472 	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
1473 	case HAMMER2_BREF_TYPE_DATA:
1474 	case HAMMER2_BREF_TYPE_INDIRECT:
1475 	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1476 		/*
1477 		 * Perform the copy-on-write operation
1478 		 *
1479 		 * zero-fill or copy-on-write depending on whether
1480 		 * chain->data exists or not and set the dirty state for
1481 		 * the new buffer.  hammer2_io_new() will handle the
1482 		 * zero-fill.
1483 		 */
1484 		KKASSERT(chain != &hmp->vchain && chain != &hmp->fchain);
1485 
1486 		if (wasinitial) {
1487 			error = hammer2_io_new(hmp, chain->bref.data_off,
1488 					       chain->bytes, &dio);
1489 		} else {
1490 			error = hammer2_io_bread(hmp, chain->bref.data_off,
1491 						 chain->bytes, &dio);
1492 		}
1493 		adjreadcounter(&chain->bref, chain->bytes);
1494 		KKASSERT(error == 0);
1495 
1496 		bdata = hammer2_io_data(dio, chain->bref.data_off);
1497 
1498 		if (chain->data) {
1499 			KKASSERT(chain->dio != NULL);
1500 			if (chain->data != (void *)bdata) {
1501 				bcopy(chain->data, bdata, chain->bytes);
1502 			}
1503 		} else if (wasinitial == 0) {
1504 			/*
1505 			 * We have a problem.  We were asked to COW but
1506 			 * we don't have any data to COW with!
1507 			 */
1508 			panic("hammer2_chain_modify: having a COW %p\n",
1509 			      chain);
1510 		}
1511 
1512 		/*
1513 		 * Retire the old buffer, replace with the new
1514 		 */
1515 		if (chain->dio)
1516 			hammer2_io_brelse(&chain->dio);
1517 		chain->data = (void *)bdata;
1518 		chain->dio = dio;
1519 		hammer2_io_setdirty(dio);	/* modified by bcopy above */
1520 		break;
1521 	default:
1522 		panic("hammer2_chain_modify: illegal non-embedded type %d",
1523 		      chain->bref.type);
1524 		break;
1525 
1526 	}
1527 skip2:
1528 	hammer2_chain_setsubmod(trans, chain);
1529 }
1530 
1531 /*
1532  * Mark the volume as having been modified.  This short-cut version
1533  * does not have to lock the volume's chain, which allows the ioctl
1534  * code to make adjustments to connections without deadlocking.  XXX
1535  *
1536  * No ref is made on vchain when flagging it MODIFIED.
1537  */
1538 void
1539 hammer2_modify_volume(hammer2_mount_t *hmp)
1540 {
1541 	hammer2_voldata_lock(hmp);
1542 	hammer2_voldata_unlock(hmp, 1);
1543 }
1544 
1545 /*
1546  * This function returns the chain at the nearest key within the specified
1547  * range with the highest delete_tid.  The core spinlock must be held on
1548  * call and the returned chain will be referenced but not locked.
1549  *
1550  * The returned chain may or may not be in a deleted state.  Note that
1551  * live chains have a delete_tid = MAX_TID.
1552  *
1553  * This function will recurse through chain->rbtree as necessary and will
1554  * return a *key_nextp suitable for iteration.  *key_nextp is only set if
1555  * the iteration value is less than the current value of *key_nextp.
1556  *
1557  * The caller should use (*key_nextp) to calculate the actual range of
1558  * the returned element, which will be (key_beg to *key_nextp - 1), because
1559  * there might be another element which is superior to the returned element
1560  * and overlaps it.
1561  *
1562  * (*key_nextp) can be passed as key_beg in an iteration only while non-NULL
1563  * chains continue to be returned.  On EOF (*key_nextp) may overflow since
1564  * it will wind up being (key_end + 1).
1565  */
1566 struct hammer2_chain_find_info {
1567 	hammer2_chain_t		*best;
1568 	hammer2_key_t		key_beg;
1569 	hammer2_key_t		key_end;
1570 	hammer2_key_t		key_next;
1571 };
1572 
1573 static int hammer2_chain_find_cmp(hammer2_chain_t *child, void *data);
1574 static int hammer2_chain_find_callback(hammer2_chain_t *child, void *data);
1575 
1576 static
1577 hammer2_chain_t *
1578 hammer2_chain_find(hammer2_chain_t *parent, hammer2_key_t *key_nextp,
1579 			  hammer2_key_t key_beg, hammer2_key_t key_end)
1580 {
1581 	struct hammer2_chain_find_info info;
1582 
1583 	info.best = NULL;
1584 	info.key_beg = key_beg;
1585 	info.key_end = key_end;
1586 	info.key_next = *key_nextp;
1587 
1588 	KKASSERT(parent->core->good == 0x1234);
1589 	RB_SCAN(hammer2_chain_tree, &parent->core->rbtree,
1590 		hammer2_chain_find_cmp, hammer2_chain_find_callback,
1591 		&info);
1592 	*key_nextp = info.key_next;
1593 #if 0
1594 	kprintf("chain_find %p %016jx:%016jx next=%016jx\n",
1595 		parent, key_beg, key_end, *key_nextp);
1596 #endif
1597 
1598 	return (info.best);
1599 }
1600 
1601 /*
1602  * Find a deleted chain covering a block table entry.  Be careful to deal
1603  * with the race condition where the block table has been updated but the
1604  * chain has not yet been removed from dbtree (due to multiple parents having
1605  * to be updated).
1606  */
1607 static
1608 hammer2_chain_t *
1609 hammer2_chain_find_deleted(hammer2_chain_t *parent,
1610 			  hammer2_key_t key_beg, hammer2_key_t key_end)
1611 {
1612 	struct hammer2_chain_find_info info;
1613 	hammer2_chain_t *child;
1614 
1615 	info.best = NULL;
1616 	info.key_beg = key_beg;
1617 	info.key_end = key_end;
1618 	info.key_next = 0;
1619 
1620 	KKASSERT(parent->core->good == 0x1234);
1621 	RB_SCAN(hammer2_chain_tree, &parent->core->dbtree,
1622 		hammer2_chain_find_cmp, hammer2_chain_find_callback,
1623 		&info);
1624 	if ((child = info.best) != NULL) {
1625 		if (child->delete_tid <= parent->update_lo)
1626 			child = NULL;
1627 	}
1628 	return child;
1629 }
1630 
1631 static
1632 int
1633 hammer2_chain_find_cmp(hammer2_chain_t *child, void *data)
1634 {
1635 	struct hammer2_chain_find_info *info = data;
1636 	hammer2_key_t child_beg;
1637 	hammer2_key_t child_end;
1638 
1639 	child_beg = child->bref.key;
1640 	child_end = child_beg + ((hammer2_key_t)1 << child->bref.keybits) - 1;
1641 
1642 	if (child_end < info->key_beg)
1643 		return(-1);
1644 	if (child_beg > info->key_end)
1645 		return(1);
1646 	return(0);
1647 }
1648 
1649 static
1650 int
1651 hammer2_chain_find_callback(hammer2_chain_t *child, void *data)
1652 {
1653 	struct hammer2_chain_find_info *info = data;
1654 	hammer2_chain_t *best;
1655 	hammer2_key_t child_end;
1656 
1657 	/*
1658 	 * WARNING! Do not discard DUPLICATED chains, it is possible that
1659 	 *	    we are catching an insertion half-way done.  If a
1660 	 *	    duplicated chain turns out to be the best choice the
1661 	 *	    caller will re-check its flags after locking it.
1662 	 *
1663 	 * WARNING! Layerq is scanned forwards, exact matches should keep
1664 	 *	    the existing info->best.
1665 	 */
1666 	if ((best = info->best) == NULL) {
1667 		/*
1668 		 * No previous best.  Assign best
1669 		 */
1670 		info->best = child;
1671 	} else if (best->bref.key <= info->key_beg &&
1672 		   child->bref.key <= info->key_beg) {
1673 		/*
1674 		 * If our current best is flush with key_beg and child is
1675 		 * also flush with key_beg choose based on delete_tid.
1676 		 *
1677 		 * key_next will automatically be limited to the smaller of
1678 		 * the two end-points.
1679 		 */
1680 		if (child->delete_tid > best->delete_tid)
1681 			info->best = child;
1682 	} else if (child->bref.key < best->bref.key) {
1683 		/*
1684 		 * Child has a nearer key and best is not flush with key_beg.
1685 		 * Truncate key_next to the old best key iff it had a better
1686 		 * delete_tid.
1687 		 */
1688 		info->best = child;
1689 		if (best->delete_tid >= child->delete_tid &&
1690 		    (info->key_next > best->bref.key || info->key_next == 0))
1691 			info->key_next = best->bref.key;
1692 	} else if (child->bref.key == best->bref.key) {
1693 		/*
1694 		 * If our current best is flush with the child then choose
1695 		 * based on delete_tid.
1696 		 *
1697 		 * key_next will automatically be limited to the smaller of
1698 		 * the two end-points.
1699 		 */
1700 		if (child->delete_tid > best->delete_tid)
1701 			info->best = child;
1702 	} else {
1703 		/*
1704 		 * Keep the current best but truncate key_next to the child's
1705 		 * base iff the child has a higher delete_tid.
1706 		 *
1707 		 * key_next will also automatically be limited to the smaller
1708 		 * of the two end-points (probably not necessary for this case
1709 		 * but we do it anyway).
1710 		 */
1711 		if (child->delete_tid >= best->delete_tid &&
1712 		    (info->key_next > child->bref.key || info->key_next == 0))
1713 			info->key_next = child->bref.key;
1714 	}
1715 
1716 	/*
1717 	 * Always truncate key_next based on child's end-of-range.
1718 	 */
1719 	child_end = child->bref.key + ((hammer2_key_t)1 << child->bref.keybits);
1720 	if (child_end && (info->key_next > child_end || info->key_next == 0))
1721 		info->key_next = child_end;
1722 
1723 	return(0);
1724 }
1725 
1726 /*
1727  * Retrieve the specified chain from a media blockref, creating the
1728  * in-memory chain structure which reflects it.  modify_tid will be
1729  * left 0 which forces any modifications to issue a delete-duplicate.
1730  *
1731  * To handle insertion races pass the INSERT_RACE flag along with the
1732  * generation number of the core.  NULL will be returned if the generation
1733  * number changes before we have a chance to insert the chain.  Insert
1734  * races can occur because the parent might be held shared.
1735  *
1736  * Caller must hold the parent locked shared or exclusive since we may
1737  * need the parent's bref array to find our block.
1738  */
1739 hammer2_chain_t *
1740 hammer2_chain_get(hammer2_chain_t *parent, int generation,
1741 		  hammer2_blockref_t *bref)
1742 {
1743 	hammer2_mount_t *hmp = parent->hmp;
1744 	hammer2_chain_core_t *above = parent->core;
1745 	hammer2_chain_t *chain;
1746 	int error;
1747 
1748 	/*
1749 	 * Allocate a chain structure representing the existing media
1750 	 * entry.  Resulting chain has one ref and is not locked.
1751 	 */
1752 	chain = hammer2_chain_alloc(hmp, parent->pmp, NULL, bref);
1753 	hammer2_chain_core_alloc(NULL, chain, NULL);
1754 	/* ref'd chain returned */
1755 
1756 	/*
1757 	 * Set modify_tid and update_lo to the chain's synchronization
1758 	 * point from the media.
1759 	 */
1760 	chain->modify_tid = chain->bref.mirror_tid;
1761 	chain->update_lo = chain->bref.mirror_tid;
1762 	atomic_set_int(&chain->flags, HAMMER2_CHAIN_BMAPPED);
1763 
1764 	/*
1765 	 * Link the chain into its parent.  A spinlock is required to safely
1766 	 * access the RBTREE, and it is possible to collide with another
1767 	 * hammer2_chain_get() operation because the caller might only hold
1768 	 * a shared lock on the parent.
1769 	 */
1770 	KKASSERT(parent->refs > 0);
1771 	error = hammer2_chain_insert(above, NULL, chain,
1772 				     HAMMER2_CHAIN_INSERT_SPIN |
1773 				     HAMMER2_CHAIN_INSERT_RACE,
1774 				     generation);
1775 	if (error) {
1776 		KKASSERT((chain->flags & (HAMMER2_CHAIN_ONRBTREE |
1777 					  HAMMER2_CHAIN_ONDBTREE |
1778 					  HAMMER2_CHAIN_ONDBQ)) == 0);
1779 		kprintf("chain %p get race\n", chain);
1780 		hammer2_chain_drop(chain);
1781 		chain = NULL;
1782 	} else {
1783 		KKASSERT(chain->flags & HAMMER2_CHAIN_ONRBTREE);
1784 	}
1785 
1786 	/*
1787 	 * Return our new chain referenced but not locked, or NULL if
1788 	 * a race occurred.
1789 	 */
1790 	return (chain);
1791 }
1792 
1793 /*
1794  * Lookup initialization/completion API
1795  */
1796 hammer2_chain_t *
1797 hammer2_chain_lookup_init(hammer2_chain_t *parent, int flags)
1798 {
1799 	if (flags & HAMMER2_LOOKUP_SHARED) {
1800 		hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS |
1801 					   HAMMER2_RESOLVE_SHARED);
1802 	} else {
1803 		hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
1804 	}
1805 	return (parent);
1806 }
1807 
1808 void
1809 hammer2_chain_lookup_done(hammer2_chain_t *parent)
1810 {
1811 	if (parent)
1812 		hammer2_chain_unlock(parent);
1813 }
1814 
1815 static
1816 hammer2_chain_t *
1817 hammer2_chain_getparent(hammer2_chain_t **parentp, int how)
1818 {
1819 	hammer2_chain_t *oparent;
1820 	hammer2_chain_t *bparent;
1821 	hammer2_chain_t *nparent;
1822 	hammer2_chain_core_t *above;
1823 
1824 	oparent = *parentp;
1825 	above = oparent->above;
1826 
1827 	spin_lock(&above->cst.spin);
1828 	bparent = TAILQ_FIRST(&above->ownerq);
1829 	hammer2_chain_ref(bparent);
1830 
1831 	/*
1832 	 * Be careful of order, oparent must be unlocked before nparent
1833 	 * is locked below to avoid a deadlock.  We might as well delay its
1834 	 * unlocking until we conveniently no longer have the spinlock (instead
1835 	 * of cycling the spinlock).
1836 	 *
1837 	 * Theoretically our ref on bparent should prevent elements of the
1838 	 * following chain from going away and prevent above from going away,
1839 	 * but we still need the spinlock to safely scan the list.
1840 	 */
1841 	for (;;) {
1842 		nparent = bparent;
1843 		while (nparent->flags & HAMMER2_CHAIN_DUPLICATED)
1844 			nparent = TAILQ_NEXT(nparent, core_entry);
1845 		hammer2_chain_ref(nparent);
1846 		spin_unlock(&above->cst.spin);
1847 
1848 		if (oparent) {
1849 			hammer2_chain_unlock(oparent);
1850 			oparent = NULL;
1851 		}
1852 		hammer2_chain_lock(nparent, how | HAMMER2_RESOLVE_NOREF);
1853 		hammer2_chain_drop(bparent);
1854 
1855 		/*
1856 		 * We might have raced a delete-duplicate.
1857 		 */
1858 		if ((nparent->flags & HAMMER2_CHAIN_DUPLICATED) == 0)
1859 			break;
1860 		bparent = nparent;
1861 		hammer2_chain_ref(bparent);
1862 		hammer2_chain_unlock(nparent);
1863 		spin_lock(&above->cst.spin);
1864 		/* retry */
1865 	}
1866 	*parentp = nparent;
1867 
1868 	return (nparent);
1869 }
1870 
1871 /*
1872  * Locate the first chain whos key range overlaps (key_beg, key_end) inclusive.
1873  * (*parentp) typically points to an inode but can also point to a related
1874  * indirect block and this function will recurse upwards and find the inode
1875  * again.
1876  *
1877  * (*parentp) must be exclusively locked and referenced and can be an inode
1878  * or an existing indirect block within the inode.
1879  *
1880  * On return (*parentp) will be modified to point at the deepest parent chain
1881  * element encountered during the search, as a helper for an insertion or
1882  * deletion.   The new (*parentp) will be locked and referenced and the old
1883  * will be unlocked and dereferenced (no change if they are both the same).
1884  *
1885  * The matching chain will be returned exclusively locked.  If NOLOCK is
1886  * requested the chain will be returned only referenced.
1887  *
1888  * NULL is returned if no match was found, but (*parentp) will still
1889  * potentially be adjusted.
1890  *
1891  * On return (*key_nextp) will point to an iterative value for key_beg.
1892  * (If NULL is returned (*key_nextp) is set to key_end).
1893  *
1894  * This function will also recurse up the chain if the key is not within the
1895  * current parent's range.  (*parentp) can never be set to NULL.  An iteration
1896  * can simply allow (*parentp) to float inside the loop.
1897  *
1898  * NOTE!  chain->data is not always resolved.  By default it will not be
1899  *	  resolved for BREF_TYPE_DATA, FREEMAP_NODE, or FREEMAP_LEAF.  Use
1900  *	  HAMMER2_LOOKUP_ALWAYS to force resolution (but be careful w/
1901  *	  BREF_TYPE_DATA as the device buffer can alias the logical file
1902  *	  buffer).
1903  */
1904 hammer2_chain_t *
1905 hammer2_chain_lookup(hammer2_chain_t **parentp, hammer2_key_t *key_nextp,
1906 		     hammer2_key_t key_beg, hammer2_key_t key_end,
1907 		     int *cache_indexp, int flags)
1908 {
1909 	hammer2_mount_t *hmp;
1910 	hammer2_chain_t *parent;
1911 	hammer2_chain_t *chain;
1912 	hammer2_blockref_t *base;
1913 	hammer2_blockref_t *bref;
1914 	hammer2_blockref_t bcopy;
1915 	hammer2_key_t scan_beg;
1916 	hammer2_key_t scan_end;
1917 	hammer2_chain_core_t *above;
1918 	int count = 0;
1919 	int how_always = HAMMER2_RESOLVE_ALWAYS;
1920 	int how_maybe = HAMMER2_RESOLVE_MAYBE;
1921 	int how;
1922 	int generation;
1923 	int maxloops = 300000;
1924 	int wasdup;
1925 
1926 	if (flags & HAMMER2_LOOKUP_ALWAYS) {
1927 		how_maybe = how_always;
1928 		how = HAMMER2_RESOLVE_ALWAYS;
1929 	} else if (flags & (HAMMER2_LOOKUP_NODATA | HAMMER2_LOOKUP_NOLOCK)) {
1930 		how = HAMMER2_RESOLVE_NEVER;
1931 	} else {
1932 		how = HAMMER2_RESOLVE_MAYBE;
1933 	}
1934 	if (flags & (HAMMER2_LOOKUP_SHARED | HAMMER2_LOOKUP_NOLOCK)) {
1935 		how_maybe |= HAMMER2_RESOLVE_SHARED;
1936 		how_always |= HAMMER2_RESOLVE_SHARED;
1937 		how |= HAMMER2_RESOLVE_SHARED;
1938 	}
1939 
1940 	/*
1941 	 * Recurse (*parentp) upward if necessary until the parent completely
1942 	 * encloses the key range or we hit the inode.
1943 	 *
1944 	 * This function handles races against the flusher doing a delete-
1945 	 * duplicate above us and re-homes the parent to the duplicate in
1946 	 * that case, otherwise we'd wind up recursing down a stale chain.
1947 	 */
1948 	parent = *parentp;
1949 	hmp = parent->hmp;
1950 
1951 	while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
1952 	       parent->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
1953 		scan_beg = parent->bref.key;
1954 		scan_end = scan_beg +
1955 			   ((hammer2_key_t)1 << parent->bref.keybits) - 1;
1956 		if (key_beg >= scan_beg && key_end <= scan_end)
1957 			break;
1958 		parent = hammer2_chain_getparent(parentp, how_maybe);
1959 	}
1960 
1961 again:
1962 	if (--maxloops == 0)
1963 		panic("hammer2_chain_lookup: maxloops");
1964 	/*
1965 	 * Locate the blockref array.  Currently we do a fully associative
1966 	 * search through the array.
1967 	 */
1968 	switch(parent->bref.type) {
1969 	case HAMMER2_BREF_TYPE_INODE:
1970 		/*
1971 		 * Special shortcut for embedded data returns the inode
1972 		 * itself.  Callers must detect this condition and access
1973 		 * the embedded data (the strategy code does this for us).
1974 		 *
1975 		 * This is only applicable to regular files and softlinks.
1976 		 */
1977 		if (parent->data->ipdata.op_flags & HAMMER2_OPFLAG_DIRECTDATA) {
1978 			if (flags & HAMMER2_LOOKUP_NOLOCK)
1979 				hammer2_chain_ref(parent);
1980 			else
1981 				hammer2_chain_lock(parent, how_always);
1982 			*key_nextp = key_end + 1;
1983 			return (parent);
1984 		}
1985 		base = &parent->data->ipdata.u.blockset.blockref[0];
1986 		count = HAMMER2_SET_COUNT;
1987 		break;
1988 	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1989 	case HAMMER2_BREF_TYPE_INDIRECT:
1990 		/*
1991 		 * Handle MATCHIND on the parent
1992 		 */
1993 		if (flags & HAMMER2_LOOKUP_MATCHIND) {
1994 			scan_beg = parent->bref.key;
1995 			scan_end = scan_beg +
1996 			       ((hammer2_key_t)1 << parent->bref.keybits) - 1;
1997 			if (key_beg == scan_beg && key_end == scan_end) {
1998 				chain = parent;
1999 				hammer2_chain_lock(chain, how_maybe);
2000 				*key_nextp = scan_end + 1;
2001 				goto done;
2002 			}
2003 		}
2004 		/*
2005 		 * Optimize indirect blocks in the INITIAL state to avoid
2006 		 * I/O.
2007 		 */
2008 		if (parent->flags & HAMMER2_CHAIN_INITIAL) {
2009 			base = NULL;
2010 		} else {
2011 			if (parent->data == NULL)
2012 				panic("parent->data is NULL");
2013 			base = &parent->data->npdata[0];
2014 		}
2015 		count = parent->bytes / sizeof(hammer2_blockref_t);
2016 		break;
2017 	case HAMMER2_BREF_TYPE_VOLUME:
2018 		base = &hmp->voldata.sroot_blockset.blockref[0];
2019 		count = HAMMER2_SET_COUNT;
2020 		break;
2021 	case HAMMER2_BREF_TYPE_FREEMAP:
2022 		base = &hmp->voldata.freemap_blockset.blockref[0];
2023 		count = HAMMER2_SET_COUNT;
2024 		break;
2025 	default:
2026 		panic("hammer2_chain_lookup: unrecognized blockref type: %d",
2027 		      parent->bref.type);
2028 		base = NULL;	/* safety */
2029 		count = 0;	/* safety */
2030 	}
2031 
2032 	/*
2033 	 * Merged scan to find next candidate.
2034 	 *
2035 	 * hammer2_base_*() functions require the above->live_* fields
2036 	 * to be synchronized.
2037 	 *
2038 	 * We need to hold the spinlock to access the block array and RB tree
2039 	 * and to interlock chain creation.
2040 	 */
2041 	above = parent->core;
2042 	if ((parent->core->flags & HAMMER2_CORE_COUNTEDBREFS) == 0)
2043 		hammer2_chain_countbrefs(parent, base, count);
2044 
2045 	/*
2046 	 * Combined search
2047 	 */
2048 	spin_lock(&above->cst.spin);
2049 	chain = hammer2_combined_find(parent, base, count,
2050 				      cache_indexp, key_nextp,
2051 				      key_beg, key_end,
2052 				      &bref);
2053 	generation = above->generation;
2054 
2055 	/*
2056 	 * Exhausted parent chain, iterate.
2057 	 */
2058 	if (bref == NULL) {
2059 		spin_unlock(&above->cst.spin);
2060 		if (key_beg == key_end)	/* short cut single-key case */
2061 			return (NULL);
2062 		return (hammer2_chain_next(parentp, NULL, key_nextp,
2063 					   key_beg, key_end,
2064 					   cache_indexp, flags));
2065 	}
2066 
2067 	/*
2068 	 * Selected from blockref or in-memory chain.
2069 	 */
2070 	if (chain == NULL) {
2071 		bcopy = *bref;
2072 		spin_unlock(&above->cst.spin);
2073 		chain = hammer2_chain_get(parent, generation,
2074 					  &bcopy);
2075 		if (chain == NULL) {
2076 			kprintf("retry lookup parent %p keys %016jx:%016jx\n",
2077 				parent, key_beg, key_end);
2078 			goto again;
2079 		}
2080 		if (bcmp(&bcopy, bref, sizeof(bcopy))) {
2081 			hammer2_chain_drop(chain);
2082 			goto again;
2083 		}
2084 		wasdup = 0;
2085 	} else {
2086 		hammer2_chain_ref(chain);
2087 		wasdup = ((chain->flags & HAMMER2_CHAIN_DUPLICATED) != 0);
2088 		spin_unlock(&above->cst.spin);
2089 	}
2090 
2091 	/*
2092 	 * chain is referenced but not locked.  We must lock the chain
2093 	 * to obtain definitive DUPLICATED/DELETED state
2094 	 */
2095 	if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
2096 	    chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
2097 		hammer2_chain_lock(chain, how_maybe | HAMMER2_RESOLVE_NOREF);
2098 	} else {
2099 		hammer2_chain_lock(chain, how | HAMMER2_RESOLVE_NOREF);
2100 	}
2101 
2102 	/*
2103 	 * Skip deleted chains (XXX cache 'i' end-of-block-array? XXX)
2104 	 *
2105 	 * NOTE: Chain's key range is not relevant as there might be
2106 	 *	 one-offs within the range that are not deleted.
2107 	 *
2108 	 * NOTE: Lookups can race delete-duplicate because
2109 	 *	 delete-duplicate does not lock the parent's core
2110 	 *	 (they just use the spinlock on the core).  We must
2111 	 *	 check for races by comparing the DUPLICATED flag before
2112 	 *	 releasing the spinlock with the flag after locking the
2113 	 *	 chain.
2114 	 */
2115 	if (chain->flags & HAMMER2_CHAIN_DELETED) {
2116 		hammer2_chain_unlock(chain);
2117 		if ((chain->flags & HAMMER2_CHAIN_DUPLICATED) == 0 || wasdup) {
2118 			key_beg = *key_nextp;
2119 			if (key_beg == 0 || key_beg > key_end)
2120 				return(NULL);
2121 		}
2122 		goto again;
2123 	}
2124 
2125 	/*
2126 	 * If the chain element is an indirect block it becomes the new
2127 	 * parent and we loop on it.  We must maintain our top-down locks
2128 	 * to prevent the flusher from interfering (i.e. doing a
2129 	 * delete-duplicate and leaving us recursing down a deleted chain).
2130 	 *
2131 	 * The parent always has to be locked with at least RESOLVE_MAYBE
2132 	 * so we can access its data.  It might need a fixup if the caller
2133 	 * passed incompatible flags.  Be careful not to cause a deadlock
2134 	 * as a data-load requires an exclusive lock.
2135 	 *
2136 	 * If HAMMER2_LOOKUP_MATCHIND is set and the indirect block's key
2137 	 * range is within the requested key range we return the indirect
2138 	 * block and do NOT loop.  This is usually only used to acquire
2139 	 * freemap nodes.
2140 	 */
2141 	if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
2142 	    chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
2143 		hammer2_chain_unlock(parent);
2144 		*parentp = parent = chain;
2145 		goto again;
2146 	}
2147 done:
2148 	/*
2149 	 * All done, return the chain
2150 	 */
2151 	return (chain);
2152 }
2153 
2154 /*
2155  * After having issued a lookup we can iterate all matching keys.
2156  *
2157  * If chain is non-NULL we continue the iteration from just after it's index.
2158  *
2159  * If chain is NULL we assume the parent was exhausted and continue the
2160  * iteration at the next parent.
2161  *
2162  * parent must be locked on entry and remains locked throughout.  chain's
2163  * lock status must match flags.  Chain is always at least referenced.
2164  *
2165  * WARNING!  The MATCHIND flag does not apply to this function.
2166  */
2167 hammer2_chain_t *
2168 hammer2_chain_next(hammer2_chain_t **parentp, hammer2_chain_t *chain,
2169 		   hammer2_key_t *key_nextp,
2170 		   hammer2_key_t key_beg, hammer2_key_t key_end,
2171 		   int *cache_indexp, int flags)
2172 {
2173 	hammer2_chain_t *parent;
2174 	int how_maybe;
2175 
2176 	/*
2177 	 * Calculate locking flags for upward recursion.
2178 	 */
2179 	how_maybe = HAMMER2_RESOLVE_MAYBE;
2180 	if (flags & (HAMMER2_LOOKUP_SHARED | HAMMER2_LOOKUP_NOLOCK))
2181 		how_maybe |= HAMMER2_RESOLVE_SHARED;
2182 
2183 	parent = *parentp;
2184 
2185 	/*
2186 	 * Calculate the next index and recalculate the parent if necessary.
2187 	 */
2188 	if (chain) {
2189 		key_beg = chain->bref.key +
2190 			  ((hammer2_key_t)1 << chain->bref.keybits);
2191 		if (flags & HAMMER2_LOOKUP_NOLOCK)
2192 			hammer2_chain_drop(chain);
2193 		else
2194 			hammer2_chain_unlock(chain);
2195 
2196 		/*
2197 		 * Any scan where the lookup returned degenerate data embedded
2198 		 * in the inode has an invalid index and must terminate.
2199 		 */
2200 		if (chain == parent)
2201 			return(NULL);
2202 		if (key_beg == 0 || key_beg > key_end)
2203 			return(NULL);
2204 		chain = NULL;
2205 	} else if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT &&
2206 		   parent->bref.type != HAMMER2_BREF_TYPE_FREEMAP_NODE) {
2207 		/*
2208 		 * We reached the end of the iteration.
2209 		 */
2210 		return (NULL);
2211 	} else {
2212 		/*
2213 		 * Continue iteration with next parent unless the current
2214 		 * parent covers the range.
2215 		 */
2216 		key_beg = parent->bref.key +
2217 			  ((hammer2_key_t)1 << parent->bref.keybits);
2218 		if (key_beg == 0 || key_beg > key_end)
2219 			return (NULL);
2220 		parent = hammer2_chain_getparent(parentp, how_maybe);
2221 	}
2222 
2223 	/*
2224 	 * And execute
2225 	 */
2226 	return (hammer2_chain_lookup(parentp, key_nextp,
2227 				     key_beg, key_end,
2228 				     cache_indexp, flags));
2229 }
2230 
2231 /*
2232  * The raw scan function is similar to lookup/next but does not seek to a key.
2233  * Blockrefs are iterated via first_chain = (parent, NULL) and
2234  * next_chain = (parent, chain).
2235  *
2236  * The passed-in parent must be locked and its data resolved.  The returned
2237  * chain will be locked.  Pass chain == NULL to acquire the first sub-chain
2238  * under parent and then iterate with the passed-in chain (which this
2239  * function will unlock).
2240  */
2241 hammer2_chain_t *
2242 hammer2_chain_scan(hammer2_chain_t *parent, hammer2_chain_t *chain,
2243 		   int *cache_indexp, int flags)
2244 {
2245 	hammer2_mount_t *hmp;
2246 	hammer2_blockref_t *base;
2247 	hammer2_blockref_t *bref;
2248 	hammer2_blockref_t bcopy;
2249 	hammer2_chain_core_t *above;
2250 	hammer2_key_t key;
2251 	hammer2_key_t next_key;
2252 	int count = 0;
2253 	int how_always = HAMMER2_RESOLVE_ALWAYS;
2254 	int how_maybe = HAMMER2_RESOLVE_MAYBE;
2255 	int how;
2256 	int generation;
2257 	int maxloops = 300000;
2258 	int wasdup;
2259 
2260 	hmp = parent->hmp;
2261 
2262 	/*
2263 	 * Scan flags borrowed from lookup
2264 	 */
2265 	if (flags & HAMMER2_LOOKUP_ALWAYS) {
2266 		how_maybe = how_always;
2267 		how = HAMMER2_RESOLVE_ALWAYS;
2268 	} else if (flags & (HAMMER2_LOOKUP_NODATA | HAMMER2_LOOKUP_NOLOCK)) {
2269 		how = HAMMER2_RESOLVE_NEVER;
2270 	} else {
2271 		how = HAMMER2_RESOLVE_MAYBE;
2272 	}
2273 	if (flags & (HAMMER2_LOOKUP_SHARED | HAMMER2_LOOKUP_NOLOCK)) {
2274 		how_maybe |= HAMMER2_RESOLVE_SHARED;
2275 		how_always |= HAMMER2_RESOLVE_SHARED;
2276 		how |= HAMMER2_RESOLVE_SHARED;
2277 	}
2278 
2279 	/*
2280 	 * Calculate key to locate first/next element, unlocking the previous
2281 	 * element as we go.  Be careful, the key calculation can overflow.
2282 	 */
2283 	if (chain) {
2284 		key = chain->bref.key +
2285 		      ((hammer2_key_t)1 << chain->bref.keybits);
2286 		hammer2_chain_unlock(chain);
2287 		chain = NULL;
2288 		if (key == 0)
2289 			goto done;
2290 	} else {
2291 		key = 0;
2292 	}
2293 
2294 again:
2295 	if (--maxloops == 0)
2296 		panic("hammer2_chain_scan: maxloops");
2297 	/*
2298 	 * Locate the blockref array.  Currently we do a fully associative
2299 	 * search through the array.
2300 	 */
2301 	switch(parent->bref.type) {
2302 	case HAMMER2_BREF_TYPE_INODE:
2303 		/*
2304 		 * An inode with embedded data has no sub-chains.
2305 		 */
2306 		if (parent->data->ipdata.op_flags & HAMMER2_OPFLAG_DIRECTDATA)
2307 			goto done;
2308 		base = &parent->data->ipdata.u.blockset.blockref[0];
2309 		count = HAMMER2_SET_COUNT;
2310 		break;
2311 	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
2312 	case HAMMER2_BREF_TYPE_INDIRECT:
2313 		/*
2314 		 * Optimize indirect blocks in the INITIAL state to avoid
2315 		 * I/O.
2316 		 */
2317 		if (parent->flags & HAMMER2_CHAIN_INITIAL) {
2318 			base = NULL;
2319 		} else {
2320 			if (parent->data == NULL)
2321 				panic("parent->data is NULL");
2322 			base = &parent->data->npdata[0];
2323 		}
2324 		count = parent->bytes / sizeof(hammer2_blockref_t);
2325 		break;
2326 	case HAMMER2_BREF_TYPE_VOLUME:
2327 		base = &hmp->voldata.sroot_blockset.blockref[0];
2328 		count = HAMMER2_SET_COUNT;
2329 		break;
2330 	case HAMMER2_BREF_TYPE_FREEMAP:
2331 		base = &hmp->voldata.freemap_blockset.blockref[0];
2332 		count = HAMMER2_SET_COUNT;
2333 		break;
2334 	default:
2335 		panic("hammer2_chain_lookup: unrecognized blockref type: %d",
2336 		      parent->bref.type);
2337 		base = NULL;	/* safety */
2338 		count = 0;	/* safety */
2339 	}
2340 
2341 	/*
2342 	 * Merged scan to find next candidate.
2343 	 *
2344 	 * hammer2_base_*() functions require the above->live_* fields
2345 	 * to be synchronized.
2346 	 *
2347 	 * We need to hold the spinlock to access the block array and RB tree
2348 	 * and to interlock chain creation.
2349 	 */
2350 	if ((parent->core->flags & HAMMER2_CORE_COUNTEDBREFS) == 0)
2351 		hammer2_chain_countbrefs(parent, base, count);
2352 
2353 	above = parent->core;
2354 	next_key = 0;
2355 	spin_lock(&above->cst.spin);
2356 	chain = hammer2_combined_find(parent, base, count,
2357 				      cache_indexp, &next_key,
2358 				      key, HAMMER2_MAX_KEY,
2359 				      &bref);
2360 	generation = above->generation;
2361 
2362 	/*
2363 	 * Exhausted parent chain, we're done.
2364 	 */
2365 	if (bref == NULL) {
2366 		spin_unlock(&above->cst.spin);
2367 		KKASSERT(chain == NULL);
2368 		goto done;
2369 	}
2370 
2371 	/*
2372 	 * Selected from blockref or in-memory chain.
2373 	 */
2374 	if (chain == NULL) {
2375 		bcopy = *bref;
2376 		spin_unlock(&above->cst.spin);
2377 		chain = hammer2_chain_get(parent, generation, &bcopy);
2378 		if (chain == NULL) {
2379 			kprintf("retry scan parent %p keys %016jx\n",
2380 				parent, key);
2381 			goto again;
2382 		}
2383 		if (bcmp(&bcopy, bref, sizeof(bcopy))) {
2384 			hammer2_chain_drop(chain);
2385 			chain = NULL;
2386 			goto again;
2387 		}
2388 		wasdup = 0;
2389 	} else {
2390 		hammer2_chain_ref(chain);
2391 		wasdup = ((chain->flags & HAMMER2_CHAIN_DUPLICATED) != 0);
2392 		spin_unlock(&above->cst.spin);
2393 	}
2394 
2395 	/*
2396 	 * chain is referenced but not locked.  We must lock the chain
2397 	 * to obtain definitive DUPLICATED/DELETED state
2398 	 */
2399 	hammer2_chain_lock(chain, how | HAMMER2_RESOLVE_NOREF);
2400 
2401 	/*
2402 	 * Skip deleted chains (XXX cache 'i' end-of-block-array? XXX)
2403 	 *
2404 	 * NOTE: chain's key range is not relevant as there might be
2405 	 *	 one-offs within the range that are not deleted.
2406 	 *
2407 	 * NOTE: XXX this could create problems with scans used in
2408 	 *	 situations other than mount-time recovery.
2409 	 *
2410 	 * NOTE: Lookups can race delete-duplicate because
2411 	 *	 delete-duplicate does not lock the parent's core
2412 	 *	 (they just use the spinlock on the core).  We must
2413 	 *	 check for races by comparing the DUPLICATED flag before
2414 	 *	 releasing the spinlock with the flag after locking the
2415 	 *	 chain.
2416 	 */
2417 	if (chain->flags & HAMMER2_CHAIN_DELETED) {
2418 		hammer2_chain_unlock(chain);
2419 		chain = NULL;
2420 
2421 		if ((chain->flags & HAMMER2_CHAIN_DUPLICATED) == 0 || wasdup) {
2422 			key = next_key;
2423 			if (key == 0)
2424 				goto done;
2425 		}
2426 		goto again;
2427 	}
2428 
2429 done:
2430 	/*
2431 	 * All done, return the chain or NULL
2432 	 */
2433 	return (chain);
2434 }
2435 
2436 /*
2437  * Create and return a new hammer2 system memory structure of the specified
2438  * key, type and size and insert it under (*parentp).  This is a full
2439  * insertion, based on the supplied key/keybits, and may involve creating
2440  * indirect blocks and moving other chains around via delete/duplicate.
2441  *
2442  * THE CALLER MUST HAVE ALREADY PROPERLY SEEKED (*parentp) TO THE INSERTION
2443  * POINT SANS ANY REQUIRED INDIRECT BLOCK CREATIONS DUE TO THE ARRAY BEING
2444  * FULL.  This typically means that the caller is creating the chain after
2445  * doing a hammer2_chain_lookup().
2446  *
2447  * (*parentp) must be exclusive locked and may be replaced on return
2448  * depending on how much work the function had to do.
2449  *
2450  * (*chainp) usually starts out NULL and returns the newly created chain,
2451  * but if the caller desires the caller may allocate a disconnected chain
2452  * and pass it in instead.  (It is also possible for the caller to use
2453  * chain_duplicate() to create a disconnected chain, manipulate it, then
2454  * pass it into this function to insert it).
2455  *
2456  * This function should NOT be used to insert INDIRECT blocks.  It is
2457  * typically used to create/insert inodes and data blocks.
2458  *
2459  * Caller must pass-in an exclusively locked parent the new chain is to
2460  * be inserted under, and optionally pass-in a disconnected, exclusively
2461  * locked chain to insert (else we create a new chain).  The function will
2462  * adjust (*parentp) as necessary, create or connect the chain, and
2463  * return an exclusively locked chain in *chainp.
2464  */
2465 int
2466 hammer2_chain_create(hammer2_trans_t *trans, hammer2_chain_t **parentp,
2467 		     hammer2_chain_t **chainp,
2468 		     hammer2_key_t key, int keybits, int type, size_t bytes)
2469 {
2470 	hammer2_mount_t *hmp;
2471 	hammer2_chain_t *chain;
2472 	hammer2_chain_t *parent = *parentp;
2473 	hammer2_chain_core_t *above;
2474 	hammer2_blockref_t *base;
2475 	hammer2_blockref_t dummy;
2476 	int allocated = 0;
2477 	int error = 0;
2478 	int count;
2479 	int maxloops = 300000;
2480 
2481 	above = parent->core;
2482 	KKASSERT(ccms_thread_lock_owned(&above->cst));
2483 	hmp = parent->hmp;
2484 	chain = *chainp;
2485 
2486 	if (chain == NULL) {
2487 		/*
2488 		 * First allocate media space and construct the dummy bref,
2489 		 * then allocate the in-memory chain structure.  Set the
2490 		 * INITIAL flag for fresh chains which do not have embedded
2491 		 * data.
2492 		 */
2493 		bzero(&dummy, sizeof(dummy));
2494 		dummy.type = type;
2495 		dummy.key = key;
2496 		dummy.keybits = keybits;
2497 		dummy.data_off = hammer2_getradix(bytes);
2498 		dummy.methods = parent->bref.methods;
2499 		chain = hammer2_chain_alloc(hmp, parent->pmp, trans, &dummy);
2500 		hammer2_chain_core_alloc(trans, chain, NULL);
2501 
2502 		/*
2503 		 * Lock the chain manually, chain_lock will load the chain
2504 		 * which we do NOT want to do.  (note: chain->refs is set
2505 		 * to 1 by chain_alloc() for us, but lockcnt is not).
2506 		 */
2507 		chain->lockcnt = 1;
2508 		ccms_thread_lock(&chain->core->cst, CCMS_STATE_EXCLUSIVE);
2509 		allocated = 1;
2510 
2511 		/*
2512 		 * We do NOT set INITIAL here (yet).  INITIAL is only
2513 		 * used for indirect blocks.
2514 		 *
2515 		 * Recalculate bytes to reflect the actual media block
2516 		 * allocation.
2517 		 */
2518 		bytes = (hammer2_off_t)1 <<
2519 			(int)(chain->bref.data_off & HAMMER2_OFF_MASK_RADIX);
2520 		chain->bytes = bytes;
2521 
2522 		switch(type) {
2523 		case HAMMER2_BREF_TYPE_VOLUME:
2524 		case HAMMER2_BREF_TYPE_FREEMAP:
2525 			panic("hammer2_chain_create: called with volume type");
2526 			break;
2527 		case HAMMER2_BREF_TYPE_INDIRECT:
2528 			panic("hammer2_chain_create: cannot be used to"
2529 			      "create indirect block");
2530 			break;
2531 		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
2532 			panic("hammer2_chain_create: cannot be used to"
2533 			      "create freemap root or node");
2534 			break;
2535 		case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
2536 			KKASSERT(bytes == sizeof(chain->data->bmdata));
2537 			/* fall through */
2538 		case HAMMER2_BREF_TYPE_INODE:
2539 		case HAMMER2_BREF_TYPE_DATA:
2540 		default:
2541 			/*
2542 			 * leave chain->data NULL, set INITIAL
2543 			 */
2544 			KKASSERT(chain->data == NULL);
2545 			atomic_set_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
2546 			break;
2547 		}
2548 	} else {
2549 		/*
2550 		 * We are reattaching a chain that has been duplicated and
2551 		 * left disconnected under a DIFFERENT parent with potentially
2552 		 * different key/keybits.
2553 		 *
2554 		 * The chain must be modified in the current transaction
2555 		 * (the duplication code should have done that for us),
2556 		 * and it's modify_tid should be greater than the parent's
2557 		 * bref.mirror_tid.  This should cause it to be created under
2558 		 * the new parent.
2559 		 *
2560 		 * If deleted in the same transaction, the create/delete TIDs
2561 		 * will be the same and effective the chain will not have
2562 		 * existed at all from the point of view of the parent.
2563 		 *
2564 		 * Do NOT mess with the current state of the INITIAL flag.
2565 		 */
2566 		KKASSERT(chain->modify_tid == trans->sync_tid);
2567 		chain->bref.key = key;
2568 		chain->bref.keybits = keybits;
2569 		KKASSERT(chain->above == NULL);
2570 	}
2571 
2572 	/*
2573 	 * Calculate how many entries we have in the blockref array and
2574 	 * determine if an indirect block is required.
2575 	 */
2576 again:
2577 	if (--maxloops == 0)
2578 		panic("hammer2_chain_create: maxloops");
2579 	above = parent->core;
2580 
2581 	switch(parent->bref.type) {
2582 	case HAMMER2_BREF_TYPE_INODE:
2583 		KKASSERT((parent->data->ipdata.op_flags &
2584 			  HAMMER2_OPFLAG_DIRECTDATA) == 0);
2585 		KKASSERT(parent->data != NULL);
2586 		base = &parent->data->ipdata.u.blockset.blockref[0];
2587 		count = HAMMER2_SET_COUNT;
2588 		break;
2589 	case HAMMER2_BREF_TYPE_INDIRECT:
2590 	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
2591 		if (parent->flags & HAMMER2_CHAIN_INITIAL)
2592 			base = NULL;
2593 		else
2594 			base = &parent->data->npdata[0];
2595 		count = parent->bytes / sizeof(hammer2_blockref_t);
2596 		break;
2597 	case HAMMER2_BREF_TYPE_VOLUME:
2598 		KKASSERT(parent->data != NULL);
2599 		base = &hmp->voldata.sroot_blockset.blockref[0];
2600 		count = HAMMER2_SET_COUNT;
2601 		break;
2602 	case HAMMER2_BREF_TYPE_FREEMAP:
2603 		KKASSERT(parent->data != NULL);
2604 		base = &hmp->voldata.freemap_blockset.blockref[0];
2605 		count = HAMMER2_SET_COUNT;
2606 		break;
2607 	default:
2608 		panic("hammer2_chain_create: unrecognized blockref type: %d",
2609 		      parent->bref.type);
2610 		base = NULL;
2611 		count = 0;
2612 		break;
2613 	}
2614 
2615 	/*
2616 	 * Make sure we've counted the brefs
2617 	 */
2618 	if ((parent->core->flags & HAMMER2_CORE_COUNTEDBREFS) == 0)
2619 		hammer2_chain_countbrefs(parent, base, count);
2620 
2621 	KKASSERT(above->live_count >= 0 && above->live_count <= count);
2622 
2623 	/*
2624 	 * If no free blockref could be found we must create an indirect
2625 	 * block and move a number of blockrefs into it.  With the parent
2626 	 * locked we can safely lock each child in order to delete+duplicate
2627 	 * it without causing a deadlock.
2628 	 *
2629 	 * This may return the new indirect block or the old parent depending
2630 	 * on where the key falls.  NULL is returned on error.
2631 	 */
2632 	if (above->live_count == count) {
2633 		hammer2_chain_t *nparent;
2634 
2635 		nparent = hammer2_chain_create_indirect(trans, parent,
2636 							key, keybits,
2637 							type, &error);
2638 		if (nparent == NULL) {
2639 			if (allocated)
2640 				hammer2_chain_drop(chain);
2641 			chain = NULL;
2642 			goto done;
2643 		}
2644 		if (parent != nparent) {
2645 			hammer2_chain_unlock(parent);
2646 			parent = *parentp = nparent;
2647 		}
2648 		goto again;
2649 	}
2650 
2651 	/*
2652 	 * Link the chain into its parent.
2653 	 */
2654 	if (chain->above != NULL)
2655 		panic("hammer2: hammer2_chain_create: chain already connected");
2656 	KKASSERT(chain->above == NULL);
2657 	hammer2_chain_insert(above, NULL, chain,
2658 			     HAMMER2_CHAIN_INSERT_SPIN |
2659 			     HAMMER2_CHAIN_INSERT_LIVE,
2660 			     0);
2661 
2662 	if (allocated) {
2663 		/*
2664 		 * Mark the newly created chain modified.  This will cause
2665 		 * FLUSH_CREATE to be set.
2666 		 *
2667 		 * Device buffers are not instantiated for DATA elements
2668 		 * as these are handled by logical buffers.
2669 		 *
2670 		 * Indirect and freemap node indirect blocks are handled
2671 		 * by hammer2_chain_create_indirect() and not by this
2672 		 * function.
2673 		 *
2674 		 * Data for all other bref types is expected to be
2675 		 * instantiated (INODE, LEAF).
2676 		 */
2677 		switch(chain->bref.type) {
2678 		case HAMMER2_BREF_TYPE_DATA:
2679 		case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
2680 		case HAMMER2_BREF_TYPE_INODE:
2681 			hammer2_chain_modify(trans, &chain,
2682 					     HAMMER2_MODIFY_OPTDATA |
2683 					     HAMMER2_MODIFY_ASSERTNOCOPY);
2684 			break;
2685 		default:
2686 			/*
2687 			 * Remaining types are not supported by this function.
2688 			 * In particular, INDIRECT and LEAF_NODE types are
2689 			 * handled by create_indirect().
2690 			 */
2691 			panic("hammer2_chain_create: bad type: %d",
2692 			      chain->bref.type);
2693 			/* NOT REACHED */
2694 			break;
2695 		}
2696 	} else {
2697 		/*
2698 		 * When reconnecting a chain we must set FLUSH_CREATE and
2699 		 * setsubmod so the flush recognizes that it must update
2700 		 * the bref in the parent.
2701 		 */
2702 		if ((chain->flags & HAMMER2_CHAIN_FLUSH_CREATE) == 0) {
2703 			hammer2_chain_ref(chain);
2704 			atomic_set_int(&chain->flags,
2705 				       HAMMER2_CHAIN_FLUSH_CREATE);
2706 		}
2707 	}
2708 	hammer2_chain_setsubmod(trans, chain);
2709 
2710 done:
2711 	*chainp = chain;
2712 
2713 	return (error);
2714 }
2715 
2716 /*
2717  * Replace (*chainp) with a duplicate in-memory chain structure which shares
2718  * the same core and media state as the orignal.  The original *chainp is
2719  * unlocked and the replacement will be returned locked.  The duplicated
2720  * chain is inserted under (*parentp).
2721  *
2722  * THE CALLER MUST HAVE ALREADY PROPERLY SEEKED (*parentp) TO THE INSERTION
2723  * POINT SANS ANY REQUIRED INDIRECT BLOCK CREATIONS DUE TO THE ARRAY BEING
2724  * FULL.  This typically means that the caller is creating the chain after
2725  * doing a hammer2_chain_lookup().
2726  *
2727  * The old chain must be in a DELETED state unless snapshot is non-zero.
2728  *
2729  * The new chain will be live (i.e. not deleted), and modified.
2730  *
2731  * If (parent) is non-NULL then the new duplicated chain is inserted under
2732  * the parent.
2733  *
2734  * If (parent) is NULL then the newly duplicated chain is not inserted
2735  * anywhere, similar to if it had just been chain_alloc()'d (suitable for
2736  * passing into hammer2_chain_create() after this function returns).
2737  *
2738  * WARNING! This function cannot take snapshots all by itself.  The caller
2739  *	    needs to do other massaging for snapshots.
2740  *
2741  * WARNING! This function calls create which means it can insert indirect
2742  *	    blocks.  Callers may have to refactor locked chains held across
2743  *	    the call (other than the ones passed into the call).
2744  */
2745 void
2746 hammer2_chain_duplicate(hammer2_trans_t *trans, hammer2_chain_t **parentp,
2747 			hammer2_chain_t **chainp, hammer2_blockref_t *bref,
2748 			int snapshot, int duplicate_reason)
2749 {
2750 	hammer2_mount_t *hmp;
2751 	hammer2_chain_t *parent;
2752 	hammer2_chain_t *ochain;
2753 	hammer2_chain_t *nchain;
2754 	hammer2_chain_core_t *above;
2755 	size_t bytes;
2756 
2757 	/*
2758 	 * We want nchain to be our go-to live chain, but ochain may be in
2759 	 * a MODIFIED state within the current flush synchronization segment.
2760 	 * Force any further modifications of ochain to do another COW
2761 	 * operation even if modify_tid indicates that one is not needed.
2762 	 *
2763 	 * We don't want to set FORCECOW on nchain simply as an optimization,
2764 	 * as many duplication calls simply move chains into ichains and
2765 	 * then delete the original.
2766 	 *
2767 	 * WARNING!  We should never resolve DATA to device buffers
2768 	 *	     (XXX allow it if the caller did?), and since
2769 	 *	     we currently do not have the logical buffer cache
2770 	 *	     buffer in-hand to fix its cached physical offset
2771 	 *	     we also force the modify code to not COW it. XXX
2772 	 */
2773 	ochain = *chainp;
2774 	hmp = ochain->hmp;
2775 	KKASSERT(snapshot == 1 || (ochain->flags & HAMMER2_CHAIN_DELETED));
2776 
2777 	/*
2778 	 * Now create a duplicate of the chain structure, associating
2779 	 * it with the same core, making it the same size, pointing it
2780 	 * to the same bref (the same media block).
2781 	 *
2782 	 * Give nchain the same modify_tid that we previously ensured was
2783 	 * sufficiently advanced to trigger a block table insertion on flush.
2784 	 *
2785 	 * nchain copies ochain's data and must inherit ochain->update_lo.
2786 	 *
2787 	 * NOTE: bref.mirror_tid duplicated by virtue of bref copy in
2788 	 *	 hammer2_chain_alloc()
2789 	 */
2790 	if (bref == NULL)
2791 		bref = &ochain->bref;
2792 	if (snapshot) {
2793 		nchain = hammer2_chain_alloc(hmp, NULL, trans, bref);
2794 		atomic_set_int(&nchain->flags, HAMMER2_CHAIN_SNAPSHOT);
2795 	} else {
2796 		nchain = hammer2_chain_alloc(hmp, ochain->pmp, trans, bref);
2797 	}
2798 	hammer2_chain_core_alloc(trans, nchain, ochain);
2799 	bytes = (hammer2_off_t)1 <<
2800 		(int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
2801 	nchain->bytes = bytes;
2802 	nchain->modify_tid = ochain->modify_tid;
2803 	nchain->update_lo = ochain->update_lo;
2804 	nchain->inode_reason = ochain->inode_reason + 0x100000;
2805 	atomic_set_int(&nchain->flags,
2806 		       ochain->flags & (HAMMER2_CHAIN_INITIAL |
2807 					HAMMER2_CHAIN_FORCECOW |
2808 					HAMMER2_CHAIN_UNLINKED));
2809 	if (ochain->modify_tid == trans->sync_tid)
2810 		atomic_set_int(&ochain->flags, HAMMER2_CHAIN_FORCECOW);
2811 
2812 	/*
2813 	 * Switch from ochain to nchain
2814 	 */
2815 	hammer2_chain_lock(nchain, HAMMER2_RESOLVE_NEVER |
2816 				   HAMMER2_RESOLVE_NOREF);
2817 	/* nchain has 1 ref */
2818 	hammer2_chain_unlock(ochain);
2819 
2820 	/*
2821 	 * Place nchain in the modified state, instantiate media data
2822 	 * if necessary.  Because modify_tid is already completely
2823 	 * synchronized this should not result in a delete-duplicate.
2824 	 *
2825 	 * We want nchain at the target to look like a new insertion.
2826 	 * Forcing the modification to be INPLACE accomplishes this
2827 	 * because we get the same nchain with an updated modify_tid.
2828 	 */
2829 	if (nchain->bref.type == HAMMER2_BREF_TYPE_DATA) {
2830 		hammer2_chain_modify(trans, &nchain,
2831 				     HAMMER2_MODIFY_OPTDATA |
2832 				     HAMMER2_MODIFY_NOREALLOC |
2833 				     HAMMER2_MODIFY_INPLACE);
2834 	} else if (nchain->flags & HAMMER2_CHAIN_INITIAL) {
2835 		hammer2_chain_modify(trans, &nchain,
2836 				     HAMMER2_MODIFY_OPTDATA |
2837 				     HAMMER2_MODIFY_INPLACE);
2838 	} else {
2839 		hammer2_chain_modify(trans, &nchain,
2840 				     HAMMER2_MODIFY_INPLACE);
2841 	}
2842 
2843 	/*
2844 	 * If parent is not NULL the duplicated chain will be entered under
2845 	 * the parent and the FLUSH_CREATE bit set to tell flush to update
2846 	 * the blockref.
2847 	 *
2848 	 * Having both chains locked is extremely important for atomicy.
2849 	 */
2850 	if (parentp && (parent = *parentp) != NULL) {
2851 		above = parent->core;
2852 		KKASSERT(ccms_thread_lock_owned(&above->cst));
2853 		KKASSERT((nchain->flags & HAMMER2_CHAIN_DELETED) == 0);
2854 		KKASSERT(parent->refs > 0);
2855 
2856 		hammer2_chain_create(trans, parentp, &nchain,
2857 				     nchain->bref.key, nchain->bref.keybits,
2858 				     nchain->bref.type, nchain->bytes);
2859 		parent = NULL;
2860 
2861 		KKASSERT(nchain->flags & HAMMER2_CHAIN_FLUSH_CREATE);
2862 		hammer2_chain_setsubmod(trans, nchain);
2863 	}
2864 
2865 	*chainp = nchain;
2866 }
2867 
2868 /*
2869  * Helper function for deleting chains.
2870  *
2871  * The chain is removed from the live view (the RBTREE).
2872  *
2873  * If appropriate, the chain is added to the shadow topology and FLUSH_DELETE
2874  * is set for flusher visbility.  The caller is responsible for calling
2875  * setsubmod on chain, so we do not adjust update_hi here.
2876  */
2877 static void
2878 _hammer2_chain_delete_helper(hammer2_trans_t *trans,
2879 			     hammer2_chain_core_t *above,
2880 			     hammer2_chain_t *chain)
2881 {
2882 	hammer2_mount_t *hmp;
2883 	hammer2_chain_t *xchain;
2884 
2885 	KKASSERT(chain->flags & HAMMER2_CHAIN_ONRBTREE);
2886 	KKASSERT(trans->sync_tid >= chain->modify_tid);
2887 	KKASSERT((chain->flags & (HAMMER2_CHAIN_DELETED |
2888 				  HAMMER2_CHAIN_ONDBQ |
2889 				  HAMMER2_CHAIN_ONDBTREE |
2890 				  HAMMER2_CHAIN_FLUSH_DELETE)) == 0);
2891 
2892 	/*
2893 	 * Flag as deleted, reduce live_count and bump the above core's
2894 	 * generation.
2895 	 */
2896 	chain->delete_tid = trans->sync_tid;
2897 	atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
2898 	atomic_add_int(&above->live_count, -1);
2899 	++above->generation;
2900 	hmp = chain->hmp;
2901 
2902 	/*
2903 	 * Remove from live tree
2904 	 */
2905 	RB_REMOVE(hammer2_chain_tree, &above->rbtree, chain);
2906 	atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
2907 
2908 	if (chain->flags & HAMMER2_CHAIN_BMAPPED) {
2909 		/*
2910 		 * If the chain was originally bmapped we must place on the
2911 		 * deleted tree and set FLUSH_DELETE (+ref) to prevent
2912 		 * destruction of the chain until the flush can reconcile
2913 		 * the parent's block table.
2914 		 *
2915 		 * NOTE! DBTREE is only representitive of the live view,
2916 		 *	 the flush must check both DBTREE and DBQ.
2917 		 */
2918 		xchain = RB_INSERT(hammer2_chain_tree, &above->dbtree, chain);
2919 		KKASSERT(xchain == NULL);
2920 		atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONDBTREE);
2921 
2922 		atomic_set_int(&chain->flags, HAMMER2_CHAIN_FLUSH_DELETE);
2923 		hammer2_chain_ref(chain);
2924 	} else {
2925 		/*
2926 		 * If the chain no longer (and never had) an actual blockmap
2927 		 * entry we must place it on the dbq list and set FLUSH_DELETE
2928 		 * (+ref) to prevent destruction of the chain until the flush
2929 		 * can reconcile the parent's block table.
2930 		 *
2931 		 * NOTE! DBTREE is only representitive of the live view,
2932 		 *	 the flush must check both DBTREE and DBQ.
2933 		 */
2934 		TAILQ_INSERT_TAIL(&above->dbq, chain, db_entry);
2935 		atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONDBQ);
2936 
2937 		atomic_set_int(&chain->flags, HAMMER2_CHAIN_FLUSH_DELETE);
2938 		hammer2_chain_ref(chain);
2939 	}
2940 }
2941 
2942 /*
2943  * Special in-place delete-duplicate sequence which does not require a
2944  * locked parent.  (*chainp) is marked DELETED and atomically replaced
2945  * with a duplicate.  Atomicy is at the very-fine spin-lock level in
2946  * order to ensure that lookups do not race us.
2947  *
2948  * The flush code will sometimes call this function with a deleted chain.
2949  * In this situation the old chain's memory is reallocated without
2950  * duplicating it.
2951  *
2952  * The new chain will be marked modified for the current transaction.
2953  */
2954 void
2955 hammer2_chain_delete_duplicate(hammer2_trans_t *trans, hammer2_chain_t **chainp,
2956 			       int flags)
2957 {
2958 	hammer2_mount_t *hmp;
2959 	hammer2_chain_t *ochain;
2960 	hammer2_chain_t *nchain;
2961 	hammer2_chain_core_t *above;
2962 	size_t bytes;
2963 
2964 	if (hammer2_debug & 0x20000)
2965 		Debugger("dd");
2966 
2967 	/*
2968 	 * Note that we do not have to call setsubmod on ochain, calling it
2969 	 * on nchain is sufficient.
2970 	 */
2971 	ochain = *chainp;
2972 	hmp = ochain->hmp;
2973 
2974 	if (ochain->bref.type == HAMMER2_BREF_TYPE_INODE) {
2975 		KKASSERT(ochain->data);
2976 	}
2977 
2978 	/*
2979 	 * First create a duplicate of the chain structure.
2980 	 * (nchain is allocated with one ref).
2981 	 *
2982 	 * In the case where nchain inherits ochains core, nchain is
2983 	 * effectively locked due to ochain being locked (and sharing the
2984 	 * core), until we can give nchain its own official ock.
2985 	 *
2986 	 * WARNING! Flusher concurrency can create two cases.  The first is
2987 	 *	    that the flusher might be working on a chain that has
2988 	 *	    been deleted in the live view but is live in the flusher's
2989 	 *	    view.  In the second case the flusher may be duplicating
2990 	 *	    a forward-transacted chain.  In both situations nchain
2991 	 *	    must be marked deleted.
2992 	 *
2993 	 * WARNING! hammer2_chain_core_alloc() also acts on these issues.
2994 	 */
2995 	nchain = hammer2_chain_alloc(hmp, ochain->pmp, trans, &ochain->bref);
2996 	if ((ochain->flags & HAMMER2_CHAIN_DELETED) ||
2997 	    (ochain->modify_tid > trans->sync_tid)) {
2998 		atomic_set_int(&nchain->flags, HAMMER2_CHAIN_DELETED);
2999 	}
3000 	if (flags & HAMMER2_DELDUP_RECORE)
3001 		hammer2_chain_core_alloc(trans, nchain, NULL);
3002 	else
3003 		hammer2_chain_core_alloc(trans, nchain, ochain);
3004 	above = ochain->above;
3005 
3006 	bytes = (hammer2_off_t)1 <<
3007 		(int)(ochain->bref.data_off & HAMMER2_OFF_MASK_RADIX);
3008 	nchain->bytes = bytes;
3009 
3010 	/*
3011 	 * nchain inherits ochain's live state including its modification
3012 	 * state.  This function disposes of the original.  Because we are
3013 	 * doing this in-place under the same parent the block array
3014 	 * inserted/deleted state does not change.
3015 	 *
3016 	 * nchain copies ochain's data and must inherit ochain->update_lo.
3017 	 *
3018 	 * If ochain was previously marked FORCECOW we also flag nchain
3019 	 * FORCECOW (used during hardlink splits).  FORCECOW forces a
3020 	 * reallocation of the block when we modify the chain a little later,
3021 	 * it does not force another delete-duplicate.
3022 	 *
3023 	 * NOTE: bref.mirror_tid duplicated by virtue of bref copy in
3024 	 *	 hammer2_chain_alloc()
3025 	 */
3026 	nchain->data_count += ochain->data_count;
3027 	nchain->inode_count += ochain->inode_count;
3028 	atomic_set_int(&nchain->flags,
3029 		       ochain->flags & (HAMMER2_CHAIN_INITIAL |
3030 					HAMMER2_CHAIN_FORCECOW |
3031 					HAMMER2_CHAIN_UNLINKED));
3032 	if (ochain->modify_tid == trans->sync_tid)
3033 		atomic_set_int(&ochain->flags, HAMMER2_CHAIN_FORCECOW);
3034 	nchain->inode_reason = ochain->inode_reason + 0x1000;
3035 	nchain->update_lo = ochain->update_lo;
3036 	nchain->dsrc = ochain->bref;			/* DEBUG */
3037 	nchain->dsrc_dupfromat = trans->sync_tid;	/* DEBUG */
3038 	nchain->dsrc_dupfromflags = trans->flags;	/* DEBUG */
3039 	nchain->dsrc_reason = ochain->inode_reason;	/* DEBUG */
3040 	nchain->dsrc_ninserts = ochain->ninserts;	/* DEBUG */
3041 	nchain->dsrc_flags = ochain->flags;		/* DEBUG */
3042 	nchain->dsrc_modify = ochain->modify_tid;	/* DEBUG */
3043 	nchain->dsrc_delete = ochain->delete_tid;	/* DEBUG */
3044 	nchain->dsrc_update_lo = ochain->update_lo;	/* DEBUG */
3045 	nchain->dsrc_original = ochain;			/* DEBUG */
3046 
3047 	/*
3048 	 * Lock nchain so both chains are now locked (extremely important
3049 	 * for atomicy).  The shared core allows us to unlock ochain without
3050 	 * actually unlocking ochain.
3051 	 */
3052 	hammer2_chain_lock(nchain, HAMMER2_RESOLVE_NEVER);
3053 	/* extra ref still present from original allocation */
3054 
3055 	KKASSERT(ochain->flags & (HAMMER2_CHAIN_ONRBTREE |
3056 				  HAMMER2_CHAIN_ONDBTREE |
3057 				  HAMMER2_CHAIN_ONDBQ));
3058 	spin_lock(&above->cst.spin);
3059 
3060 	nchain->modify_tid = ochain->modify_tid;
3061 	nchain->delete_tid = HAMMER2_MAX_TID;
3062 
3063 	if (nchain->flags & HAMMER2_CHAIN_DELETED) {
3064 		/*
3065 		 * Special case, used by the flush code only in two cases:
3066 		 *
3067 		 * (1) The flush must operate on a chain that is visible to
3068 		 *     the flush but deleted in the live view.
3069 		 *
3070 		 * (2) The flush must operate on a forward-indexed chain in
3071 		 *     the live view, which typically
3072 		 *
3073 		 * In these situations nchain will be marked deleted and
3074 		 * insert before ochain.  nchain must inherit certain features
3075 		 * of ochain such as the BMAPPED state.
3076 		 */
3077 		KKASSERT(trans->flags & HAMMER2_TRANS_ISFLUSH);
3078 		KKASSERT(ochain->modify_tid < trans->sync_tid);
3079 		KKASSERT(ochain->delete_tid > trans->sync_tid);
3080 		atomic_set_int(&nchain->flags, HAMMER2_CHAIN_FLUSH_TEMPORARY);
3081 		hammer2_chain_insert(above, ochain, nchain, 0, 0);
3082 
3083 		if ((ochain->flags & HAMMER2_CHAIN_DELETED) &&
3084 		    ochain->modify_tid < trans->sync_tid) {
3085 			nchain->delete_tid = ochain->delete_tid;
3086 			ochain->delete_tid = trans->sync_tid;
3087 		} else if (ochain->modify_tid > trans->sync_tid) {
3088 			nchain->delete_tid = ochain->modify_tid;
3089 		}
3090 	} else {
3091 		/*
3092 		 * Normal case, delete-duplicate deletes ochain and nchain
3093 		 * is the new live chain.
3094 		 */
3095 		_hammer2_chain_delete_helper(trans, above, ochain);
3096 		hammer2_chain_insert(above, ochain, nchain,
3097 				     HAMMER2_CHAIN_INSERT_LIVE, 0);
3098 	}
3099 	spin_unlock(&above->cst.spin);
3100 
3101 	/*
3102 	 * ochain must be unlocked because ochain and nchain might share
3103 	 * a buffer cache buffer, so we need to release it so nchain can
3104 	 * potentially obtain it.
3105 	 */
3106 	hammer2_chain_setsubmod(trans, ochain);
3107 	hammer2_chain_unlock(ochain);
3108 
3109 	/*
3110 	 * Finishing fixing up nchain.  A new block will be allocated if
3111 	 * crossing a synchronization point (meta-data only).
3112 	 *
3113 	 * Calling hammer2_chain_modify() will update modify_tid to
3114 	 * (typically) trans->sync_tid.
3115 	 */
3116 	if (nchain->bref.type == HAMMER2_BREF_TYPE_DATA) {
3117 		hammer2_chain_modify(trans, &nchain,
3118 				     HAMMER2_MODIFY_OPTDATA |
3119 				     HAMMER2_MODIFY_NOREALLOC |
3120 				     HAMMER2_MODIFY_INPLACE);
3121 	} else if (nchain->flags & HAMMER2_CHAIN_INITIAL) {
3122 		hammer2_chain_modify(trans, &nchain,
3123 				     HAMMER2_MODIFY_OPTDATA |
3124 				     HAMMER2_MODIFY_INPLACE);
3125 	} else {
3126 		hammer2_chain_modify(trans, &nchain,
3127 				     HAMMER2_MODIFY_INPLACE);
3128 	}
3129 	hammer2_chain_drop(nchain);
3130 
3131 	/*
3132 	 * Unconditionally set FLUSH_CREATE to force the parent blockrefs to
3133 	 * update as the chain_modify() above won't necessarily do it.
3134 	 */
3135 	if ((nchain->flags & HAMMER2_CHAIN_FLUSH_CREATE) == 0) {
3136 		atomic_set_int(&nchain->flags, HAMMER2_CHAIN_FLUSH_CREATE);
3137 		hammer2_chain_ref(nchain);
3138 	}
3139 
3140 	/*
3141 	 * If nchain is in a DELETED state we must set FLUSH_DELETE
3142 	 */
3143 	if (nchain->flags & HAMMER2_CHAIN_DELETED)
3144 		KKASSERT((nchain->flags & HAMMER2_CHAIN_FLUSH_DELETE) == 0);
3145 #if 1
3146 	if ((nchain->flags & HAMMER2_CHAIN_FLUSH_DELETE) == 0 &&
3147 	    (nchain->flags & HAMMER2_CHAIN_DELETED)) {
3148 		atomic_set_int(&nchain->flags, HAMMER2_CHAIN_FLUSH_DELETE);
3149 		hammer2_chain_ref(nchain);
3150 	}
3151 #endif
3152 	hammer2_chain_setsubmod(trans, nchain);
3153 	*chainp = nchain;
3154 }
3155 
3156 /*
3157  * Create a snapshot of the specified {parent, ochain} with the specified
3158  * label.  The originating hammer2_inode must be exclusively locked for
3159  * safety.
3160  *
3161  * The ioctl code has already synced the filesystem.
3162  */
3163 int
3164 hammer2_chain_snapshot(hammer2_trans_t *trans, hammer2_chain_t **ochainp,
3165 		       hammer2_ioc_pfs_t *pfs)
3166 {
3167 	hammer2_mount_t *hmp;
3168 	hammer2_chain_t *ochain = *ochainp;
3169 	hammer2_chain_t *nchain;
3170 	hammer2_inode_data_t *ipdata;
3171 	hammer2_inode_t *nip;
3172 	size_t name_len;
3173 	hammer2_key_t lhc;
3174 	struct vattr vat;
3175 	uuid_t opfs_clid;
3176 	int error;
3177 
3178 	kprintf("snapshot %s ochain->refs %d ochain->flags %08x\n",
3179 		pfs->name, ochain->refs, ochain->flags);
3180 
3181 	name_len = strlen(pfs->name);
3182 	lhc = hammer2_dirhash(pfs->name, name_len);
3183 
3184 	hmp = ochain->hmp;
3185 	opfs_clid = ochain->data->ipdata.pfs_clid;
3186 
3187 	*ochainp = ochain;
3188 
3189 	/*
3190 	 * Create the snapshot directory under the super-root
3191 	 *
3192 	 * Set PFS type, generate a unique filesystem id, and generate
3193 	 * a cluster id.  Use the same clid when snapshotting a PFS root,
3194 	 * which theoretically allows the snapshot to be used as part of
3195 	 * the same cluster (perhaps as a cache).
3196 	 *
3197 	 * Copy the (flushed) ochain's blockref array.  Theoretically we
3198 	 * could use chain_duplicate() but it becomes difficult to disentangle
3199 	 * the shared core so for now just brute-force it.
3200 	 */
3201 	VATTR_NULL(&vat);
3202 	vat.va_type = VDIR;
3203 	vat.va_mode = 0755;
3204 	nchain = NULL;
3205 	nip = hammer2_inode_create(trans, hmp->sroot, &vat, proc0.p_ucred,
3206 				   pfs->name, name_len, &nchain, &error);
3207 
3208 	if (nip) {
3209 		ipdata = hammer2_chain_modify_ip(trans, nip, &nchain, 0);
3210 		ipdata->pfs_type = HAMMER2_PFSTYPE_SNAPSHOT;
3211 		kern_uuidgen(&ipdata->pfs_fsid, 1);
3212 		if (ochain->flags & HAMMER2_CHAIN_PFSROOT)
3213 			ipdata->pfs_clid = opfs_clid;
3214 		else
3215 			kern_uuidgen(&ipdata->pfs_clid, 1);
3216 		atomic_set_int(&nchain->flags, HAMMER2_CHAIN_PFSROOT);
3217 		ipdata->u.blockset = ochain->data->ipdata.u.blockset;
3218 
3219 		hammer2_inode_unlock_ex(nip, nchain);
3220 	}
3221 	return (error);
3222 }
3223 
3224 /*
3225  * Create an indirect block that covers one or more of the elements in the
3226  * current parent.  Either returns the existing parent with no locking or
3227  * ref changes or returns the new indirect block locked and referenced
3228  * and leaving the original parent lock/ref intact as well.
3229  *
3230  * If an error occurs, NULL is returned and *errorp is set to the error.
3231  *
3232  * The returned chain depends on where the specified key falls.
3233  *
3234  * The key/keybits for the indirect mode only needs to follow three rules:
3235  *
3236  * (1) That all elements underneath it fit within its key space and
3237  *
3238  * (2) That all elements outside it are outside its key space.
3239  *
3240  * (3) When creating the new indirect block any elements in the current
3241  *     parent that fit within the new indirect block's keyspace must be
3242  *     moved into the new indirect block.
3243  *
3244  * (4) The keyspace chosen for the inserted indirect block CAN cover a wider
3245  *     keyspace the the current parent, but lookup/iteration rules will
3246  *     ensure (and must ensure) that rule (2) for all parents leading up
3247  *     to the nearest inode or the root volume header is adhered to.  This
3248  *     is accomplished by always recursing through matching keyspaces in
3249  *     the hammer2_chain_lookup() and hammer2_chain_next() API.
3250  *
3251  * The current implementation calculates the current worst-case keyspace by
3252  * iterating the current parent and then divides it into two halves, choosing
3253  * whichever half has the most elements (not necessarily the half containing
3254  * the requested key).
3255  *
3256  * We can also opt to use the half with the least number of elements.  This
3257  * causes lower-numbered keys (aka logical file offsets) to recurse through
3258  * fewer indirect blocks and higher-numbered keys to recurse through more.
3259  * This also has the risk of not moving enough elements to the new indirect
3260  * block and being forced to create several indirect blocks before the element
3261  * can be inserted.
3262  *
3263  * Must be called with an exclusively locked parent.
3264  */
3265 static int hammer2_chain_indkey_freemap(hammer2_chain_t *parent,
3266 				hammer2_key_t *keyp, int keybits,
3267 				hammer2_blockref_t *base, int count);
3268 static int hammer2_chain_indkey_normal(hammer2_chain_t *parent,
3269 				hammer2_key_t *keyp, int keybits,
3270 				hammer2_blockref_t *base, int count);
3271 static
3272 hammer2_chain_t *
3273 hammer2_chain_create_indirect(hammer2_trans_t *trans, hammer2_chain_t *parent,
3274 			      hammer2_key_t create_key, int create_bits,
3275 			      int for_type, int *errorp)
3276 {
3277 	hammer2_mount_t *hmp;
3278 	hammer2_chain_core_t *above;
3279 	hammer2_chain_core_t *icore;
3280 	hammer2_blockref_t *base;
3281 	hammer2_blockref_t *bref;
3282 	hammer2_blockref_t bcopy;
3283 	hammer2_chain_t *chain;
3284 	hammer2_chain_t *ichain;
3285 	hammer2_chain_t dummy;
3286 	hammer2_key_t key = create_key;
3287 	hammer2_key_t key_beg;
3288 	hammer2_key_t key_end;
3289 	hammer2_key_t key_next;
3290 	int keybits = create_bits;
3291 	int count;
3292 	int nbytes;
3293 	int cache_index;
3294 	int loops;
3295 	int reason;
3296 	int generation;
3297 	int maxloops = 300000;
3298 	int retry_same;
3299 	int wasdup;
3300 
3301 	/*
3302 	 * Calculate the base blockref pointer or NULL if the chain
3303 	 * is known to be empty.  We need to calculate the array count
3304 	 * for RB lookups either way.
3305 	 */
3306 	hmp = parent->hmp;
3307 	*errorp = 0;
3308 	KKASSERT(ccms_thread_lock_owned(&parent->core->cst));
3309 	above = parent->core;
3310 
3311 	/*hammer2_chain_modify(trans, &parent, HAMMER2_MODIFY_OPTDATA);*/
3312 	if (parent->flags & HAMMER2_CHAIN_INITIAL) {
3313 		base = NULL;
3314 
3315 		switch(parent->bref.type) {
3316 		case HAMMER2_BREF_TYPE_INODE:
3317 			count = HAMMER2_SET_COUNT;
3318 			break;
3319 		case HAMMER2_BREF_TYPE_INDIRECT:
3320 		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
3321 			count = parent->bytes / sizeof(hammer2_blockref_t);
3322 			break;
3323 		case HAMMER2_BREF_TYPE_VOLUME:
3324 			count = HAMMER2_SET_COUNT;
3325 			break;
3326 		case HAMMER2_BREF_TYPE_FREEMAP:
3327 			count = HAMMER2_SET_COUNT;
3328 			break;
3329 		default:
3330 			panic("hammer2_chain_create_indirect: "
3331 			      "unrecognized blockref type: %d",
3332 			      parent->bref.type);
3333 			count = 0;
3334 			break;
3335 		}
3336 	} else {
3337 		switch(parent->bref.type) {
3338 		case HAMMER2_BREF_TYPE_INODE:
3339 			base = &parent->data->ipdata.u.blockset.blockref[0];
3340 			count = HAMMER2_SET_COUNT;
3341 			break;
3342 		case HAMMER2_BREF_TYPE_INDIRECT:
3343 		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
3344 			base = &parent->data->npdata[0];
3345 			count = parent->bytes / sizeof(hammer2_blockref_t);
3346 			break;
3347 		case HAMMER2_BREF_TYPE_VOLUME:
3348 			base = &hmp->voldata.sroot_blockset.blockref[0];
3349 			count = HAMMER2_SET_COUNT;
3350 			break;
3351 		case HAMMER2_BREF_TYPE_FREEMAP:
3352 			base = &hmp->voldata.freemap_blockset.blockref[0];
3353 			count = HAMMER2_SET_COUNT;
3354 			break;
3355 		default:
3356 			panic("hammer2_chain_create_indirect: "
3357 			      "unrecognized blockref type: %d",
3358 			      parent->bref.type);
3359 			count = 0;
3360 			break;
3361 		}
3362 	}
3363 
3364 	/*
3365 	 * dummy used in later chain allocation (no longer used for lookups).
3366 	 */
3367 	bzero(&dummy, sizeof(dummy));
3368 	dummy.delete_tid = HAMMER2_MAX_TID;
3369 
3370 	/*
3371 	 * When creating an indirect block for a freemap node or leaf
3372 	 * the key/keybits must be fitted to static radix levels because
3373 	 * particular radix levels use particular reserved blocks in the
3374 	 * related zone.
3375 	 *
3376 	 * This routine calculates the key/radix of the indirect block
3377 	 * we need to create, and whether it is on the high-side or the
3378 	 * low-side.
3379 	 */
3380 	if (for_type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
3381 	    for_type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
3382 		keybits = hammer2_chain_indkey_freemap(parent, &key, keybits,
3383 						       base, count);
3384 	} else {
3385 		keybits = hammer2_chain_indkey_normal(parent, &key, keybits,
3386 						      base, count);
3387 	}
3388 
3389 	/*
3390 	 * Normalize the key for the radix being represented, keeping the
3391 	 * high bits and throwing away the low bits.
3392 	 */
3393 	key &= ~(((hammer2_key_t)1 << keybits) - 1);
3394 
3395 	/*
3396 	 * How big should our new indirect block be?  It has to be at least
3397 	 * as large as its parent.
3398 	 */
3399 	if (parent->bref.type == HAMMER2_BREF_TYPE_INODE)
3400 		nbytes = HAMMER2_IND_BYTES_MIN;
3401 	else
3402 		nbytes = HAMMER2_IND_BYTES_MAX;
3403 	if (nbytes < count * sizeof(hammer2_blockref_t))
3404 		nbytes = count * sizeof(hammer2_blockref_t);
3405 
3406 	/*
3407 	 * Ok, create our new indirect block
3408 	 */
3409 	if (for_type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
3410 	    for_type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
3411 		dummy.bref.type = HAMMER2_BREF_TYPE_FREEMAP_NODE;
3412 	} else {
3413 		dummy.bref.type = HAMMER2_BREF_TYPE_INDIRECT;
3414 	}
3415 	dummy.bref.key = key;
3416 	dummy.bref.keybits = keybits;
3417 	dummy.bref.data_off = hammer2_getradix(nbytes);
3418 	dummy.bref.methods = parent->bref.methods;
3419 
3420 	ichain = hammer2_chain_alloc(hmp, parent->pmp, trans, &dummy.bref);
3421 	atomic_set_int(&ichain->flags, HAMMER2_CHAIN_INITIAL);
3422 	hammer2_chain_core_alloc(trans, ichain, NULL);
3423 	icore = ichain->core;
3424 	hammer2_chain_lock(ichain, HAMMER2_RESOLVE_MAYBE);
3425 	hammer2_chain_drop(ichain);	/* excess ref from alloc */
3426 
3427 	/*
3428 	 * We have to mark it modified to allocate its block, but use
3429 	 * OPTDATA to allow it to remain in the INITIAL state.  Otherwise
3430 	 * it won't be acted upon by the flush code.
3431 	 */
3432 	hammer2_chain_modify(trans, &ichain, HAMMER2_MODIFY_OPTDATA);
3433 
3434 	/*
3435 	 * Iterate the original parent and move the matching brefs into
3436 	 * the new indirect block.
3437 	 *
3438 	 * XXX handle flushes.
3439 	 */
3440 	key_beg = 0;
3441 	key_end = HAMMER2_MAX_KEY;
3442 	cache_index = 0;
3443 	spin_lock(&above->cst.spin);
3444 	loops = 0;
3445 	reason = 0;
3446 	retry_same = 0;
3447 
3448 	for (;;) {
3449 		if (++loops > 100000) {
3450 		    spin_unlock(&above->cst.spin);
3451 		    panic("excessive loops r=%d p=%p base/count %p:%d %016jx\n",
3452 			  reason, parent, base, count, key_next);
3453 		}
3454 
3455 		/*
3456 		 * NOTE: spinlock stays intact, returned chain (if not NULL)
3457 		 *	 is not referenced or locked which means that we
3458 		 *	 cannot safely check its flagged / deletion status
3459 		 *	 until we lock it.
3460 		 */
3461 		chain = hammer2_combined_find(parent, base, count,
3462 					      &cache_index, &key_next,
3463 					      key_beg, key_end,
3464 					      &bref);
3465 		generation = above->generation;
3466 		if (bref == NULL)
3467 			break;
3468 		key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);
3469 
3470 		/*
3471 		 * Skip keys that are not within the key/radix of the new
3472 		 * indirect block.  They stay in the parent.
3473 		 */
3474 		if ((~(((hammer2_key_t)1 << keybits) - 1) &
3475 		    (key ^ bref->key)) != 0) {
3476 			goto next_key_spinlocked;
3477 		}
3478 
3479 		/*
3480 		 * Load the new indirect block by acquiring the related
3481 		 * chains (potentially from media as it might not be
3482 		 * in-memory).  Then move it to the new parent (ichain)
3483 		 * via DELETE-DUPLICATE.
3484 		 *
3485 		 * chain is referenced but not locked.  We must lock the
3486 		 * chain to obtain definitive DUPLICATED/DELETED state
3487 		 */
3488 		if (chain) {
3489 			/*
3490 			 * Use chain already present in the RBTREE
3491 			 */
3492 			hammer2_chain_ref(chain);
3493 			wasdup = ((chain->flags &
3494 				   HAMMER2_CHAIN_DUPLICATED) != 0);
3495 			spin_unlock(&above->cst.spin);
3496 			hammer2_chain_lock(chain, HAMMER2_RESOLVE_NEVER |
3497 						  HAMMER2_RESOLVE_NOREF);
3498 		} else {
3499 			/*
3500 			 * Get chain for blockref element.  _get returns NULL
3501 			 * on insertion race.
3502 			 */
3503 			bcopy = *bref;
3504 			spin_unlock(&above->cst.spin);
3505 			chain = hammer2_chain_get(parent, generation, &bcopy);
3506 			if (chain == NULL) {
3507 				reason = 1;
3508 				spin_lock(&above->cst.spin);
3509 				continue;
3510 			}
3511 			if (bcmp(&bcopy, bref, sizeof(bcopy))) {
3512 				reason = 2;
3513 				hammer2_chain_drop(chain);
3514 				spin_lock(&above->cst.spin);
3515 				continue;
3516 			}
3517 			hammer2_chain_lock(chain, HAMMER2_RESOLVE_NEVER |
3518 						  HAMMER2_RESOLVE_NOREF);
3519 			wasdup = 0;
3520 		}
3521 
3522 		/*
3523 		 * This is always live so if the chain has been delete-
3524 		 * duplicated we raced someone and we have to retry.
3525 		 *
3526 		 * NOTE: Lookups can race delete-duplicate because
3527 		 *	 delete-duplicate does not lock the parent's core
3528 		 *	 (they just use the spinlock on the core).  We must
3529 		 *	 check for races by comparing the DUPLICATED flag before
3530 		 *	 releasing the spinlock with the flag after locking the
3531 		 *	 chain.
3532 		 *
3533 		 *	 (note reversed logic for this one)
3534 		 */
3535 		if (chain->flags & HAMMER2_CHAIN_DELETED) {
3536 			hammer2_chain_unlock(chain);
3537 			if ((chain->flags & HAMMER2_CHAIN_DUPLICATED) &&
3538 			    wasdup == 0) {
3539 				retry_same = 1;
3540 			}
3541 			goto next_key;
3542 		}
3543 
3544 		/*
3545 		 * Shift the chain to the indirect block.
3546 		 *
3547 		 * WARNING! Can cause held-over chains to require a refactor.
3548 		 *	    Fortunately we have none (our locked chains are
3549 		 *	    passed into and modified by the call).
3550 		 */
3551 		hammer2_chain_delete(trans, chain, 0);
3552 		hammer2_chain_duplicate(trans, &ichain, &chain, NULL, 0, 1);
3553 		hammer2_chain_unlock(chain);
3554 		KKASSERT(parent->refs > 0);
3555 		chain = NULL;
3556 next_key:
3557 		spin_lock(&above->cst.spin);
3558 next_key_spinlocked:
3559 		if (--maxloops == 0)
3560 			panic("hammer2_chain_create_indirect: maxloops");
3561 		reason = 4;
3562 		if (retry_same == 0) {
3563 			if (key_next == 0 || key_next > key_end)
3564 				break;
3565 			key_beg = key_next;
3566 		}
3567 		/* loop */
3568 	}
3569 	spin_unlock(&above->cst.spin);
3570 
3571 	/*
3572 	 * Insert the new indirect block into the parent now that we've
3573 	 * cleared out some entries in the parent.  We calculated a good
3574 	 * insertion index in the loop above (ichain->index).
3575 	 *
3576 	 * We don't have to set FLUSH_CREATE here because we mark ichain
3577 	 * modified down below (so the normal modified -> flush -> set-moved
3578 	 * sequence applies).
3579 	 *
3580 	 * The insertion shouldn't race as this is a completely new block
3581 	 * and the parent is locked.
3582 	 */
3583 	KKASSERT((ichain->flags & HAMMER2_CHAIN_ONRBTREE) == 0);
3584 	hammer2_chain_insert(above, NULL, ichain,
3585 			     HAMMER2_CHAIN_INSERT_SPIN |
3586 			     HAMMER2_CHAIN_INSERT_LIVE,
3587 			     0);
3588 
3589 	/*
3590 	 * Mark the new indirect block modified after insertion, which
3591 	 * will propagate up through parent all the way to the root and
3592 	 * also allocate the physical block in ichain for our caller,
3593 	 * and assign ichain->data to a pre-zero'd space (because there
3594 	 * is not prior data to copy into it).
3595 	 */
3596 	/*hammer2_chain_modify(trans, &ichain, HAMMER2_MODIFY_OPTDATA);*/
3597 	hammer2_chain_setsubmod(trans, ichain);
3598 
3599 	/*
3600 	 * Figure out what to return.
3601 	 */
3602 	if (~(((hammer2_key_t)1 << keybits) - 1) &
3603 		   (create_key ^ key)) {
3604 		/*
3605 		 * Key being created is outside the key range,
3606 		 * return the original parent.
3607 		 */
3608 		hammer2_chain_unlock(ichain);
3609 	} else {
3610 		/*
3611 		 * Otherwise its in the range, return the new parent.
3612 		 * (leave both the new and old parent locked).
3613 		 */
3614 		parent = ichain;
3615 	}
3616 
3617 	return(parent);
3618 }
3619 
3620 /*
3621  * Calculate the keybits and highside/lowside of the freemap node the
3622  * caller is creating.
3623  *
3624  * This routine will specify the next higher-level freemap key/radix
3625  * representing the lowest-ordered set.  By doing so, eventually all
3626  * low-ordered sets will be moved one level down.
3627  *
3628  * We have to be careful here because the freemap reserves a limited
3629  * number of blocks for a limited number of levels.  So we can't just
3630  * push indiscriminately.
3631  */
3632 int
3633 hammer2_chain_indkey_freemap(hammer2_chain_t *parent, hammer2_key_t *keyp,
3634 			     int keybits, hammer2_blockref_t *base, int count)
3635 {
3636 	hammer2_chain_core_t *above;
3637 	hammer2_chain_t *chain;
3638 	hammer2_blockref_t *bref;
3639 	hammer2_key_t key;
3640 	hammer2_key_t key_beg;
3641 	hammer2_key_t key_end;
3642 	hammer2_key_t key_next;
3643 	int cache_index;
3644 	int locount;
3645 	int hicount;
3646 	int maxloops = 300000;
3647 
3648 	key = *keyp;
3649 	above = parent->core;
3650 	locount = 0;
3651 	hicount = 0;
3652 	keybits = 64;
3653 
3654 	/*
3655 	 * Calculate the range of keys in the array being careful to skip
3656 	 * slots which are overridden with a deletion.
3657 	 */
3658 	key_beg = 0;
3659 	key_end = HAMMER2_MAX_KEY;
3660 	cache_index = 0;
3661 	spin_lock(&above->cst.spin);
3662 
3663 	for (;;) {
3664 		if (--maxloops == 0) {
3665 			panic("indkey_freemap shit %p %p:%d\n",
3666 			      parent, base, count);
3667 		}
3668 		chain = hammer2_combined_find(parent, base, count,
3669 					      &cache_index, &key_next,
3670 					      key_beg, key_end,
3671 					      &bref);
3672 
3673 		/*
3674 		 * Exhausted search
3675 		 */
3676 		if (bref == NULL)
3677 			break;
3678 
3679 		/*
3680 		 * NOTE: No need to check DUPLICATED here because we do
3681 		 *	 not release the spinlock.
3682 		 */
3683 		if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
3684 			if (key_next == 0 || key_next > key_end)
3685 				break;
3686 			key_beg = key_next;
3687 			continue;
3688 		}
3689 
3690 		/*
3691 		 * Use the full live (not deleted) element for the scan
3692 		 * iteration.  HAMMER2 does not allow partial replacements.
3693 		 *
3694 		 * XXX should be built into hammer2_combined_find().
3695 		 */
3696 		key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);
3697 
3698 		if (keybits > bref->keybits) {
3699 			key = bref->key;
3700 			keybits = bref->keybits;
3701 		} else if (keybits == bref->keybits && bref->key < key) {
3702 			key = bref->key;
3703 		}
3704 		if (key_next == 0)
3705 			break;
3706 		key_beg = key_next;
3707 	}
3708 	spin_unlock(&above->cst.spin);
3709 
3710 	/*
3711 	 * Return the keybits for a higher-level FREEMAP_NODE covering
3712 	 * this node.
3713 	 */
3714 	switch(keybits) {
3715 	case HAMMER2_FREEMAP_LEVEL0_RADIX:
3716 		keybits = HAMMER2_FREEMAP_LEVEL1_RADIX;
3717 		break;
3718 	case HAMMER2_FREEMAP_LEVEL1_RADIX:
3719 		keybits = HAMMER2_FREEMAP_LEVEL2_RADIX;
3720 		break;
3721 	case HAMMER2_FREEMAP_LEVEL2_RADIX:
3722 		keybits = HAMMER2_FREEMAP_LEVEL3_RADIX;
3723 		break;
3724 	case HAMMER2_FREEMAP_LEVEL3_RADIX:
3725 		keybits = HAMMER2_FREEMAP_LEVEL4_RADIX;
3726 		break;
3727 	case HAMMER2_FREEMAP_LEVEL4_RADIX:
3728 		panic("hammer2_chain_indkey_freemap: level too high");
3729 		break;
3730 	default:
3731 		panic("hammer2_chain_indkey_freemap: bad radix");
3732 		break;
3733 	}
3734 	*keyp = key;
3735 
3736 	return (keybits);
3737 }
3738 
3739 /*
3740  * Calculate the keybits and highside/lowside of the indirect block the
3741  * caller is creating.
3742  */
3743 static int
3744 hammer2_chain_indkey_normal(hammer2_chain_t *parent, hammer2_key_t *keyp,
3745 			    int keybits, hammer2_blockref_t *base, int count)
3746 {
3747 	hammer2_chain_core_t *above;
3748 	hammer2_blockref_t *bref;
3749 	hammer2_chain_t	*chain;
3750 	hammer2_key_t key_beg;
3751 	hammer2_key_t key_end;
3752 	hammer2_key_t key_next;
3753 	hammer2_key_t key;
3754 	int nkeybits;
3755 	int locount;
3756 	int hicount;
3757 	int cache_index;
3758 	int maxloops = 300000;
3759 
3760 	key = *keyp;
3761 	above = parent->core;
3762 	locount = 0;
3763 	hicount = 0;
3764 
3765 	/*
3766 	 * Calculate the range of keys in the array being careful to skip
3767 	 * slots which are overridden with a deletion.  Once the scan
3768 	 * completes we will cut the key range in half and shift half the
3769 	 * range into the new indirect block.
3770 	 */
3771 	key_beg = 0;
3772 	key_end = HAMMER2_MAX_KEY;
3773 	cache_index = 0;
3774 	spin_lock(&above->cst.spin);
3775 
3776 	for (;;) {
3777 		if (--maxloops == 0) {
3778 			panic("indkey_freemap shit %p %p:%d\n",
3779 			      parent, base, count);
3780 		}
3781 		chain = hammer2_combined_find(parent, base, count,
3782 					      &cache_index, &key_next,
3783 					      key_beg, key_end,
3784 					      &bref);
3785 
3786 		/*
3787 		 * Exhausted search
3788 		 */
3789 		if (bref == NULL)
3790 			break;
3791 
3792 		/*
3793 		 * NOTE: No need to check DUPLICATED here because we do
3794 		 *	 not release the spinlock.
3795 		 */
3796 		if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
3797 			if (key_next == 0 || key_next > key_end)
3798 				break;
3799 			key_beg = key_next;
3800 			continue;
3801 		}
3802 
3803 		/*
3804 		 * Use the full live (not deleted) element for the scan
3805 		 * iteration.  HAMMER2 does not allow partial replacements.
3806 		 *
3807 		 * XXX should be built into hammer2_combined_find().
3808 		 */
3809 		key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);
3810 
3811 		/*
3812 		 * Expand our calculated key range (key, keybits) to fit
3813 		 * the scanned key.  nkeybits represents the full range
3814 		 * that we will later cut in half (two halves @ nkeybits - 1).
3815 		 */
3816 		nkeybits = keybits;
3817 		if (nkeybits < bref->keybits) {
3818 			if (bref->keybits > 64) {
3819 				kprintf("bad bref chain %p bref %p\n",
3820 					chain, bref);
3821 				Debugger("fubar");
3822 			}
3823 			nkeybits = bref->keybits;
3824 		}
3825 		while (nkeybits < 64 &&
3826 		       (~(((hammer2_key_t)1 << nkeybits) - 1) &
3827 		        (key ^ bref->key)) != 0) {
3828 			++nkeybits;
3829 		}
3830 
3831 		/*
3832 		 * If the new key range is larger we have to determine
3833 		 * which side of the new key range the existing keys fall
3834 		 * under by checking the high bit, then collapsing the
3835 		 * locount into the hicount or vise-versa.
3836 		 */
3837 		if (keybits != nkeybits) {
3838 			if (((hammer2_key_t)1 << (nkeybits - 1)) & key) {
3839 				hicount += locount;
3840 				locount = 0;
3841 			} else {
3842 				locount += hicount;
3843 				hicount = 0;
3844 			}
3845 			keybits = nkeybits;
3846 		}
3847 
3848 		/*
3849 		 * The newly scanned key will be in the lower half or the
3850 		 * upper half of the (new) key range.
3851 		 */
3852 		if (((hammer2_key_t)1 << (nkeybits - 1)) & bref->key)
3853 			++hicount;
3854 		else
3855 			++locount;
3856 
3857 		if (key_next == 0)
3858 			break;
3859 		key_beg = key_next;
3860 	}
3861 	spin_unlock(&above->cst.spin);
3862 	bref = NULL;	/* now invalid (safety) */
3863 
3864 	/*
3865 	 * Adjust keybits to represent half of the full range calculated
3866 	 * above (radix 63 max)
3867 	 */
3868 	--keybits;
3869 
3870 	/*
3871 	 * Select whichever half contains the most elements.  Theoretically
3872 	 * we can select either side as long as it contains at least one
3873 	 * element (in order to ensure that a free slot is present to hold
3874 	 * the indirect block).
3875 	 */
3876 	if (hammer2_indirect_optimize) {
3877 		/*
3878 		 * Insert node for least number of keys, this will arrange
3879 		 * the first few blocks of a large file or the first few
3880 		 * inodes in a directory with fewer indirect blocks when
3881 		 * created linearly.
3882 		 */
3883 		if (hicount < locount && hicount != 0)
3884 			key |= (hammer2_key_t)1 << keybits;
3885 		else
3886 			key &= ~(hammer2_key_t)1 << keybits;
3887 	} else {
3888 		/*
3889 		 * Insert node for most number of keys, best for heavily
3890 		 * fragmented files.
3891 		 */
3892 		if (hicount > locount)
3893 			key |= (hammer2_key_t)1 << keybits;
3894 		else
3895 			key &= ~(hammer2_key_t)1 << keybits;
3896 	}
3897 	*keyp = key;
3898 
3899 	return (keybits);
3900 }
3901 
3902 /*
3903  * Sets CHAIN_DELETED and CHAIN_FLUSH_DELETE in the chain being deleted and
3904  * set chain->delete_tid.  The chain is not actually marked possibly-free
3905  * in the freemap until the deletion is completely flushed out (because
3906  * a flush which doesn't cover the entire deletion is flushing the deleted
3907  * chain as if it were live).
3908  *
3909  * This function does NOT generate a modification to the parent.  It
3910  * would be nearly impossible to figure out which parent to modify anyway.
3911  * Such modifications are handled top-down by the flush code and are
3912  * properly merged using the flush synchronization point.
3913  *
3914  * The find/get code will properly overload the RBTREE check on top of
3915  * the bref check to detect deleted entries.
3916  *
3917  * This function is NOT recursive.  Any entity already pushed into the
3918  * chain (such as an inode) may still need visibility into its contents,
3919  * as well as the ability to read and modify the contents.  For example,
3920  * for an unlinked file which is still open.
3921  *
3922  * NOTE: This function does NOT set chain->modify_tid, allowing future
3923  *	 code to distinguish between live and deleted chains by testing
3924  *	 trans->sync_tid vs chain->modify_tid and chain->delete_tid.
3925  *
3926  * NOTE: Deletions normally do not occur in the middle of a duplication
3927  *	 chain but we use a trick for hardlink migration that refactors
3928  *	 the originating inode without deleting it, so we make no assumptions
3929  *	 here.
3930  */
3931 void
3932 hammer2_chain_delete(hammer2_trans_t *trans, hammer2_chain_t *chain, int flags)
3933 {
3934 	KKASSERT(ccms_thread_lock_owned(&chain->core->cst));
3935 
3936 	/*
3937 	 * Nothing to do if already marked.
3938 	 */
3939 	if (chain->flags & HAMMER2_CHAIN_DELETED)
3940 		return;
3941 
3942 	/*
3943 	 * The setting of DELETED causes finds, lookups, and _next iterations
3944 	 * to no longer recognize the chain.  RB_SCAN()s will still have
3945 	 * visibility (needed for flush serialization points).
3946 	 *
3947 	 * We need the spinlock on the core whos RBTREE contains chain
3948 	 * to protect against races.
3949 	 */
3950 	spin_lock(&chain->above->cst.spin);
3951 	_hammer2_chain_delete_helper(trans, chain->above, chain);
3952 	spin_unlock(&chain->above->cst.spin);
3953 
3954 	hammer2_chain_setsubmod(trans, chain);
3955 }
3956 
3957 /*
3958  * Returns the index of the nearest element in the blockref array >= elm.
3959  * Returns (count) if no element could be found.  If delete_filter is non-zero
3960  * the scan filters out any blockrefs which match deleted chains on dbtree.
3961  *
3962  * Sets *key_nextp to the next key for loop purposes but does not modify
3963  * it if the next key would be higher than the current value of *key_nextp.
3964  * Note that *key_nexp can overflow to 0, which should be tested by the
3965  * caller.
3966  *
3967  * (*cache_indexp) is a heuristic and can be any value without effecting
3968  * the result.
3969  *
3970  * The spin lock on the related chain must be held.
3971  */
3972 int
3973 hammer2_base_find(hammer2_chain_t *parent,
3974 		  hammer2_blockref_t *base, int count,
3975 		  int *cache_indexp, hammer2_key_t *key_nextp,
3976 		  hammer2_key_t key_beg, hammer2_key_t key_end,
3977 		  int delete_filter)
3978 {
3979 	hammer2_chain_core_t *core = parent->core;
3980 	hammer2_blockref_t *scan;
3981 	hammer2_key_t scan_end;
3982 	int i;
3983 	int limit;
3984 
3985 	/*
3986 	 * Require the live chain's already have their core's counted
3987 	 * so we can optimize operations.
3988 	 */
3989         KKASSERT((parent->flags & HAMMER2_CHAIN_DUPLICATED) ||
3990 		 core->flags & HAMMER2_CORE_COUNTEDBREFS);
3991 
3992 	/*
3993 	 * Degenerate case
3994 	 */
3995 	if (count == 0 || base == NULL)
3996 		return(count);
3997 
3998 	/*
3999 	 * Sequential optimization using *cache_indexp.  This is the most
4000 	 * likely scenario.
4001 	 *
4002 	 * We can avoid trailing empty entries on live chains, otherwise
4003 	 * we might have to check the whole block array.
4004 	 */
4005 	i = *cache_indexp;
4006 	cpu_ccfence();
4007 	if (parent->flags & HAMMER2_CHAIN_DUPLICATED)
4008 		limit = count;
4009 	else
4010 		limit = core->live_zero;
4011 	if (i >= limit)
4012 		i = limit - 1;
4013 	if (i < 0)
4014 		i = 0;
4015 	KKASSERT(i < count);
4016 
4017 	/*
4018 	 * Search backwards
4019 	 */
4020 	scan = &base[i];
4021 	while (i > 0 && (scan->type == 0 || scan->key > key_beg)) {
4022 		--scan;
4023 		--i;
4024 	}
4025 	*cache_indexp = i;
4026 
4027 	/*
4028 	 * Search forwards, stop when we find a scan element which
4029 	 * encloses the key or until we know that there are no further
4030 	 * elements.
4031 	 */
4032 	while (i < count) {
4033 		if (scan->type != 0) {
4034 			scan_end = scan->key +
4035 				   ((hammer2_key_t)1 << scan->keybits) - 1;
4036 			if (scan->key > key_beg || scan_end >= key_beg) {
4037 				/*
4038 				 * Check to see if the entry is covered by
4039 				 * a deleted chain and ignore the entry if
4040 				 * it is and delete_filter != 0.
4041 				 */
4042 				if (delete_filter == 0)
4043 					break;
4044 				if (hammer2_chain_find_deleted(
4045 					parent, scan->key, scan_end) == NULL) {
4046 					break;
4047 				}
4048 			}
4049 		}
4050 		if (i >= limit)
4051 			return (count);
4052 		++scan;
4053 		++i;
4054 	}
4055 	if (i != count) {
4056 		*cache_indexp = i;
4057 		if (i >= limit) {
4058 			i = count;
4059 		} else {
4060 			scan_end = scan->key +
4061 				   ((hammer2_key_t)1 << scan->keybits);
4062 			if (scan_end && (*key_nextp > scan_end ||
4063 					 *key_nextp == 0)) {
4064 				*key_nextp = scan_end;
4065 			}
4066 		}
4067 	}
4068 	return (i);
4069 }
4070 
4071 /*
4072  * Do a combined search and return the next match either from the blockref
4073  * array or from the in-memory chain.  Sets *bresp to the returned bref in
4074  * both cases, or sets it to NULL if the search exhausted.  Only returns
4075  * a non-NULL chain if the search matched from the in-memory chain.
4076  *
4077  * When no in-memory chain has been found and a non-NULL bref is returned
4078  * in *bresp.
4079  *
4080  * Must be called with above's spinlock held.  Spinlock remains held
4081  * through the operation.
4082  *
4083  * The returned chain is not locked or referenced.  Use the returned bref
4084  * to determine if the search exhausted or not.  Iterate if the base find
4085  * is chosen but matches a deleted chain.
4086  */
4087 static hammer2_chain_t *
4088 hammer2_combined_find(hammer2_chain_t *parent,
4089 		      hammer2_blockref_t *base, int count,
4090 		      int *cache_indexp, hammer2_key_t *key_nextp,
4091 		      hammer2_key_t key_beg, hammer2_key_t key_end,
4092 		      hammer2_blockref_t **bresp)
4093 {
4094 	hammer2_blockref_t *bref;
4095 	hammer2_chain_t *chain;
4096 	int i;
4097 
4098 	/*
4099 	 * Lookup in block array and in rbtree.
4100 	 */
4101 	*key_nextp = key_end + 1;
4102 	i = hammer2_base_find(parent, base, count, cache_indexp,
4103 			      key_nextp, key_beg, key_end, 1);
4104 	chain = hammer2_chain_find(parent, key_nextp, key_beg, key_end);
4105 
4106 	/*
4107 	 * Neither matched
4108 	 */
4109 	if (i == count && chain == NULL) {
4110 		*bresp = NULL;
4111 		return(NULL);
4112 	}
4113 
4114 	/*
4115 	 * Only chain matched.
4116 	 */
4117 	if (i == count) {
4118 		bref = &chain->bref;
4119 		goto found;
4120 	}
4121 
4122 	/*
4123 	 * Only blockref matched.
4124 	 */
4125 	if (chain == NULL) {
4126 		bref = &base[i];
4127 		goto found;
4128 	}
4129 
4130 	/*
4131 	 * Both in-memory and blockref matched, select the nearer element.
4132 	 *
4133 	 * If both are flush with the left-hand side or both are the
4134 	 * same distance away, select the chain.  In this situation the
4135 	 * chain must have been loaded from the matching blockmap.
4136 	 */
4137 	if ((chain->bref.key <= key_beg && base[i].key <= key_beg) ||
4138 	    chain->bref.key == base[i].key) {
4139 		KKASSERT(chain->bref.key == base[i].key);
4140 		if ((chain->flags & HAMMER2_CHAIN_BMAPPED) == 0) {
4141 			kprintf("chain not bmapped %p.%d %08x\n", chain, chain->bref.type, chain->flags);
4142 			kprintf("in chain mod/del %016jx %016jx\n", chain->modify_tid, chain->delete_tid);
4143 			kprintf("and updlo/hi %016jx %016jx\n", chain->update_lo, chain->update_hi);
4144 		}
4145 		KKASSERT(chain->flags & HAMMER2_CHAIN_BMAPPED);
4146 		bref = &chain->bref;
4147 		goto found;
4148 	}
4149 
4150 	/*
4151 	 * Select the nearer key
4152 	 */
4153 	if (chain->bref.key < base[i].key) {
4154 		bref = &chain->bref;
4155 	} else {
4156 		bref = &base[i];
4157 		chain = NULL;
4158 	}
4159 
4160 	/*
4161 	 * If the bref is out of bounds we've exhausted our search.
4162 	 */
4163 found:
4164 	if (bref->key > key_end) {
4165 		*bresp = NULL;
4166 		chain = NULL;
4167 	} else {
4168 		*bresp = bref;
4169 	}
4170 	return(chain);
4171 }
4172 
4173 /*
4174  * Locate the specified block array element and delete it.  The element
4175  * must exist.
4176  *
4177  * The spin lock on the related chain must be held.
4178  *
4179  * NOTE: live_count was adjusted when the chain was deleted, so it does not
4180  *	 need to be adjusted when we commit the media change.
4181  */
4182 void
4183 hammer2_base_delete(hammer2_trans_t *trans, hammer2_chain_t *parent,
4184 		    hammer2_blockref_t *base, int count,
4185 		    int *cache_indexp, hammer2_chain_t *child)
4186 {
4187 	hammer2_blockref_t *elm = &child->bref;
4188 	hammer2_chain_core_t *core = parent->core;
4189 	hammer2_key_t key_next;
4190 	int i;
4191 
4192 	/*
4193 	 * Delete element.  Expect the element to exist.
4194 	 *
4195 	 * XXX see caller, flush code not yet sophisticated enough to prevent
4196 	 *     re-flushed in some cases.
4197 	 */
4198 	key_next = 0; /* max range */
4199 	i = hammer2_base_find(parent, base, count, cache_indexp,
4200 			      &key_next, elm->key, elm->key, 0);
4201 	if (i == count || base[i].type == 0 ||
4202 	    base[i].key != elm->key || base[i].keybits != elm->keybits) {
4203 		spin_unlock(&core->cst.spin);
4204 		panic("delete base %p element not found at %d/%d elm %p\n"
4205 		      "child ino_reason=%08x\n",
4206 		      base, i, count, elm,
4207 		      child->inode_reason);
4208 		return;
4209 	}
4210 	bzero(&base[i], sizeof(*base));
4211 	base[i].mirror_tid = (intptr_t)parent;		/* MEDIA DEBUG */
4212 	base[i].modify_tid = (intptr_t)child;		/* MEDIA DEBUG */
4213 	base[i].check.debug.sync_tid = trans->sync_tid;	/* MEDIA DEBUG */
4214 	++parent->nremoves;				/* DEBUG */
4215 
4216 	/*
4217 	 * We can only optimize core->live_zero for live chains.
4218 	 */
4219 	if ((parent->flags & HAMMER2_CHAIN_DUPLICATED) == 0) {
4220 		if (core->live_zero == i + 1) {
4221 			while (--i >= 0 && base[i].type == 0)
4222 				;
4223 			core->live_zero = i + 1;
4224 		}
4225 	}
4226 }
4227 
4228 /*
4229  * Insert the specified element.  The block array must not already have the
4230  * element and must have space available for the insertion.
4231  *
4232  * The spin lock on the related chain must be held.
4233  *
4234  * NOTE: live_count was adjusted when the chain was deleted, so it does not
4235  *	 need to be adjusted when we commit the media change.
4236  */
4237 void
4238 hammer2_base_insert(hammer2_trans_t *trans __unused, hammer2_chain_t *parent,
4239 		    hammer2_blockref_t *base, int count,
4240 		    int *cache_indexp, hammer2_chain_t *child)
4241 {
4242 	hammer2_blockref_t *elm = &child->bref;
4243 	hammer2_chain_core_t *core = parent->core;
4244 	hammer2_key_t key_next;
4245 	hammer2_key_t xkey;
4246 	int i;
4247 	int j;
4248 	int k;
4249 	int l;
4250 	int u = 1;
4251 
4252 	/*
4253 	 * Insert new element.  Expect the element to not already exist
4254 	 * unless we are replacing it.
4255 	 *
4256 	 * XXX see caller, flush code not yet sophisticated enough to prevent
4257 	 *     re-flushed in some cases.
4258 	 */
4259 	key_next = 0; /* max range */
4260 	i = hammer2_base_find(parent, base, count, cache_indexp,
4261 			      &key_next, elm->key, elm->key, 0);
4262 
4263 	/*
4264 	 * Shortcut fill optimization, typical ordered insertion(s) may not
4265 	 * require a search.
4266 	 */
4267 	KKASSERT(i >= 0 && i <= count);
4268 
4269 	/*
4270 	 * We can only optimize core->live_zero for live chains.
4271 	 */
4272 	if (i == count && core->live_zero < count) {
4273 		if ((parent->flags & HAMMER2_CHAIN_DUPLICATED) == 0) {
4274 			i = core->live_zero++;
4275 			base[i] = *elm;
4276 	++parent->ninserts;	/* DEBUG */
4277 			return;
4278 		}
4279 	}
4280 
4281 	xkey = elm->key + ((hammer2_key_t)1 << elm->keybits) - 1;
4282 	if (i != count && (base[i].key < elm->key || xkey >= base[i].key)) {
4283 		if (child->flags & HAMMER2_CHAIN_FLUSH_TEMPORARY) {
4284 			kprintf("child %p special replace\n", child);
4285 			base[i] = *elm;
4286 			return;
4287 		} else {
4288 			spin_unlock(&core->cst.spin);
4289 			panic("insert base %p overlapping "
4290 			      "elements at %d elm %p\n",
4291 			      base, i, elm);
4292 		}
4293 	}
4294 
4295 	/*
4296 	 * Try to find an empty slot before or after.
4297 	 */
4298 	j = i;
4299 	k = i;
4300 	while (j > 0 || k < count) {
4301 		--j;
4302 		if (j >= 0 && base[j].type == 0) {
4303 			if (j == i - 1) {
4304 				base[j] = *elm;
4305 			} else {
4306 				bcopy(&base[j+1], &base[j],
4307 				      (i - j - 1) * sizeof(*base));
4308 				base[i - 1] = *elm;
4309 			}
4310 	++parent->ninserts;	/* DEBUG */
4311 			goto validate;
4312 		}
4313 		++k;
4314 		if (k < count && base[k].type == 0) {
4315 			bcopy(&base[i], &base[i+1],
4316 			      (k - i) * sizeof(hammer2_blockref_t));
4317 			base[i] = *elm;
4318 
4319 			/*
4320 			 * We can only update core->live_zero for live
4321 			 * chains.
4322 			 */
4323 			if ((parent->flags & HAMMER2_CHAIN_DUPLICATED) == 0) {
4324 				if (core->live_zero <= k)
4325 					core->live_zero = k + 1;
4326 			}
4327 			u = 2;
4328 	++parent->ninserts;	/* DEBUG */
4329 			goto validate;
4330 		}
4331 	}
4332 	panic("hammer2_base_insert: no room!");
4333 
4334 	/*
4335 	 * Debugging
4336 	 */
4337 validate:
4338 	key_next = 0;
4339 	for (l = 0; l < count; ++l) {
4340 		if (base[l].type) {
4341 			key_next = base[l].key +
4342 				   ((hammer2_key_t)1 << base[l].keybits) - 1;
4343 			break;
4344 		}
4345 	}
4346 	while (++l < count) {
4347 		if (base[l].type) {
4348 			if (base[l].key <= key_next)
4349 				panic("base_insert %d %d,%d,%d fail %p:%d", u, i, j, k, base, l);
4350 			key_next = base[l].key +
4351 				   ((hammer2_key_t)1 << base[l].keybits) - 1;
4352 
4353 		}
4354 	}
4355 
4356 }
4357 
4358 #if 0
4359 
4360 /*
4361  * Sort the blockref array for the chain.  Used by the flush code to
4362  * sort the blockref[] array.
4363  *
4364  * The chain must be exclusively locked AND spin-locked.
4365  */
4366 typedef hammer2_blockref_t *hammer2_blockref_p;
4367 
4368 static
4369 int
4370 hammer2_base_sort_callback(const void *v1, const void *v2)
4371 {
4372 	hammer2_blockref_p bref1 = *(const hammer2_blockref_p *)v1;
4373 	hammer2_blockref_p bref2 = *(const hammer2_blockref_p *)v2;
4374 
4375 	/*
4376 	 * Make sure empty elements are placed at the end of the array
4377 	 */
4378 	if (bref1->type == 0) {
4379 		if (bref2->type == 0)
4380 			return(0);
4381 		return(1);
4382 	} else if (bref2->type == 0) {
4383 		return(-1);
4384 	}
4385 
4386 	/*
4387 	 * Sort by key
4388 	 */
4389 	if (bref1->key < bref2->key)
4390 		return(-1);
4391 	if (bref1->key > bref2->key)
4392 		return(1);
4393 	return(0);
4394 }
4395 
4396 void
4397 hammer2_base_sort(hammer2_chain_t *chain)
4398 {
4399 	hammer2_blockref_t *base;
4400 	int count;
4401 
4402 	switch(chain->bref.type) {
4403 	case HAMMER2_BREF_TYPE_INODE:
4404 		/*
4405 		 * Special shortcut for embedded data returns the inode
4406 		 * itself.  Callers must detect this condition and access
4407 		 * the embedded data (the strategy code does this for us).
4408 		 *
4409 		 * This is only applicable to regular files and softlinks.
4410 		 */
4411 		if (chain->data->ipdata.op_flags & HAMMER2_OPFLAG_DIRECTDATA)
4412 			return;
4413 		base = &chain->data->ipdata.u.blockset.blockref[0];
4414 		count = HAMMER2_SET_COUNT;
4415 		break;
4416 	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
4417 	case HAMMER2_BREF_TYPE_INDIRECT:
4418 		/*
4419 		 * Optimize indirect blocks in the INITIAL state to avoid
4420 		 * I/O.
4421 		 */
4422 		KKASSERT((chain->flags & HAMMER2_CHAIN_INITIAL) == 0);
4423 		base = &chain->data->npdata[0];
4424 		count = chain->bytes / sizeof(hammer2_blockref_t);
4425 		break;
4426 	case HAMMER2_BREF_TYPE_VOLUME:
4427 		base = &chain->hmp->voldata.sroot_blockset.blockref[0];
4428 		count = HAMMER2_SET_COUNT;
4429 		break;
4430 	case HAMMER2_BREF_TYPE_FREEMAP:
4431 		base = &chain->hmp->voldata.freemap_blockset.blockref[0];
4432 		count = HAMMER2_SET_COUNT;
4433 		break;
4434 	default:
4435 		panic("hammer2_chain_lookup: unrecognized blockref type: %d",
4436 		      chain->bref.type);
4437 		base = NULL;	/* safety */
4438 		count = 0;	/* safety */
4439 	}
4440 	kqsort(base, count, sizeof(*base), hammer2_base_sort_callback);
4441 }
4442 
4443 #endif
4444 
4445 /*
4446  * Chain memory management
4447  */
4448 void
4449 hammer2_chain_wait(hammer2_chain_t *chain)
4450 {
4451 	tsleep(chain, 0, "chnflw", 1);
4452 }
4453 
4454 /*
4455  * Manage excessive memory resource use for chain and related
4456  * structures.
4457  */
4458 void
4459 hammer2_chain_memory_wait(hammer2_pfsmount_t *pmp)
4460 {
4461 	long waiting;
4462 	long count;
4463 	long limit;
4464 #if 0
4465 	static int zzticks;
4466 #endif
4467 
4468 	/*
4469 	 * Atomic check condition and wait.  Also do an early speedup of
4470 	 * the syncer to try to avoid hitting the wait.
4471 	 */
4472 	for (;;) {
4473 		waiting = pmp->inmem_dirty_chains;
4474 		cpu_ccfence();
4475 		count = waiting & HAMMER2_DIRTYCHAIN_MASK;
4476 
4477 		limit = pmp->mp->mnt_nvnodelistsize / 10;
4478 		if (limit < hammer2_limit_dirty_chains)
4479 			limit = hammer2_limit_dirty_chains;
4480 		if (limit < 1000)
4481 			limit = 1000;
4482 
4483 #if 0
4484 		if ((int)(ticks - zzticks) > hz) {
4485 			zzticks = ticks;
4486 			kprintf("count %ld %ld\n", count, limit);
4487 		}
4488 #endif
4489 
4490 		/*
4491 		 * Block if there are too many dirty chains present, wait
4492 		 * for the flush to clean some out.
4493 		 */
4494 		if (count > limit) {
4495 			tsleep_interlock(&pmp->inmem_dirty_chains, 0);
4496 			if (atomic_cmpset_long(&pmp->inmem_dirty_chains,
4497 					       waiting,
4498 				       waiting | HAMMER2_DIRTYCHAIN_WAITING)) {
4499 				speedup_syncer(pmp->mp);
4500 				tsleep(&pmp->inmem_dirty_chains, PINTERLOCKED,
4501 				       "chnmem", hz);
4502 			}
4503 			continue;	/* loop on success or fail */
4504 		}
4505 
4506 		/*
4507 		 * Try to start an early flush before we are forced to block.
4508 		 */
4509 		if (count > limit * 7 / 10)
4510 			speedup_syncer(pmp->mp);
4511 		break;
4512 	}
4513 }
4514 
4515 void
4516 hammer2_chain_memory_inc(hammer2_pfsmount_t *pmp)
4517 {
4518 	if (pmp)
4519 		atomic_add_long(&pmp->inmem_dirty_chains, 1);
4520 }
4521 
4522 void
4523 hammer2_chain_memory_wakeup(hammer2_pfsmount_t *pmp)
4524 {
4525 	long waiting;
4526 
4527 	if (pmp == NULL)
4528 		return;
4529 
4530 	for (;;) {
4531 		waiting = pmp->inmem_dirty_chains;
4532 		cpu_ccfence();
4533 		if (atomic_cmpset_long(&pmp->inmem_dirty_chains,
4534 				       waiting,
4535 				       (waiting - 1) &
4536 					~HAMMER2_DIRTYCHAIN_WAITING)) {
4537 			break;
4538 		}
4539 	}
4540 
4541 	if (waiting & HAMMER2_DIRTYCHAIN_WAITING)
4542 		wakeup(&pmp->inmem_dirty_chains);
4543 }
4544 
4545 static
4546 void
4547 adjreadcounter(hammer2_blockref_t *bref, size_t bytes)
4548 {
4549 	long *counterp;
4550 
4551 	switch(bref->type) {
4552 	case HAMMER2_BREF_TYPE_DATA:
4553 		counterp = &hammer2_iod_file_read;
4554 		break;
4555 	case HAMMER2_BREF_TYPE_INODE:
4556 		counterp = &hammer2_iod_meta_read;
4557 		break;
4558 	case HAMMER2_BREF_TYPE_INDIRECT:
4559 		counterp = &hammer2_iod_indr_read;
4560 		break;
4561 	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
4562 	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
4563 		counterp = &hammer2_iod_fmap_read;
4564 		break;
4565 	default:
4566 		counterp = &hammer2_iod_volu_read;
4567 		break;
4568 	}
4569 	*counterp += bytes;
4570 }
4571