xref: /dragonfly/sys/vfs/hammer2/hammer2_flush.c (revision 82730a9c)
1 /*
2  * Copyright (c) 2011-2013 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in
16  *    the documentation and/or other materials provided with the
17  *    distribution.
18  * 3. Neither the name of The DragonFly Project nor the names of its
19  *    contributors may be used to endorse or promote products derived
20  *    from this software without specific, prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
26  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 
36 #include <sys/cdefs.h>
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/types.h>
40 #include <sys/lock.h>
41 #include <sys/uuid.h>
42 
43 #include "hammer2.h"
44 
45 #define FLUSH_DEBUG 0
46 
47 /*
48  * Recursively flush the specified chain.  The chain is locked and
49  * referenced by the caller and will remain so on return.  The chain
50  * will remain referenced throughout but can temporarily lose its
51  * lock during the recursion to avoid unnecessarily stalling user
52  * processes.
53  */
struct hammer2_flush_info {
	hammer2_chain_t *parent;	/* current parent in the recursion */
	hammer2_trans_t	*trans;		/* flush transaction */
	int		depth;		/* recursion depth, checked against
					   HAMMER2_FLUSH_DEPTH_LIMIT */
	int		diddeferral;	/* count of chains pushed onto
					   flush_list due to depth */
	int		pass;		/* scan2 pass number (1 or 2) */
	int		cache_index;	/* chain lookup cache hint, -1 when
					   unset (presumably; confirm vs
					   hammer2_chain.c) */
	int		domodify;	/* non-zero: parent must be set
					   modified before scan2 */
	struct h2_flush_deferral_list flush_list; /* deferred deep chains */
	hammer2_tid_t	sync_tid;	/* flush synchronization point */
};

typedef struct hammer2_flush_info hammer2_flush_info_t;
67 
/*
 * Forward declarations for the recursive flush machinery implemented below.
 */
static void hammer2_chain_flush_core(hammer2_flush_info_t *info,
				hammer2_chain_t **chainp);
static int hammer2_chain_flush_scan1(hammer2_chain_t *child, void *data);
static int hammer2_chain_flush_scan2(hammer2_chain_t *child, void *data);
static void hammer2_flush_core_update(hammer2_chain_core_t *core,
			  hammer2_flush_info_t *info);
static void hammer2_rollup_stats(hammer2_chain_t *parent,
				hammer2_chain_t *child, int how);
76 
77 /*
78  * Can we ignore a chain for the purposes of flushing modifications
79  * to the media?
80  *
81  * This code is now degenerate.  We used to have to distinguish between
82  * deleted chains and deleted chains associated with inodes that were
83  * still open.  This mechanic has been fixed so the function is now
84  * a simple test.
85  */
86 static __inline
87 int
88 h2ignore_deleted(hammer2_flush_info_t *info, hammer2_chain_t *chain)
89 {
90 	return (chain->delete_tid <= info->sync_tid);
91 }
92 
#if 0
/*
 * Disabled code: fold an inode/data accounting adjustment into the flush
 * info.  A negative 'how' subtracts, otherwise it adds.
 */
static __inline
void
hammer2_updatestats(hammer2_flush_info_t *info, hammer2_blockref_t *bref,
		    int how)
{
	hammer2_key_t bytes;

	if (bref->type != 0) {
		/* low bits of data_off encode the allocation size radix */
		bytes = 1 << (bref->data_off & HAMMER2_OFF_MASK_RADIX);
		if (bref->type == HAMMER2_BREF_TYPE_INODE)
			info->inode_count += how;
		if (how < 0)
			info->data_count -= bytes;
		else
			info->data_count += bytes;
	}
}
#endif
112 
113 /*
114  * Transaction support functions for writing to the filesystem.
115  *
116  * Initializing a new transaction allocates a transaction ID.  Typically
117  * passed a pmp (hmp passed as NULL), indicating a cluster transaction.  Can
118  * be passed a NULL pmp and non-NULL hmp to indicate a transaction on a single
119  * media target.  The latter mode is used by the recovery code.
120  *
121  * TWO TRANSACTION IDs can run concurrently, where one is a flush and the
122  * other is a set of any number of concurrent filesystem operations.  We
123  * can either have <running_fs_ops> + <waiting_flush> + <blocked_fs_ops>
124  * or we can have <running_flush> + <concurrent_fs_ops>.
125  *
126  * During a flush, new fs_ops are only blocked until the fs_ops prior to
127  * the flush complete.  The new fs_ops can then run concurrent with the flush.
128  *
129  * Buffer-cache transactions operate as fs_ops but never block.  A
130  * buffer-cache flush will run either before or after the current pending
131  * flush depending on its state.
132  *
133  * sync_tid vs real_tid.  For flush transactions ONLY, the flush operation
134  * actually uses two transaction ids, one for the flush operation itself,
135  * and <N+1> for any freemap allocations made as a side-effect.  real_tid
136  * is fixed at <N>, sync_tid is adjusted dynamically as-needed.
137  *
138  * NOTE: The sync_tid for a flush's freemap allocation will match the
139  *	 sync_tid of the following <concurrent_fs_ops> transaction(s).
140  *	 The freemap topology will be out-of-step by one transaction id
141  *	 in order to give the flusher a stable freemap topology to flush
142  *	 out.  This is fixed up at mount-time using a quick incremental
143  *	 scan.
144  */
void
hammer2_trans_init(hammer2_trans_t *trans, hammer2_pfsmount_t *pmp,
		   hammer2_mount_t *hmp, int flags)
{
	hammer2_trans_t *head;

	bzero(trans, sizeof(*trans));
	if (pmp) {
		/*
		 * Cluster transaction: derive the media mount from the
		 * first chain of the PFS cluster.
		 */
		trans->pmp = pmp;
		KKASSERT(hmp == NULL);
		hmp = pmp->cluster.chains[0]->hmp;	/* XXX */
	} else {
		/*
		 * Single-media transaction (used e.g. by the recovery code).
		 */
		trans->hmp_single = hmp;
		KKASSERT(hmp);
	}

	hammer2_voldata_lock(hmp);
	trans->flags = flags;
	trans->td = curthread;
	/*trans->delete_gen = 0;*/	/* multiple deletions within trans */

	if (flags & HAMMER2_TRANS_ISFLUSH) {
		/*
		 * If multiple flushes are trying to run we have to
		 * wait until it is our turn.  All flushes are serialized.
		 *
		 * We queue ourselves and then wait to become the head
		 * of the queue, allowing all prior flushes to complete.
		 *
		 * A unique transaction id is required to avoid confusion
		 * when updating the block tables.
		 */
		++hmp->flushcnt;
		++hmp->voldata.alloc_tid;
		trans->sync_tid = hmp->voldata.alloc_tid;
		trans->real_tid = trans->sync_tid;
		/*
		 * Second alloc_tid bump reserves <N+1> for freemap
		 * allocations made as a side-effect of this flush (see
		 * sync_tid vs real_tid in the comment above).
		 */
		++hmp->voldata.alloc_tid;
		TAILQ_INSERT_TAIL(&hmp->transq, trans, entry);
		if (TAILQ_FIRST(&hmp->transq) != trans) {
			trans->blocked = 1;
			while (trans->blocked) {
				/* unblocked/woken by hammer2_trans_done() */
				lksleep(&trans->sync_tid, &hmp->voldatalk,
					0, "h2multf", hz);
			}
		}
	} else if (hmp->flushcnt == 0) {
		/*
		 * No flushes are pending, we can go.
		 */
		TAILQ_INSERT_TAIL(&hmp->transq, trans, entry);
		trans->sync_tid = hmp->voldata.alloc_tid;
		trans->real_tid = trans->sync_tid;

		/* XXX improve/optimize inode allocation */
	} else {
		/*
		 * One or more flushes are pending.  We insert after
		 * the current flush and may block.  We have priority
		 * over any flushes that are not the current flush.
		 *
		 * TRANS_BUFCACHE transactions cannot block.
		 */
		TAILQ_FOREACH(head, &hmp->transq, entry) {
			if (head->flags & HAMMER2_TRANS_ISFLUSH)
				break;
		}
		KKASSERT(head);
		TAILQ_INSERT_AFTER(&hmp->transq, head, trans, entry);
		/* share the flush's freemap tid <N+1> (see NOTE above) */
		trans->sync_tid = head->real_tid + 1;
		trans->real_tid = trans->sync_tid;

		if ((trans->flags & HAMMER2_TRANS_BUFCACHE) == 0) {
			if (TAILQ_FIRST(&hmp->transq) != head) {
				trans->blocked = 1;
				while (trans->blocked) {
					/* woken by hammer2_trans_done() */
					lksleep(&trans->sync_tid,
						&hmp->voldatalk, 0,
						"h2multf", hz);
				}
			}
		}
	}
	if (flags & HAMMER2_TRANS_NEWINODE) {
		/* allocate an inode number for the caller's new inode */
		if (hmp->voldata.inode_tid < HAMMER2_INODE_START)
			hmp->voldata.inode_tid = HAMMER2_INODE_START;
		trans->inode_tid = hmp->voldata.inode_tid++;
	}
	hammer2_voldata_unlock(hmp, 0);
}
234 
235 void
236 hammer2_trans_done(hammer2_trans_t *trans)
237 {
238 	hammer2_mount_t *hmp;
239 	hammer2_trans_t *head;
240 	hammer2_trans_t *scan;
241 
242 	if (trans->pmp)
243 		hmp = trans->pmp->cluster.chains[0]->hmp;
244 	else
245 		hmp = trans->hmp_single;
246 
247 	/*
248 	 * Remove and adjust flushcnt
249 	 */
250 	hammer2_voldata_lock(hmp);
251 	TAILQ_REMOVE(&hmp->transq, trans, entry);
252 	if (trans->flags & HAMMER2_TRANS_ISFLUSH)
253 		--hmp->flushcnt;
254 
255 	/*
256 	 * Unblock the head of the queue and any additional transactions
257 	 * up to the next flush.
258 	 */
259 	head = TAILQ_FIRST(&hmp->transq);
260 	if (head && head->blocked) {
261 		head->blocked = 0;
262 		wakeup(&head->sync_tid);
263 
264 		scan = TAILQ_NEXT(head, entry);
265 		while (scan && (scan->flags & HAMMER2_TRANS_ISFLUSH) == 0) {
266 			if (scan->blocked) {
267 				scan->blocked = 0;
268 				wakeup(&scan->sync_tid);
269 			}
270 			scan = TAILQ_NEXT(scan, entry);
271 		}
272 	}
273 	hammer2_voldata_unlock(hmp, 0);
274 }
275 
276 /*
277  * Flush the chain and all modified sub-chains through the specified
278  * synchronization point (sync_tid), propagating parent chain modifications
279  * and mirror_tid updates back up as needed.  Since we are recursing downward
280  * we do not have to deal with the complexities of multi-homed chains (chains
281  * with multiple parents).
282  *
283  * Caller must have interlocked against any non-flush-related modifying
284  * operations in progress whose modify_tid values are less than or equal
285  * to the passed sync_tid.
286  *
287  * Caller must have already vetted synchronization points to ensure they
288  * are properly flushed.  Only snapshots and cluster flushes can create
289  * these sorts of synchronization points.
290  *
291  * This routine can be called from several places but the most important
292  * is from the hammer2_vop_reclaim() function.  We want to try to completely
293  * clean out the inode structure to prevent disconnected inodes from
294  * building up and blowing out the kmalloc pool.  However, it is not actually
295  * necessary to flush reclaimed inodes to maintain HAMMER2's crash recovery
296  * capability.
297  *
298  * chain is locked on call and will remain locked on return.  If a flush
299  * occurred, the chain's MOVED bit will be set indicating that its parent
300  * (which is not part of the flush) should be updated.  The chain may be
301  * replaced by the call.
302  */
void
hammer2_chain_flush(hammer2_trans_t *trans, hammer2_chain_t **chainp)
{
	hammer2_chain_t *chain = *chainp;
	hammer2_chain_t *scan;
	hammer2_chain_core_t *core;
	hammer2_flush_info_t info;
	int loops;

	/*
	 * Execute the recursive flush and handle deferrals.
	 *
	 * Chains can be ridiculously long (thousands deep), so to
	 * avoid blowing out the kernel stack the recursive flush has a
	 * depth limit.  Elements at the limit are placed on a list
	 * for re-execution after the stack has been popped.
	 */
	bzero(&info, sizeof(info));
	TAILQ_INIT(&info.flush_list);
	info.trans = trans;
	info.sync_tid = trans->sync_tid;
	info.cache_index = -1;		/* no lookup cache hint yet */

	core = chain->core;
#if FLUSH_DEBUG
	kprintf("CHAIN FLUSH trans %p.%016jx chain %p.%d mod %016jx upd %016jx\n", trans, trans->sync_tid, chain, chain->bref.type, chain->modify_tid, core->update_lo);
#endif

	/*
	 * Extra ref needed because flush_core expects it when replacing
	 * chain.
	 */
	hammer2_chain_ref(chain);
	loops = 0;

	for (;;) {
		/*
		 * Unwind deep recursions which had been deferred.  This
		 * can leave MOVED set for these chains, which will be
		 * handled when we [re]flush chain after the unwind.
		 */
		while ((scan = TAILQ_FIRST(&info.flush_list)) != NULL) {
			KKASSERT(scan->flags & HAMMER2_CHAIN_DEFERRED);
			TAILQ_REMOVE(&info.flush_list, scan, flush_node);
			atomic_clear_int(&scan->flags, HAMMER2_CHAIN_DEFERRED);

			/*
			 * Now that we've popped back up we can do a secondary
			 * recursion on the deferred elements.
			 *
			 * NOTE: hammer2_chain_flush() may replace scan.
			 */
			if (hammer2_debug & 0x0040)
				kprintf("deferred flush %p\n", scan);
			hammer2_chain_lock(scan, HAMMER2_RESOLVE_MAYBE);
			hammer2_chain_drop(scan);	/* ref from deferral */
			hammer2_chain_flush(trans, &scan);
			hammer2_chain_unlock(scan);
		}

		/*
		 * [re]flush chain.  Reset the deferral counter so we can
		 * detect new deferrals generated by this pass.
		 */
		info.diddeferral = 0;
		hammer2_chain_flush_core(&info, &chain);
#if FLUSH_DEBUG
		kprintf("flush_core_done parent=<base> chain=%p.%d %08x\n",
			chain, chain->bref.type, chain->flags);
#endif

		/*
		 * Only loop if deep recursions have been deferred.
		 */
		if (TAILQ_EMPTY(&info.flush_list))
			break;

		/* diagnostic: the deferral list should drain quickly */
		if (++loops % 1000 == 0) {
			kprintf("hammer2_chain_flush: excessive loops on %p\n",
				chain);
			if (hammer2_debug & 0x100000)
				Debugger("hell4");
		}
	}
	/* drop the extra ref taken above; chain may have been replaced */
	hammer2_chain_drop(chain);
	*chainp = chain;
}
389 
390 /*
391  * This is the core of the chain flushing code.  The chain is locked by the
392  * caller and must also have an extra ref on it by the caller, and remains
393  * locked and will have an extra ref on return.
394  *
395  * If the flush accomplished any work chain will be flagged MOVED
396  * indicating a copy-on-write propagation back up is required.
397  * Deep sub-nodes may also have been entered onto the deferral list.
398  * MOVED is never set on the volume root.
399  *
400  * NOTE: modify_tid is different from MODIFIED.  modify_tid is updated
401  *	 only when a chain is specifically modified, and not updated
402  *	 for copy-on-write propagations.  MODIFIED is set on any modification
403  *	 including copy-on-write propagations.
404  *
405  * NOTE: We are responsible for updating chain->bref.mirror_tid and
406  *	 core->update_lo  The caller is responsible for processing us into
407  *	 our parent (if any).
408  *
409  *	 We are also responsible for updating chain->core->update_lo to
410  *	 prevent repeated recursions due to deferrals.
411  *
412  * WARNING! bref.mirror_tid may only be updated if either the MODIFIED bit
413  *	    is already zero or if we clear it.
414  *
415  */
416 static void
417 hammer2_chain_flush_core(hammer2_flush_info_t *info, hammer2_chain_t **chainp)
418 {
419 	hammer2_chain_t *chain = *chainp;
420 	hammer2_chain_t *saved_parent;
421 	hammer2_mount_t *hmp;
422 	hammer2_chain_core_t *core;
423 #if 0
424 	hammer2_blockref_t *bref;
425 	char *bdata;
426 	hammer2_io_t *dio;
427 	int error;
428 #endif
429 	int diddeferral;
430 
431 	hmp = chain->hmp;
432 	core = chain->core;
433 	diddeferral = info->diddeferral;
434 
435 	/*
436 	 * Check if we even have any work to do.
437 	 *
438 	 * We do not update core->update_lo because there might be other
439 	 * paths to the core and we haven't actually checked it.
440 	 *
441 	 * This bit of code is capable of short-cutting entire sub-trees
442 	 * if they have not been touched.
443 	 */
444 	if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0 &&
445 	    (core->update_lo >= info->sync_tid ||
446 	     chain->bref.mirror_tid >= info->sync_tid ||
447 	     chain->bref.mirror_tid >= core->update_hi)) {
448 		KKASSERT(chain->modify_tid <= info->sync_tid);
449 		/* don't update update_lo, there may be other paths to core */
450 		/* don't update bref.mirror_tid, scan2 is not called */
451 		return;
452 	}
453 
454 	/*
455 	 * Ignore chains which have already been flushed through the current
456 	 * synchronization point.
457 	 */
458 	KKASSERT (chain->bref.mirror_tid <= info->sync_tid);
459 	if (chain->bref.mirror_tid == info->sync_tid) {
460 		/* do not update core->update_lo, there may be another path */
461 		return;
462 	}
463 
464 	/*
465 	 * Ignore chains modified beyond the current flush point.  These
466 	 * will be treated as if they did not exist.  Subchains with lower
467 	 * modify_tid's will still be accessible via other parents.
468 	 *
469 	 * Do not update bref.mirror_tid here, it will interfere with
470 	 * synchronization.  e.g. inode flush tid 1, concurrent D-D tid 2,
471 	 * then later on inode flush tid 2.  If we were to set mirror_tid
472 	 * to 1 during inode flush tid 1 the blockrefs would only be partially
473 	 * updated (and likely panic).
474 	 *
475 	 * Do not update core->update_lo here, there might be other paths
476 	 * to the core and we haven't actually flushed it.
477 	 *
478 	 * (vchain and fchain are exceptions since they cannot be duplicated)
479 	 */
480 	if (chain->modify_tid > info->sync_tid &&
481 	    chain != &hmp->fchain && chain != &hmp->vchain) {
482 		/* do not update bref.mirror_tid, scan2 ignores chain */
483 		/* do not update core->update_lo, there may be another path */
484 		return;
485 	}
486 
487 	saved_parent = info->parent;
488 	info->parent = chain;
489 retry:
490 
491 	/*
492 	 * Chains deleted as-of the flush synchronization point require some
493 	 * special early handling to avoid double flushing because multiple
494 	 * deletions are sometimes forced within the same transaction.
495 	 * Allowing the flush to proceed through more than one path can wind
496 	 * up updating the chain's block table twice and cause an assertion.
497 	 *
498 	 * We don't check the 'same transaction' part but simply punt in this
499 	 * situation.  We must still check for multiple deletions, since any
500 	 * terminal (non-stale) deletion still requires processing to at
501 	 * least clean up the children, and also (for inodes) there might
502 	 * still be an open descriptor.
503 	 *
504 	 * Clear MODIFIED but set MOVED to ensure that the parent still
505 	 * deals with it.
506 	 */
507 	if (chain->delete_tid <= info->sync_tid &&
508 	    (chain->flags & HAMMER2_CHAIN_DUPLICATED)) {
509 		if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
510 #if 0
511 			/*
512 			 * XXX should be able to invalidate the buffer here.
513 			 * XXX problem if reused, snapshotted, or reactivated.
514 			 */
515 			if (chain->dio) {
516 				hammer2_io_setinval(chain->dio, chain->bytes);
517 			}
518 #endif
519 			if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
520 				hammer2_chain_ref(chain);
521 				atomic_set_int(&chain->flags,
522 					       HAMMER2_CHAIN_MOVED);
523 			}
524 			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
525 			hammer2_chain_memory_wakeup(chain->pmp);
526 			hammer2_chain_drop(chain);
527 		}
528 
529 		/*
530 		 * Update mirror_tid, indicating that chain is synchronized
531 		 * on its modification and block table.  This probably isn't
532 		 * needed since scan2 should ignore deleted chains anyway.
533 		 *
534 		 * NOTE: bref.mirror_tid cannot be updated
535 		 *	 unless MODIFIED is cleared or already
536 		 *	 clear.
537 		 */
538 		if (chain->bref.mirror_tid < info->sync_tid)
539 			chain->bref.mirror_tid = info->sync_tid;
540 		/* do not update core->update_lo, there may be another path */
541 		info->parent = saved_parent;
542 		return;
543 	}
544 
545 	/*
546 	 * Recurse if we are not up-to-date.  Once we are done we will
547 	 * update update_lo if there were no deferrals.  update_lo can become
548 	 * higher than update_hi and is used to prevent re-recursions during
549 	 * the same flush cycle.
550 	 *
551 	 * update_hi was already checked and prevents initial recursions on
552 	 * subtrees which have not been modified.
553 	 *
554 	 * NOTE: We must recurse whether chain is flagged DELETED or not.
555 	 *	 However, if it is flagged DELETED we limit sync_tid to
556 	 *	 delete_tid to ensure that the chain's bref.mirror_tid is
557 	 *	 not fully updated and causes it to miss the non-DELETED
558 	 *	 path.
559 	 *
560 	 * NOTE: If a deferral occurs hammer2_chain_flush() will flush the
561 	 *	 deferred chain independently which will update it's
562 	 *	 bref.mirror_tid and prevent it from deferred again.
563 	 */
564 	if (chain->bref.mirror_tid < info->sync_tid &&
565 	    chain->bref.mirror_tid < core->update_hi) {
566 		hammer2_chain_layer_t *layer;
567 		int saved_domodify;
568 		int save_gen;
569 
570 		/*
571 		 * Races will bump update_hi above trans->sync_tid and should
572 		 * not affect this test.
573 		 *
574 		 * We don't want to set our chain to MODIFIED gratuitously.
575 		 *
576 		 * We need an extra ref on chain because we are going to
577 		 * release its lock temporarily in our child loop.
578 		 */
579 
580 		/*
581 		 * Run two passes.  The first pass handles MODIFIED and
582 		 * update_lo recursions while the second pass handles
583 		 * MOVED chains on the way back up.
584 		 *
585 		 * If the stack gets too deep we defer the chain.   Since
586 		 * hammer2_chain_core's can be shared at multiple levels
587 		 * in the tree, we may encounter a chain that we had already
588 		 * deferred.  We could undefer it but it will probably just
589 		 * defer again so it is best to leave it deferred.
590 		 *
591 		 * Scan1 is recursive.
592 		 *
593 		 * NOTE: The act of handling a modified/submodified chain can
594 		 *	 cause the MOVED Flag to be set.  It can also be set
595 		 *	 via hammer2_chain_delete() and in other situations.
596 		 *
597 		 * NOTE: RB_SCAN() must be used instead of RB_FOREACH()
598 		 *	 because children can be physically removed during
599 		 *	 the scan.
600 		 *
601 		 * NOTE: We would normally not care about insertions except
602 		 *	 that some insertions might occur from the flush
603 		 *	 itself, so loop on generation number changes.
604 		 */
605 		saved_domodify = info->domodify;
606 		info->domodify = 0;
607 
608 		if (chain->flags & HAMMER2_CHAIN_DEFERRED) {
609 			++info->diddeferral;
610 		} else if (info->depth == HAMMER2_FLUSH_DEPTH_LIMIT) {
611 			if ((chain->flags & HAMMER2_CHAIN_DEFERRED) == 0) {
612 				hammer2_chain_ref(chain);
613 				TAILQ_INSERT_TAIL(&info->flush_list,
614 						  chain, flush_node);
615 				atomic_set_int(&chain->flags,
616 					       HAMMER2_CHAIN_DEFERRED);
617 			}
618 			++info->diddeferral;
619 		} else {
620 			spin_lock(&core->cst.spin);
621 			KKASSERT(core->good == 0x1234 && core->sharecnt > 0);
622 			do {
623 				save_gen = core->generation;
624 				TAILQ_FOREACH_REVERSE(layer, &core->layerq,
625 						      h2_layer_list, entry) {
626 					++layer->refs;
627 					KKASSERT(layer->good == 0xABCD);
628 					RB_SCAN(hammer2_chain_tree,
629 						&layer->rbtree,
630 						NULL, hammer2_chain_flush_scan1,
631 						info);
632 					--layer->refs;
633 				}
634 			} while (core->generation != save_gen);
635 			spin_unlock(&core->cst.spin);
636 		}
637 
638 		if (info->parent != chain) {
639 			kprintf("ZZZ\n");
640 			hammer2_chain_drop(chain);
641 			hammer2_chain_ref(info->parent);
642 		}
643 		chain = info->parent;
644 
645 		/*
646 		 * chain was unlocked during the scan1 recursion and may
647 		 * have been deleted, destroyed, or even synchronously
648 		 * flushed due to aliasing.
649 		 *
650 		 * The flush continues normally in the first two places as
651 		 * the deletion or destruction does NOT affect the flush
652 		 * as-of the flush synchronization point.
653 		 *
654 		 * We must detect the last case and avoid flushing chain twice.
655 		 */
656 #if 0
657 		if (chain->delete_tid <= info->sync_tid &&
658 		    (chain->flags & HAMMER2_CHAIN_DUPLICATED)) {
659 			kprintf("xxx\n");
660 			goto retry;
661 		}
662 #endif
663 		if (chain->bref.mirror_tid >= info->sync_tid ||
664 		    chain->bref.mirror_tid >= core->update_hi) {
665 			kprintf("yyy\n");
666 			goto retry;
667 		}
668 
669 		/*
670 		 * If any deferral occurred we must set domodify to 0 to avoid
671 		 * potentially modifying the parent twice (now and when we run
672 		 * the deferral list), as doing so could cause the blockref
673 		 * update to run on a block array which has already been
674 		 * updated.
675 		 */
676 		if (info->domodify && diddeferral != info->diddeferral)
677 			info->domodify = 0;
678 
679 		/*
680 		 * THIS IS THE ONLY POINT IN THE FLUSH WHERE A PARENT IN THE
681 		 * NOMINAL TOPOLOGY, OTHER THAN FREEMAP ALLOCATIONS, IS
682 		 * MODIFIED.  FREEMAP ALLOCATIONS WILL MODIFY THE FREEMAP
683 		 * TOPOLOGY WITH SYNC_TID+1 AND DO NOT AFFECT THE CURRENT
684 		 * FLUSH.
685 		 *
686 		 * Modifying the parent can create issues if the current
687 		 * parent is already in a modified state with an earlier
688 		 * transaction id.  We want to avoid an endless flush loop
689 		 * on the original parent so we must clear its modified bit
690 		 * after creating the new parent, if they wind up being
691 		 * different.  Care must also be taken to avoid flushing the
692 		 * same parent twice.
693 		 *
694 		 * We are responsible for setting the parent into a modified
695 		 * state before we scan the children to update the parent's
696 		 * block table.  This must essentially be done as an atomic
697 		 * operation (the parent must remain locked throughout the
698 		 * operation), otherwise other transactions can squeeze a
699 		 * delete-duplicate in and create block table havoc.
700 		 *
701 		 * NOTE: Blockrefs are only updated on live chains.
702 		 *
703 		 * NOTE: Modifying the parent generally causes a
704 		 *	 delete-duplicate to occur from within the flush
705 		 *	 itself, with an allocation from the freemap occuring
706 		 *	 as an additional side-effect.
707 		 *
708 		 * NOTE: If the parent was deleted our modified chain will
709 		 *	 also be marked deleted, but since it inherits the
710 		 *	 parent's delete_tid it will still appear to be
711 		 *	 'live' for the purposes of the flush.
712 		 */
713 		if (info->domodify && !h2ignore_deleted(info, chain)) {
714 			KKASSERT(chain->modify_tid < info->sync_tid);
715 
716 			/*
717 			 * The scan1 loop and/or flush_core is reentrant,
718 			 * particularly when core->generation changes.  To
719 			 * avoid havoc we have to prevent repetitive
720 			 * delete-duplicates of the same chain.
721 			 *
722 			 * After executing the modify set the original chain's
723 			 * bref.mirror_tid to prevent any reentrancy during
724 			 * the current flush cycle.
725 			 */
726 			hammer2_chain_modify(info->trans, &info->parent,
727 					     HAMMER2_MODIFY_NO_MODIFY_TID);
728 			if (info->parent != chain) {
729 				/*
730 				 * NOTE: bref.mirror_tid cannot be updated
731 				 *	 unless MODIFIED is cleared or already
732 				 *	 clear.
733 				 */
734 				if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
735 					atomic_clear_int(&chain->flags,
736 							HAMMER2_CHAIN_MODIFIED);
737 					hammer2_chain_memory_wakeup(chain->pmp);
738 					hammer2_chain_drop(chain);
739 				}
740 				if (chain->bref.mirror_tid < info->sync_tid)
741 					chain->bref.mirror_tid = info->sync_tid;
742 				hammer2_chain_drop(chain);
743 				hammer2_chain_ref(info->parent);
744 			}
745 			chain = info->parent;
746 		}
747 
748 		KKASSERT(chain == info->parent);
749 
750 		/*
751 		 * Handle successfully flushed children who are in the MOVED
752 		 * state on the way back up the recursion.  This can have
753 		 * the side-effect of clearing MOVED.
754 		 *
755 		 * Scan2 may replace info->parent.  If it does it will also
756 		 * replace the extra ref we made.
757 		 *
758 		 * Scan2 is non-recursive.
759 		 */
760 		if (diddeferral != info->diddeferral) {
761 			spin_lock(&core->cst.spin);
762 		} else {
763 			KKASSERT(chain == info->parent);
764 			KKASSERT(info->domodify == 0 ||
765 				 (chain->flags & HAMMER2_CHAIN_FLUSHED) == 0);
766 			atomic_set_int(&chain->flags, HAMMER2_CHAIN_FLUSHED);
767 			spin_lock(&core->cst.spin);
768 			KKASSERT(core->good == 0x1234 && core->sharecnt > 0);
769 			KKASSERT(info->parent->core == core);
770 			TAILQ_FOREACH_REVERSE(layer, &core->layerq,
771 					      h2_layer_list, entry) {
772 				info->pass = 1;
773 				++layer->refs;
774 				KKASSERT(layer->good == 0xABCD);
775 				RB_SCAN(hammer2_chain_tree, &layer->rbtree,
776 					NULL, hammer2_chain_flush_scan2, info);
777 				info->pass = 2;
778 				RB_SCAN(hammer2_chain_tree, &layer->rbtree,
779 					NULL, hammer2_chain_flush_scan2, info);
780 				--layer->refs;
781 			}
782 		}
783 
784 		/*
785 		 * info->parent must not have been replaced again
786 		 */
787 		KKASSERT(info->parent == chain);
788 
789 		*chainp = chain;
790 
791 		hammer2_chain_layer_check_locked(chain->hmp, core);
792 		spin_unlock(&core->cst.spin);
793 
794 		/*
795 		 * Update the core only if no deferrals occurred.  Otherwise
796 		 * we could end up clearing the MOVED bit in the children
797 		 * prematurely.
798 		 */
799 		if (diddeferral == info->diddeferral)
800 			hammer2_flush_core_update(core, info);
801 
802 		info->domodify = saved_domodify;
803 		KKASSERT(chain->refs > 1);
804 	} else {
805 		/*
806 		 * Update the core, no deferrals occurred in this path.
807 		 */
808 		hammer2_flush_core_update(core, info);
809 	}
810 	info->parent = saved_parent;
811 
812 #if FLUSH_DEBUG
813 	kprintf("POP    %p.%d defer=%d\n", chain, chain->bref.type, diddeferral);
814 #endif
815 
816 	/*
817 	 * Do not flush the chain if there were any deferrals.  It will be
818 	 * retried later after the deferrals are independently handled.
819 	 * Do not update update_lo or bref.mirror_tid.
820 	 */
821 	if (diddeferral != info->diddeferral) {
822 		if (hammer2_debug & 0x0008) {
823 			kprintf("%*.*s} %p/%d %04x (deferred)",
824 				info->depth, info->depth, "",
825 				chain, chain->refs, chain->flags);
826 		}
827 		/* do not update core->update_lo */
828 		/* do not update bref.mirror_tid */
829 		return;
830 	}
831 
832 	KKASSERT(chain->bref.mirror_tid < info->sync_tid);
833 
834 	/*
835 	 * Non-deferral path, chain is now deterministically being flushed.
836 	 * We've finished running the recursion and the blockref update.
837 	 *
838 	 * update bref.mirror_tid.  update_lo has already been updated.
839 	 *
840 	 * After this point we MUST dipose of the MODIFIED bit on chain.
841 	 */
842 	if (chain->bref.mirror_tid < info->sync_tid)
843 		chain->bref.mirror_tid = info->sync_tid;
844 
845 	/*
846 	 * Deal with deleted and destroyed chains on the way back up.
847 	 *
848 	 * Otherwise a deleted chain can be optimized by clearing MODIFIED
849 	 * without bothering to write it out.
850 	 *
851 	 * NOTE: We optimize this by noting that only 'inode' chains require
852 	 *	 this treatment.  When a file with an open descriptor is
853 	 *	 deleted only its inode is marked deleted.  Other deletions,
854 	 *	 such as indirect block deletions, will no longer be visible
855 	 *	 to the live filesystem and do not need to be updated.
856 	 */
857 	if (h2ignore_deleted(info, chain)) {
858 		/*
859 		 * At the moment we unconditionally set the MOVED bit because
860 		 * there are situations where it might not have been set due
861 		 * to similar delete-destroyed optimizations, and the parent
862 		 * of the parent still may need to be notified of the deletion.
863 		 */
864 		if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
865 			hammer2_chain_ref(chain);
866 			atomic_set_int(&chain->flags,
867 				       HAMMER2_CHAIN_MOVED);
868 		}
869 		if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
870 #if 0
871 			/*
872 			 * XXX should be able to invalidate the buffer here.
873 			 * XXX problem if reused, snapshotted, or reactivated.
874 			 */
875 			if (chain->dio) {
876 				hammer2_io_setinval(chain->dio, chain->bytes);
877 			}
878 #endif
879 			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
880 			hammer2_chain_memory_wakeup(chain->pmp);
881 			hammer2_chain_drop(chain);
882 		}
883 		return;
884 	}
885 
886 	/*
887 	 * A degenerate flush might not have flushed anything and thus not
888 	 * processed modified blocks on the way back up.  Detect the case.
889 	 *
890 	 * This case can occur when a create, modify, and rename (to a
891 	 * different part of the topology) occurs in the same flush,
892 	 * resulting in a parent which effectively needs no modification.
893 	 */
894 	if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0) {
895 #if 0
896 		kprintf("chain %p.%d %08x recursed but wasn't "
897 			"modified mirr=%016jx "
898 			"update_lo=%016jx synctid=%016jx\n",
899 			chain, chain->bref.type, chain->flags,
900 			chain->bref.mirror_tid,
901 			core->update_lo, info->sync_tid);
902 #endif
903 #if 0
904 		if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
905 			hammer2_chain_ref(chain);
906 			atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
907 		}
908 #endif
909 		return;
910 	}
911 
912 	/*
913 	 * Issue flush.
914 	 *
915 	 * A DELETED node that reaches this point must be flushed for
916 	 * synchronization point consistency.
917 	 *
918 	 * Update bref.mirror_tid, clear MODIFIED, and set MOVED.
919 	 *
920 	 * The caller will update the parent's reference to this chain
921 	 * by testing MOVED as long as the modification was in-bounds.
922 	 *
923 	 * MOVED is never set on the volume root as there is no parent
924 	 * to adjust.
925 	 */
926 	if (hammer2_debug & 0x1000) {
927 		kprintf("Flush %p.%d %016jx/%d sync_tid=%016jx data=%016jx\n",
928 			chain, chain->bref.type,
929 			chain->bref.key, chain->bref.keybits,
930 			info->sync_tid, chain->bref.data_off);
931 	}
932 	if (hammer2_debug & 0x2000) {
933 		Debugger("Flush hell");
934 	}
935 
936 	atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
937 	hammer2_chain_memory_wakeup(chain->pmp);
938 
939 	if ((chain->flags & HAMMER2_CHAIN_MOVED) ||
940 	    chain == &hmp->vchain ||
941 	    chain == &hmp->fchain) {
942 		/*
943 		 * Drop the ref from the MODIFIED bit we cleared,
944 		 * net -1 ref.
945 		 */
946 		hammer2_chain_drop(chain);
947 	} else {
948 		/*
949 		 * Drop the ref from the MODIFIED bit we cleared and
950 		 * set a ref for the MOVED bit we are setting.  Net 0 refs.
951 		 */
952 		atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
953 	}
954 
955 	/*
956 	 * If this is part of a recursive flush we can go ahead and write
957 	 * out the buffer cache buffer and pass a new bref back up the chain
958 	 * via the MOVED bit.
959 	 *
960 	 * Volume headers are NOT flushed here as they require special
961 	 * processing.
962 	 */
963 	switch(chain->bref.type) {
964 	case HAMMER2_BREF_TYPE_FREEMAP:
965 		hammer2_modify_volume(hmp);
966 		hmp->voldata.freemap_tid = hmp->fchain.bref.mirror_tid;
967 		break;
968 	case HAMMER2_BREF_TYPE_VOLUME:
969 		/*
970 		 * The free block table is flushed by hammer2_vfs_sync()
971 		 * before it flushes vchain.  We must still hold fchain
972 		 * locked while copying voldata to volsync, however.
973 		 */
974 		hammer2_chain_lock(&hmp->fchain, HAMMER2_RESOLVE_ALWAYS);
975 #if 0
976 		if ((hmp->fchain.flags & HAMMER2_CHAIN_MODIFIED) ||
977 		    hmp->voldata.freemap_tid < info->trans->sync_tid) {
978 			/* this will modify vchain as a side effect */
979 			hammer2_chain_t *tmp = &hmp->fchain;
980 			hammer2_chain_flush(info->trans, &tmp);
981 			KKASSERT(tmp == &hmp->fchain);
982 		}
983 #endif
984 
985 		/*
986 		 * There is no parent to our root vchain and fchain to
987 		 * synchronize the bref to, their updated mirror_tid's
988 		 * must be synchronized to the volume header.
989 		 */
990 		hmp->voldata.mirror_tid = chain->bref.mirror_tid;
991 		/*hmp->voldata.freemap_tid = hmp->fchain.bref.mirror_tid;*/
992 
993 		/*
994 		 * The volume header is flushed manually by the syncer, not
995 		 * here.  All we do here is adjust the crc's.
996 		 */
997 		KKASSERT(chain->data != NULL);
998 		KKASSERT(chain->dio == NULL);
999 
1000 		hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT1]=
1001 			hammer2_icrc32(
1002 				(char *)&hmp->voldata +
1003 				 HAMMER2_VOLUME_ICRC1_OFF,
1004 				HAMMER2_VOLUME_ICRC1_SIZE);
1005 		hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT0]=
1006 			hammer2_icrc32(
1007 				(char *)&hmp->voldata +
1008 				 HAMMER2_VOLUME_ICRC0_OFF,
1009 				HAMMER2_VOLUME_ICRC0_SIZE);
1010 		hmp->voldata.icrc_volheader =
1011 			hammer2_icrc32(
1012 				(char *)&hmp->voldata +
1013 				 HAMMER2_VOLUME_ICRCVH_OFF,
1014 				HAMMER2_VOLUME_ICRCVH_SIZE);
1015 		hmp->volsync = hmp->voldata;
1016 		atomic_set_int(&chain->flags, HAMMER2_CHAIN_VOLUMESYNC);
1017 		hammer2_chain_unlock(&hmp->fchain);
1018 		break;
1019 	case HAMMER2_BREF_TYPE_DATA:
1020 		/*
1021 		 * Data elements have already been flushed via the logical
1022 		 * file buffer cache.  Their hash was set in the bref by
1023 		 * the vop_write code.
1024 		 *
1025 		 * Make sure any device buffer(s) have been flushed out here.
1026 		 * (there aren't usually any to flush).
1027 		 */
1028 #if 0
1029 		/* XXX */
1030 		/* chain and chain->bref, NOWAIT operation */
1031 #endif
1032 		break;
1033 #if 0
1034 	case HAMMER2_BREF_TYPE_INDIRECT:
1035 		/*
1036 		 * Indirect blocks may be in an INITIAL state.  Use the
1037 		 * chain_lock() call to ensure that the buffer has been
1038 		 * instantiated (even though it is already locked the buffer
1039 		 * might not have been instantiated).
1040 		 *
1041 		 * Only write the buffer out if it is dirty, it is possible
1042 		 * the operating system had already written out the buffer.
1043 		 */
1044 		hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
1045 		KKASSERT(chain->dio != NULL);
1046 
1047 		chain->data = NULL;
1048 		hammer2_io_bqrelse(&chain->dio);
1049 		hammer2_chain_unlock(chain);
1050 		break;
1051 #endif
1052 	case HAMMER2_BREF_TYPE_INDIRECT:
1053 	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1054 	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
1055 	case HAMMER2_BREF_TYPE_INODE:
1056 		/*
1057 		 * Device-backed.  Buffer will be flushed by the sync
1058 		 * code XXX.
1059 		 */
1060 		KKASSERT((chain->flags & HAMMER2_CHAIN_EMBEDDED) == 0);
1061 		break;
1062 	default:
1063 		KKASSERT(chain->flags & HAMMER2_CHAIN_EMBEDDED);
1064 		panic("hammer2_chain_flush_core: unsupported embedded bref %d",
1065 		      chain->bref.type);
1066 		/* NOT REACHED */
1067 #if 0
1068 		/*
1069 		 * Embedded elements have to be flushed out.
1070 		 * (Basically just BREF_TYPE_INODE).
1071 		 */
1072 		KKASSERT(chain->flags & HAMMER2_CHAIN_EMBEDDED);
1073 		KKASSERT(chain->data != NULL);
1074 		KKASSERT(chain->dio == NULL);
1075 		bref = &chain->bref;
1076 
1077 		KKASSERT((bref->data_off & HAMMER2_OFF_MASK) != 0);
1078 		KKASSERT(HAMMER2_DEC_CHECK(chain->bref.methods) ==
1079 			 HAMMER2_CHECK_ISCSI32 ||
1080 			 HAMMER2_DEC_CHECK(chain->bref.methods) ==
1081 			 HAMMER2_CHECK_FREEMAP);
1082 
1083 		/*
1084 		 * The data is embedded, we have to acquire the
1085 		 * buffer cache buffer and copy the data into it.
1086 		 */
1087 		error = hammer2_io_bread(hmp, bref->data_off, chain->bytes,
1088 					 &dio);
1089 		KKASSERT(error == 0);
1090 		bdata = hammer2_io_data(dio, bref->data_off);
1091 
1092 		/*
1093 		 * Copy the data to the buffer, mark the buffer
1094 		 * dirty, and convert the chain to unmodified.
1095 		 */
1096 		bcopy(chain->data, bdata, chain->bytes);
1097 		hammer2_io_bdwrite(&dio);
1098 
1099 		switch(HAMMER2_DEC_CHECK(chain->bref.methods)) {
1100 		case HAMMER2_CHECK_FREEMAP:
1101 			chain->bref.check.freemap.icrc32 =
1102 				hammer2_icrc32(chain->data, chain->bytes);
1103 			break;
1104 		case HAMMER2_CHECK_ISCSI32:
1105 			chain->bref.check.iscsi32.value =
1106 				hammer2_icrc32(chain->data, chain->bytes);
1107 			break;
1108 		default:
1109 			panic("hammer2_flush_core: bad crc type");
1110 			break; /* NOT REACHED */
1111 		}
1112 		if (chain->bref.type == HAMMER2_BREF_TYPE_INODE)
1113 			++hammer2_iod_meta_write;
1114 		else
1115 			++hammer2_iod_indr_write;
1116 #endif
1117 	}
1118 }
1119 
1120 /*
1121  * Flush helper scan1 (recursive)
1122  *
1123  * Flushes the children of the caller's chain (parent) and updates
1124  * the blockref, restricted by sync_tid.
1125  *
1126  * Ripouts during the loop should not cause any problems.  Because we are
1127  * flushing to a synchronization point, modification races will occur after
1128  * sync_tid and do not have to be flushed anyway.
1129  *
1130  * It is also ok if the parent is chain_duplicate()'d while unlocked because
1131  * the delete/duplication will install a delete_tid that is still larger than
1132  * our current sync_tid.
1133  *
1134  * WARNING! If we do not call chain_flush_core we must update bref.mirror_tid
1135  *	    ourselves.
1136  */
1137 static int
1138 hammer2_chain_flush_scan1(hammer2_chain_t *child, void *data)
1139 {
1140 	hammer2_flush_info_t *info = data;
1141 	hammer2_trans_t *trans = info->trans;
1142 	hammer2_chain_t *parent = info->parent;
1143 	int	diddeferral;
1144 
1145 	if (hammer2_debug & 0x80000)
1146 		Debugger("hell3");
1147 	diddeferral = info->diddeferral;
1148 
1149 	/*
1150 	 * Child is beyond the flush synchronization zone, don't persue.
1151 	 * Remember that modifications generally delete-duplicate so if the
1152 	 * sub-tree is dirty another child will get us there.  But not this
1153 	 * one.
1154 	 *
1155 	 * Or MODIFIED is not set and child is already fully synchronized
1156 	 * with its sub-tree.  Don't persue.
1157 	 *
1158 	 * (child can never be fchain or vchain so a special check isn't
1159 	 *  needed).
1160 	 */
1161 	if (child->modify_tid > trans->sync_tid) {
1162 		KKASSERT(child->delete_tid >= child->modify_tid);
1163 		/* do not update child->core->update_lo, core not flushed */
1164 		/* do not update core->update_lo, there may be another path */
1165 		/* do not update mirror_tid, scan2 will ignore chain */
1166 		return (0);
1167 	}
1168 
1169 	/*
1170 	 * We must ref the child before unlocking the spinlock.
1171 	 *
1172 	 * The caller has added a ref to the parent so we can temporarily
1173 	 * unlock it in order to lock the child.
1174 	 */
1175 	hammer2_chain_ref(child);
1176 	spin_unlock(&parent->core->cst.spin);
1177 
1178 	hammer2_chain_unlock(parent);
1179 	hammer2_chain_lock(child, HAMMER2_RESOLVE_MAYBE);
1180 
1181 	/*
1182 	 * No recursion needed if neither the child or anything under it
1183 	 * was messed with.
1184 	 */
1185 	if ((child->flags & HAMMER2_CHAIN_MODIFIED) == 0 &&
1186 	    child->core->update_lo >= info->sync_tid) {
1187 		KKASSERT((child->flags & HAMMER2_CHAIN_MODIFIED) == 0);
1188 		if (child->bref.mirror_tid < info->sync_tid)
1189 			child->bref.mirror_tid = info->sync_tid;
1190 		goto skip;
1191 	}
1192 
1193 	/*
1194 	 * XXX delete child if parent is deleted.  Propagate deletion
1195 	 *     downward.  TODO
1196 	 */
1197 
1198 
1199 	/*
1200 	 * Re-check original pre-lock conditions after locking.
1201 	 */
1202 	if (child->modify_tid > trans->sync_tid) {
1203 		hammer2_chain_unlock(child);
1204 		hammer2_chain_drop(child);
1205 		hammer2_chain_lock(parent, HAMMER2_RESOLVE_MAYBE);
1206 		spin_lock(&parent->core->cst.spin);
1207 		return (0);
1208 	}
1209 
1210 	if ((child->flags & HAMMER2_CHAIN_MODIFIED) == 0 &&
1211 	    child->core->update_lo >= info->sync_tid) {
1212 		KKASSERT((child->flags & HAMMER2_CHAIN_MODIFIED) == 0);
1213 		if (child->bref.mirror_tid < info->sync_tid)
1214 			child->bref.mirror_tid = info->sync_tid;
1215 		goto skip;
1216 	}
1217 
1218 	/*
1219 	 * Recurse and collect deferral data.
1220 	 */
1221 	++info->depth;
1222 	hammer2_chain_flush_core(info, &child);
1223 	--info->depth;
1224 
1225 skip:
1226 	/*
1227 	 * Check the conditions that could cause SCAN2 to modify the parent.
1228 	 * Modify the parent here instead of in SCAN2, which would cause
1229 	 * rollup chicken-and-egg races.
1230 	 *
1231 	 * Scan2 is expected to update bref.mirror_tid in the domodify case,
1232 	 * but will skip the child otherwise giving us the responsibility to
1233 	 * update bref.mirror_tid.
1234 	 *
1235 	 * WARNING!  Do NOT update the child's bref.mirror_tid right here,
1236 	 *	     even if there was no deferral.  Doing so would cause
1237 	 *	     confusion with the child's block array state in a
1238 	 *	     future flush.
1239 	 */
1240 	if (h2ignore_deleted(info, parent)) {
1241 		/*
1242 		 * Special optimization matching similar tests done in
1243 		 * flush_core, scan1, and scan2.  Avoid updating the block
1244 		 * table in the parent if the parent is no longer visible.
1245 		 */
1246 		;
1247 	} else if (child->delete_tid <= trans->sync_tid &&
1248 		   child->delete_tid > parent->bref.mirror_tid &&
1249 		   child->modify_tid <= parent->bref.mirror_tid) {
1250 		info->domodify = 1;
1251 	} else if (child->delete_tid > trans->sync_tid &&
1252 		   child->modify_tid > parent->bref.mirror_tid) {
1253 		info->domodify = 1;		/* base insertion */
1254 	}
1255 
1256 	/*
1257 	 * Relock to continue the loop
1258 	 */
1259 	hammer2_chain_unlock(child);
1260 	hammer2_chain_lock(parent, HAMMER2_RESOLVE_MAYBE);
1261 	hammer2_chain_drop(child);
1262 	KKASSERT(info->parent == parent);
1263 
1264 	spin_lock(&parent->core->cst.spin);
1265 	return (0);
1266 }
1267 
1268 /*
1269  * Flush helper scan2 (non-recursive)
1270  *
1271  * This pass on a chain's children propagates any MOVED or DELETED
1272  * elements back up the chain towards the root after those elements have
1273  * been fully flushed.  Unlike scan1, this function is NOT recursive and
1274  * the parent remains locked across the entire scan.
1275  *
1276  * SCAN2 is called twice, once with pass set to 1 and once with it set to 2.
1277  * We have to do this so base[] elements can be deleted in pass 1 to make
1278  * room for adding new elements in pass 2.
1279  *
1280  * This function also rolls up storage statistics.
1281  *
 * NOTE!  A deletion is a visibility issue, there can still be references to
1283  *	  deleted elements (for example, to an unlinked file which is still
1284  *	  open), and there can also be multiple chains pointing to the same
1285  *	  bref where some are deleted and some are not (for example due to
1286  *	  a rename).   So a chain marked for deletion is basically considered
1287  *	  to be live until it is explicitly destroyed or until its ref-count
1288  *	  reaches zero (also implying that MOVED and MODIFIED are clear).
1289  *
1290  * NOTE!  Info->parent will be locked but will only be instantiated/modified
1291  *	  if it is either MODIFIED or if scan1 determined that block table
1292  *	  updates will occur.
1293  *
1294  * NOTE!  SCAN2 is responsible for updating child->bref.mirror_tid only in
1295  *	  the case where it modifies the parent (does a base insertion
1296  *	  or deletion).  SCAN1 handled all other cases.
1297  */
static int
hammer2_chain_flush_scan2(hammer2_chain_t *child, void *data)
{
	hammer2_flush_info_t *info = data;
	hammer2_chain_t *parent = info->parent;
	hammer2_chain_core_t *above = child->above;
	hammer2_mount_t *hmp = child->hmp;
	hammer2_trans_t *trans = info->trans;
	hammer2_blockref_t *base;	/* parent's blockref array, or NULL */
	int count;			/* number of entries in base[] */
	int ok;

#if FLUSH_DEBUG
	kprintf("SCAN2 %p.%d %08x mod=%016jx del=%016jx trans=%016jx\n", child, child->bref.type, child->flags, child->modify_tid, child->delete_tid, info->trans->sync_tid);
#endif
	/*
	 * Ignore children created after our flush point, treating them as
	 * if they did not exist.  These children will not cause the parent
	 * to be updated.
	 *
	 * Children deleted after our flush point are treated as having been
	 * created for the purposes of the flush.  The parent's update_hi
	 * will already be higher than our trans->sync_tid so the path for
	 * the next flush is left intact.
	 *
	 * When we encounter such children and the parent chain has not been
	 * deleted, delete/duplicated, or delete/duplicated-for-move, then
	 * the parent may be used to funnel through several flush points.
	 * These chains will still be visible to later flushes due to having
	 * a higher update_hi than we can set in the current flush.
	 */
	if (child->modify_tid > trans->sync_tid) {
		KKASSERT(child->delete_tid >= child->modify_tid);
		goto finalize;
	}

#if 0
	/*
	 * Ignore children which have not changed.  The parent's block table
	 * is already correct.
	 *
	 * XXX The MOVED bit is only cleared when all multi-homed parents
	 *     have flushed, creating a situation where a re-flush can occur
	 *     via a parent which has already flushed.  The hammer2_base_*()
	 *     functions currently have a hack to deal with this case but
	 *     we need something better.
	 */
	if ((child->flags & HAMMER2_CHAIN_MOVED) == 0) {
		KKASSERT((child->flags & HAMMER2_CHAIN_MODIFIED) == 0);
		goto finalize;
	}
#endif

	/*
	 * Make sure child is referenced before we unlock.
	 */
	hammer2_chain_ref(child);
	spin_unlock(&above->cst.spin);

	/*
	 * Parent reflushed after the child has passed them by should skip
	 * due to the modify_tid test. XXX
	 */
	hammer2_chain_lock(child, HAMMER2_RESOLVE_NEVER);
	KKASSERT(child->above == above);
	KKASSERT(parent->core == above);

	/*
	 * The parent's blockref to the child must be deleted or updated.
	 *
	 * This point is not reached on successful DELETED optimizations
	 * but can be reached on recursive deletions and restricted flushes.
	 *
	 * The chain_modify here may delete-duplicate the block.  This can
	 * cause a multitude of issues if the block was already modified
	 * by a later (post-flush) transaction.  Primarily blockrefs in
	 * the later block can be out-of-date, so if the situation occurs
	 * we can't throw away the MOVED bit on the current blocks until
	 * the later blocks are flushed (so as to be able to regenerate all
	 * the changes that were made).
	 *
	 * Because flushes are ordered we do not have to make a
	 * modify/duplicate of indirect blocks.  That is, the flush
	 * code does not have to kmalloc or duplicate anything.  We
	 * can adjust the indirect block table in-place and reuse the
	 * chain.  It IS possible that the chain has already been duplicated
	 * or may wind up being duplicated on-the-fly by modifying code
	 * on the frontend.  We simply use the original and ignore such
	 * chains.  However, it does mean we can't clear the MOVED bit.
	 *
	 * XXX recursive deletions not optimized.
	 */

	/*
	 * Locate the parent's blockref array (base) and its entry count
	 * based on the parent's bref type.
	 */
	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		/*
		 * Access the inode's block array.  However, there is no
		 * block array if the inode is flagged DIRECTDATA.  The
		 * DIRECTDATA case typically only occurs when a hardlink has
		 * been shifted up the tree and the original inode gets
		 * replaced with an OBJTYPE_HARDLINK placeholding inode.
		 */
		if (parent->data &&
		    (parent->data->ipdata.op_flags &
		     HAMMER2_OPFLAG_DIRECTDATA) == 0) {
			base = &parent->data->ipdata.u.blockset.blockref[0];
		} else {
			base = NULL;
		}
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		if (parent->data)
			base = &parent->data->npdata[0];
		else
			base = NULL;
		count = parent->bytes / sizeof(hammer2_blockref_t);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		base = &hmp->voldata.sroot_blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP:
		base = &parent->data->npdata[0];
		count = HAMMER2_SET_COUNT;
		break;
	default:
		base = NULL;
		count = 0;
		panic("hammer2_chain_flush_scan2: "
		      "unrecognized blockref type: %d",
		      parent->bref.type);
	}

	/*
	 * Don't bother updating a deleted parent's blockrefs.
	 *
	 * Otherwise, we need to be COUNTEDBREFS synchronized for the
	 * hammer2_base_*() functions.
	 *
	 * This test must match the similar one in flush_core.
	 */
#if FLUSH_DEBUG
	kprintf("SCAN2 base=%p pass=%d PARENT %p.%d DTID=%016jx SYNC=%016jx\n",
		base,
		info->pass, parent, parent->bref.type, parent->delete_tid, trans->sync_tid);
#endif
	if (h2ignore_deleted(info, parent))
		base = NULL;

	/*
	 * Update the parent's blockref table and propagate mirror_tid.
	 *
	 * NOTE! Children with modify_tid's beyond our flush point are
	 *	 considered to not exist for the purposes of updating the
	 *	 parent's blockref array.
	 *
	 * NOTE! SCAN1 has already put the parent in a modified state
	 *	 so if it isn't we panic.
	 *
	 * NOTE! chain->modify_tid vs chain->bref.modify_tid.  The chain's
	 *	 internal modify_tid is always updated based on creation
	 *	 or delete-duplicate.  However, the bref.modify_tid is NOT
	 *	 updated due to simple blockref updates.
	 */
#if FLUSH_DEBUG
	kprintf("chain %p->%p pass %d trans %016jx sync %p.%d %016jx/%d C=%016jx D=%016jx PMIRROR %016jx\n",
		parent, child,
		info->pass, trans->sync_tid,
		child, child->bref.type,
		child->bref.key, child->bref.keybits,
		child->modify_tid, child->delete_tid, parent->bref.mirror_tid);
#endif

	if (info->pass == 1 && child->delete_tid <= trans->sync_tid) {
		/*
		 * Pass 1: deletions, run first so base[] slots are freed
		 * up before pass 2 inserts into them.
		 *
		 * Deleting.  The block array is expected to contain the
		 * child's entry if:
		 *
		 * (1) The deletion occurred after the parent's block table
		 *     was last synchronized (delete_tid), and
		 *
		 * (2) The creation occurred before or during the parent's
		 *     last block table synchronization.
		 */
#if FLUSH_DEBUG
		kprintf("S2A %p.%d b=%p d/b=%016jx/%016jx m/b=%016jx/%016jx\n",
			child, child->bref.type,
			base, child->delete_tid, parent->bref.mirror_tid,
			child->modify_tid, parent->bref.mirror_tid);
#endif
		if (base &&
		    child->delete_tid > parent->bref.mirror_tid &&
		    child->modify_tid <= parent->bref.mirror_tid) {
			KKASSERT(child->flags & HAMMER2_CHAIN_MOVED);
			KKASSERT(parent->modify_tid == trans->sync_tid ||
				 (parent == &hmp->vchain ||
				  parent == &hmp->fchain));
			hammer2_rollup_stats(parent, child, -1);
			spin_lock(&above->cst.spin);
#if FLUSH_DEBUG
			kprintf("trans %jx parent %p.%d child %p.%d m/d %016jx/%016jx "
				"flg=%08x %016jx/%d delete\n",
				trans->sync_tid,
				parent, parent->bref.type,
				child, child->bref.type,
				child->modify_tid, child->delete_tid,
				child->flags,
				child->bref.key, child->bref.keybits);
#endif
			hammer2_base_delete(trans, parent, base, count,
					    &info->cache_index, child);
			spin_unlock(&above->cst.spin);
		}
	} else if (info->pass == 2 && child->delete_tid > trans->sync_tid) {
		/*
		 * Pass 2: insertions, after pass 1 has made room.
		 *
		 * Inserting.  The block array is expected to NOT contain
		 * the child's entry if:
		 *
		 * (1) The creation occurred after the parent's block table
		 *     was last synchronized (modify_tid), and
		 *
		 * (2) The child is not being deleted in the same
		 *     transaction.
		 */
#if FLUSH_DEBUG
		kprintf("S2B %p.%d b=%p d/b=%016jx/%016jx m/b=%016jx/%016jx\n",
			child, child->bref.type,
			base, child->delete_tid, parent->bref.mirror_tid,
			child->modify_tid, parent->bref.mirror_tid);
#endif
		if (base &&
		    child->modify_tid > parent->bref.mirror_tid) {
			KKASSERT(child->flags & HAMMER2_CHAIN_MOVED);
			KKASSERT(parent->modify_tid == trans->sync_tid ||
				 (parent == &hmp->vchain ||
				  parent == &hmp->fchain));
			hammer2_rollup_stats(parent, child, 1);
			spin_lock(&above->cst.spin);
#if FLUSH_DEBUG
			kprintf("trans %jx parent %p.%d child %p.%d m/d %016jx/%016jx "
				"flg=%08x %016jx/%d insert\n",
				trans->sync_tid,
				parent, parent->bref.type,
				child, child->bref.type,
				child->modify_tid, child->delete_tid,
				child->flags,
				child->bref.key, child->bref.keybits);
#endif
			hammer2_base_insert(trans, parent, base, count,
					    &info->cache_index, child);
			spin_unlock(&above->cst.spin);
		}
	} else if (info->pass == 3 &&
		   (child->delete_tid == HAMMER2_MAX_TID ||
		    child->delete_tid <= trans->sync_tid) &&
		   (child->flags & HAMMER2_CHAIN_MOVED)) {
		/*
		 * Pass 3: attempt to clear the child's MOVED bit (and the
		 * ref backing it) once no parent still needs it.
		 *
		 * We can't clear the MOVED bit on children whose modify_tid
		 * is beyond our current trans (was tested at top of scan2),
		 * or on deleted children which have not yet been flushed
		 * (handled above).
		 *
		 * Scan all parents of this child and determine if any of
		 * them still need the child's MOVED bit.
		 */
		hammer2_chain_t *scan;

		if (hammer2_debug & 0x4000)
			kprintf("CHECKMOVED %p (parent=%p)", child, parent);

		ok = 1;
		spin_lock(&above->cst.spin);
		TAILQ_FOREACH(scan, &above->ownerq, core_entry) {
			/*
			 * Can't clear child's MOVED until all parent's have
			 * synchronized with it.
			 *
			 * Ignore deleted parents as-of this flush TID.
			 * Ignore the current parent being flushed.
			 */
			if (h2ignore_deleted(info, scan))
				continue;
			if (scan == parent)
				continue;

			/*
			 * For parents not already synchronized check to see
			 * if the flush has gotten past them yet or not.
			 *
			 * This must roughly mimic the tests that
			 * hammer2_chain_flush_core() runs or we could leave
			 * children hanging around with MOVED set and cause
			 * a memory leak.
			 */
			if (scan->bref.mirror_tid >= trans->sync_tid)
				continue;
			if (scan->bref.mirror_tid >= above->update_hi)
				continue;

			if (hammer2_debug & 0x4000) {
				kprintf("(fail scan %p %016jx/%016jx)",
					scan, scan->bref.mirror_tid,
					child->modify_tid);
			}
			ok = 0;
			break;
		}
		if (hammer2_debug & 0x4000)
			kprintf("\n");
		spin_unlock(&above->cst.spin);

		/*
		 * Can we finally clear MOVED?
		 */
		if (ok) {
			if (hammer2_debug & 0x4000)
				kprintf("clear moved %p.%d %016jx/%d\n",
					child, child->bref.type,
					child->bref.key, child->bref.keybits);
			atomic_clear_int(&child->flags, HAMMER2_CHAIN_MOVED);
			if (child->flags & HAMMER2_CHAIN_MODIFIED) {
				kprintf("modified child %p all parents updated\n",
					child);
				atomic_clear_int(&child->flags,
						 HAMMER2_CHAIN_MODIFIED);
				hammer2_chain_memory_wakeup(child->pmp);
				hammer2_chain_drop(child);/* cleared MODIFIED */
			}
			hammer2_chain_drop(child);	/* cleared MOVED */
		} else {
			if (hammer2_debug & 0x4000)
				kprintf("keep  moved %p.%d %016jx/%d\n",
					child, child->bref.type,
					child->bref.key, child->bref.keybits);
		}
	}

	/*
	 * Unlock the child.  This can wind up dropping the child's
	 * last ref, removing it from the parent's RB tree, and deallocating
	 * the structure.  The RB_SCAN() our caller is doing handles the
	 * situation.
	 */
	hammer2_chain_unlock(child);
	hammer2_chain_drop(child);
	spin_lock(&above->cst.spin);

	/*
	 * The parent may have been delete-duplicated.
	 */
	info->parent = parent;
finalize:
	return (0);
}
1654 
1655 /*
1656  * Update core->update_lo and attempt to clear the MOVED bit
1657  * for its children.
1658  *
1659  * This routine is only called after a sub-tree has been fully flushed
1660  * up to the current flush synchronization point.  Calling it under any
1661  * other condition will blow up flush tracking.
1662  */
1663 static
1664 void
1665 hammer2_flush_core_update(hammer2_chain_core_t *core,
1666 			  hammer2_flush_info_t *info)
1667 {
1668 	hammer2_chain_layer_t *layer;
1669 
1670 	spin_lock(&core->cst.spin);
1671 	if (core->update_lo < info->sync_tid)
1672 		core->update_lo = info->sync_tid;
1673 	TAILQ_FOREACH_REVERSE(layer, &core->layerq,
1674 			      h2_layer_list, entry) {
1675 		info->pass = 3;
1676 		++layer->refs;
1677 		KKASSERT(layer->good == 0xABCD);
1678 		RB_SCAN(hammer2_chain_tree, &layer->rbtree,
1679 			NULL, hammer2_chain_flush_scan2, info);
1680 		--layer->refs;
1681 		KKASSERT(info->parent->core == core);
1682 	}
1683 	spin_unlock(&core->cst.spin);
1684 }
1685 
1686 static
1687 void
1688 hammer2_rollup_stats(hammer2_chain_t *parent, hammer2_chain_t *child, int how)
1689 {
1690 #if 0
1691 	hammer2_chain_t *grandp;
1692 #endif
1693 
1694 	parent->data_count += child->data_count;
1695 	parent->inode_count += child->inode_count;
1696 	child->data_count = 0;
1697 	child->inode_count = 0;
1698 	if (how < 0) {
1699 		parent->data_count -= child->bytes;
1700 		if (child->bref.type == HAMMER2_BREF_TYPE_INODE) {
1701 			parent->inode_count -= 1;
1702 #if 0
1703 			/* XXX child->data may be NULL atm */
1704 			parent->data_count -= child->data->ipdata.data_count;
1705 			parent->inode_count -= child->data->ipdata.inode_count;
1706 #endif
1707 		}
1708 	} else if (how > 0) {
1709 		parent->data_count += child->bytes;
1710 		if (child->bref.type == HAMMER2_BREF_TYPE_INODE) {
1711 			parent->inode_count += 1;
1712 #if 0
1713 			/* XXX child->data may be NULL atm */
1714 			parent->data_count += child->data->ipdata.data_count;
1715 			parent->inode_count += child->data->ipdata.inode_count;
1716 #endif
1717 		}
1718 	}
1719 	if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) {
1720 		parent->data->ipdata.data_count += parent->data_count;
1721 		parent->data->ipdata.inode_count += parent->inode_count;
1722 #if 0
1723 		for (grandp = parent->above->first_parent;
1724 		     grandp;
1725 		     grandp = grandp->next_parent) {
1726 			grandp->data_count += parent->data_count;
1727 			grandp->inode_count += parent->inode_count;
1728 		}
1729 #endif
1730 		parent->data_count = 0;
1731 		parent->inode_count = 0;
1732 	}
1733 }
1734