/*
 * Copyright (c) 2011-2014 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@dragonflybsd.org>
 * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 *			TRANSACTION AND FLUSH HANDLING
 *
 * Deceptively simple but actually fairly difficult to implement properly is
 * how I would describe it.
 *
 * The biggest problem is that each PFS may belong to a cluster so its
 * media modify_tid and mirror_tid fields are in a completely different
 * domain than the topology related to the super-root.  Most of the code
 * operates using modify_xid and delete_xid which are local identifiers.
 *
 * The second biggest problem is that we really want to allow flushes to run
 * concurrently with new front-end operations, which means that the in-memory
 * topology of hammer2_chain structures can represent both current state and
 * snapshot-for-flush state.
 */

#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/lock.h>
#include <sys/uuid.h>

#include "hammer2.h"

#define FLUSH_DEBUG 0

/*
 * Recursively flush the specified chain.  The chain is locked and
 * referenced by the caller and will remain so on return.  The chain
 * will remain referenced throughout but can temporarily lose its
 * lock during the recursion to avoid unnecessarily stalling user
 * processes.
 */
struct hammer2_flush_info {
	hammer2_chain_t *parent;
	hammer2_trans_t	*trans;
	int		depth;
	int		diddeferral;
	int		cache_index;
	int		domodify;
	struct h2_flush_deferral_list flush_list;
	hammer2_xid_t	sync_xid;	/* memory synchronization point */
};

typedef struct hammer2_flush_info hammer2_flush_info_t;

static void hammer2_flush_core(hammer2_flush_info_t *info,
				hammer2_chain_t **chainp, int deleting);
static int hammer2_flush_pass1(hammer2_chain_t *child, void *data);
static int hammer2_flush_pass2(hammer2_chain_t *child, void *data);
static int hammer2_flush_pass3(hammer2_chain_t *child, void *data);
static int hammer2_flush_pass4(hammer2_chain_t *child, void *data);
static int hammer2_flush_pass5(hammer2_chain_t *child, void *data);
static void hammer2_rollup_stats(hammer2_chain_t *parent,
				hammer2_chain_t *child, int how);


/*
 * Can we ignore a chain for the purposes of flushing modifications
 * to the media?
 *
 * This code is now degenerate.  We used to have to distinguish between
 * deleted chains and deleted chains associated with inodes that were
 * still open.  This mechanic has been fixed so the function is now
 * a simple test.
 */
static __inline
int
h2ignore_deleted(hammer2_flush_info_t *info, hammer2_chain_t *chain)
{
	return (chain->delete_xid <= info->sync_xid);
}

#if 0
static __inline
void
hammer2_updatestats(hammer2_flush_info_t *info, hammer2_blockref_t *bref,
		    int how)
{
	hammer2_key_t bytes;

	if (bref->type != 0) {
		bytes = 1 << (bref->data_off & HAMMER2_OFF_MASK_RADIX);
		if (bref->type == HAMMER2_BREF_TYPE_INODE)
			info->inode_count += how;
		if (how < 0)
			info->data_count -= bytes;
		else
			info->data_count += bytes;
	}
}
#endif

/*
 * For now use a global transaction manager.  What we ultimately want to do
 * is give each non-overlapping hmp/pmp group its own transaction manager.
 *
 * Transactions govern XID tracking on the physical media (the hmp), but they
 * also govern TID tracking which is per-PFS and thus might cross multiple
 * hmp's.  So we can't just stuff tmanage into hammer2_mount or
 * hammer2_pfsmount.
 */
static hammer2_trans_manage_t	tmanage;

void
hammer2_trans_manage_init(void)
{
	lockinit(&tmanage.translk, "h2trans", 0, 0);
	TAILQ_INIT(&tmanage.transq);
	tmanage.flush_xid = 1;
	tmanage.alloc_xid = tmanage.flush_xid + 1;
}

hammer2_xid_t
hammer2_trans_newxid(hammer2_pfsmount_t *pmp __unused)
{
	hammer2_xid_t xid;

	for (;;) {
		xid = atomic_fetchadd_int(&tmanage.alloc_xid, 1);
		if (xid)
			break;
	}
	return xid;
}
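
/*
 * NOTE: hammer2_trans_newxid() never returns 0.  atomic_fetchadd_int()
 *	 returns the pre-increment value, so when the 32-bit counter
 *	 wraps the loop above simply retries past 0, presumably reserving
 *	 0 as an 'unset' sentinel.
 */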

/*
 * Transaction support functions for writing to the filesystem.
 *
 * Initializing a new transaction allocates a transaction ID.  Typically
 * passed a pmp (hmp passed as NULL), indicating a cluster transaction.  Can
 * be passed a NULL pmp and non-NULL hmp to indicate a transaction on a single
 * media target.  The latter mode is used by the recovery code.
 *
 * TWO TRANSACTION IDs can run concurrently, where one is a flush and the
 * other is a set of any number of concurrent filesystem operations.  We
 * can either have <running_fs_ops> + <waiting_flush> + <blocked_fs_ops>
 * or we can have <running_flush> + <concurrent_fs_ops>.
 *
 * During a flush, new fs_ops are only blocked until the fs_ops prior to
 * the flush complete.  The new fs_ops can then run concurrent with the flush.
 *
 * Buffer-cache transactions operate as fs_ops but never block.  A
 * buffer-cache flush will run either before or after the current pending
 * flush depending on its state.
 */
void
hammer2_trans_init(hammer2_trans_t *trans, hammer2_pfsmount_t *pmp, int flags)
{
	hammer2_trans_manage_t *tman;
	hammer2_trans_t *head;

	tman = &tmanage;

	bzero(trans, sizeof(*trans));
	trans->pmp = pmp;
	trans->flags = flags;
	trans->td = curthread;

	lockmgr(&tman->translk, LK_EXCLUSIVE);

	if (flags & HAMMER2_TRANS_ISFLUSH) {
		/*
		 * If multiple flushes are trying to run we have to
		 * wait until it is our turn.  All flushes are serialized.
		 *
		 * We queue ourselves and then wait to become the head
		 * of the queue, allowing all prior flushes to complete.
		 *
		 * Multiple normal transactions can share the current
		 * transaction id but a flush transaction needs its own
		 * unique TID for proper block table update accounting.
		 */
		++tman->flushcnt;
		++pmp->alloc_tid;
		pmp->flush_tid = pmp->alloc_tid;
		tman->flush_xid = hammer2_trans_newxid(pmp);
		trans->sync_xid = tman->flush_xid;
		++pmp->alloc_tid;
		TAILQ_INSERT_TAIL(&tman->transq, trans, entry);
		if (TAILQ_FIRST(&tman->transq) != trans) {
			trans->blocked = 1;
			while (trans->blocked) {
				lksleep(&trans->sync_xid, &tman->translk,
					0, "h2multf", hz);
			}
		}
	} else if (tman->flushcnt == 0) {
		/*
		 * No flushes are pending, we can go.  Use prior flush_xid + 1.
		 *
		 * WARNING!  Also see hammer2_chain_setsubmod()
		 */
		TAILQ_INSERT_TAIL(&tman->transq, trans, entry);
		trans->sync_xid = tman->flush_xid + 1;

		/* XXX improve/optimize inode allocation */
	} else if (trans->flags & HAMMER2_TRANS_BUFCACHE) {
		/*
		 * A buffer cache transaction is requested while a flush
		 * is in progress.  The flush's PREFLUSH flag must be set
		 * in this situation.
		 *
		 * The buffer cache flush takes on the main flush's
		 * transaction id.
		 */
		TAILQ_FOREACH(head, &tman->transq, entry) {
			if (head->flags & HAMMER2_TRANS_ISFLUSH)
				break;
		}
		KKASSERT(head);
		KKASSERT(head->flags & HAMMER2_TRANS_PREFLUSH);
		trans->flags |= HAMMER2_TRANS_PREFLUSH;
		TAILQ_INSERT_AFTER(&tman->transq, head, trans, entry);
		trans->sync_xid = head->sync_xid;
		trans->flags |= HAMMER2_TRANS_CONCURRENT;
		/* not allowed to block */
	} else {
		/*
		 * A normal transaction is requested while a flush is in
		 * progress.  We insert after the current flush and may
		 * block.
		 *
		 * WARNING!  Also see hammer2_chain_setsubmod()
		 */
		TAILQ_FOREACH(head, &tman->transq, entry) {
			if (head->flags & HAMMER2_TRANS_ISFLUSH)
				break;
		}
		KKASSERT(head);
		TAILQ_INSERT_AFTER(&tman->transq, head, trans, entry);
		trans->sync_xid = head->sync_xid + 1;
		trans->flags |= HAMMER2_TRANS_CONCURRENT;

		/*
		 * XXX for now we must block new transactions, synchronous
		 * flush mode is on by default.
		 *
		 * If synchronous flush mode is enabled concurrent
		 * frontend transactions during the flush are not
		 * allowed (except we don't have a choice for buffer
		 * cache ops).
		 */
		if (hammer2_synchronous_flush > 0 ||
		    TAILQ_FIRST(&tman->transq) != head) {
			trans->blocked = 1;
			while (trans->blocked) {
				lksleep(&trans->sync_xid,
					&tman->translk, 0,
					"h2multf", hz);
			}
		}
	}
	if (flags & HAMMER2_TRANS_NEWINODE) {
		if (pmp->spmp_hmp) {
			/*
			 * Super-root transaction, all new inodes have an
			 * inode number of 1.  Normal pfs inode cache
			 * semantics are not used.
			 */
			trans->inode_tid = 1;
		} else {
			/*
			 * Normal transaction
			 */
			if (pmp->inode_tid < HAMMER2_INODE_START)
				pmp->inode_tid = HAMMER2_INODE_START;
			trans->inode_tid = pmp->inode_tid++;
		}
	}

	lockmgr(&tman->translk, LK_RELEASE);
}
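
/*
 * Illustrative sketch (not compiled): a typical flush-side caller brackets
 * the topology flush with a flush transaction.  The function name and its
 * arguments are hypothetical; the chain stands in for any locked,
 * referenced flush root such as &hmp->vchain.
 */
#if 0
static void
example_flush_root(hammer2_pfsmount_t *pmp, hammer2_chain_t *chain)
{
	hammer2_trans_t trans;

	/* serializes against other flushes, allocates a fresh sync_xid */
	hammer2_trans_init(&trans, pmp, HAMMER2_TRANS_ISFLUSH);

	/* chain must be locked and referenced by the caller */
	hammer2_flush(&trans, &chain);

	/* unblocks any transactions queued behind this flush */
	hammer2_trans_done(&trans);
}
#endif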

/*
 * This may only be called while in a flush transaction.  It's a bit of a
 * hack but after flushing a PFS we need to flush each volume root as part
 * of the same transaction.
 */
void
hammer2_trans_spmp(hammer2_trans_t *trans, hammer2_pfsmount_t *spmp)
{
	++spmp->alloc_tid;
	spmp->flush_tid = spmp->alloc_tid;
	++spmp->alloc_tid;
	trans->pmp = spmp;
}


void
hammer2_trans_done(hammer2_trans_t *trans)
{
	hammer2_trans_manage_t *tman;
	hammer2_trans_t *head;
	hammer2_trans_t *scan;

	tman = &tmanage;

	/*
	 * Remove.
	 */
	lockmgr(&tman->translk, LK_EXCLUSIVE);
	TAILQ_REMOVE(&tman->transq, trans, entry);
	head = TAILQ_FIRST(&tman->transq);

	/*
	 * Adjust flushcnt if this was a flush, clear TRANS_CONCURRENT
	 * up through the next flush.  (If the head is a flush then we
	 * stop there, unlike the unblock code following this section).
	 */
	if (trans->flags & HAMMER2_TRANS_ISFLUSH) {
		--tman->flushcnt;
		scan = head;
		while (scan && (scan->flags & HAMMER2_TRANS_ISFLUSH) == 0) {
			atomic_clear_int(&scan->flags,
					 HAMMER2_TRANS_CONCURRENT);
			scan = TAILQ_NEXT(scan, entry);
		}
	}

	/*
	 * Unblock the head of the queue and any additional transactions
	 * up to the next flush.  The head can be a flush and it will be
	 * unblocked along with the non-flush transactions following it
	 * (which are allowed to run concurrently with it).
	 *
	 * In synchronous flush mode we stop if the head transaction is
	 * a flush.
	 */
	if (head && head->blocked) {
		head->blocked = 0;
		wakeup(&head->sync_xid);

		if (hammer2_synchronous_flush > 0)
			scan = head;
		else
			scan = TAILQ_NEXT(head, entry);
		while (scan && (scan->flags & HAMMER2_TRANS_ISFLUSH) == 0) {
			if (scan->blocked) {
				scan->blocked = 0;
				wakeup(&scan->sync_xid);
			}
			scan = TAILQ_NEXT(scan, entry);
		}
	}
	lockmgr(&tman->translk, LK_RELEASE);
}
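
/*
 * Example (illustration only) of the two queue shapes described above
 * hammer2_trans_init().  While a flush is waiting on prior fs_ops:
 *
 *	<running_fs_ops> <waiting_flush> <blocked_fs_ops>
 *
 * and once the prior fs_ops complete (synchronous flush mode off):
 *
 *	<running_flush> <concurrent_fs_ops>
 *
 * hammer2_trans_done() removes the departing transaction and wakes the
 * new head plus any non-flush transactions queued behind it, stopping at
 * the next flush.
 */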

/*
 * Flush the chain and all modified sub-chains through the specified
 * synchronization point, propagating parent chain modifications and
 * mirror_tid updates back up as needed.  Since we are recursing downward
 * we do not have to deal with the complexities of multi-homed chains (chains
 * with multiple parents).
 *
 * Caller must have interlocked against any non-flush-related modifying
 * operations in progress whose modify_xid values are less than or equal
 * to the passed sync_xid.
 *
 * Caller must have already vetted synchronization points to ensure they
 * are properly flushed.  Only snapshots and cluster flushes can create
 * these sorts of synchronization points.
 *
 * This routine can be called from several places but the most important
 * is from VFS_SYNC.
 *
 * chain is locked on call and will remain locked on return.  If a flush
 * occurred, the chain's FLUSH_CREATE and/or FLUSH_DELETE bit will be set
 * indicating that its parent (which is not part of the flush) should be
 * updated.  The chain may be replaced by the call if it was modified.
 */
void
hammer2_flush(hammer2_trans_t *trans, hammer2_chain_t **chainp)
{
	hammer2_chain_t *chain = *chainp;
	hammer2_chain_t *scan;
	hammer2_chain_core_t *core;
	hammer2_flush_info_t info;
	int loops;

	/*
	 * Execute the recursive flush and handle deferrals.
	 *
	 * Chains can be ridiculously long (thousands deep), so to
	 * avoid blowing out the kernel stack the recursive flush has a
	 * depth limit.  Elements at the limit are placed on a list
	 * for re-execution after the stack has been popped.
	 */
	bzero(&info, sizeof(info));
	TAILQ_INIT(&info.flush_list);
	info.trans = trans;
	info.sync_xid = trans->sync_xid;
	info.cache_index = -1;

	core = chain->core;

	/*
	 * Extra ref needed because flush_core expects it when replacing
	 * chain.
	 */
	hammer2_chain_ref(chain);
	loops = 0;

	for (;;) {
		/*
		 * Unwind deep recursions which had been deferred.  This
		 * can leave the FLUSH_* bits set for these chains, which
		 * will be handled when we [re]flush chain after the unwind.
		 */
		while ((scan = TAILQ_FIRST(&info.flush_list)) != NULL) {
			KKASSERT(scan->flags & HAMMER2_CHAIN_DEFERRED);
			TAILQ_REMOVE(&info.flush_list, scan, flush_node);
			atomic_clear_int(&scan->flags, HAMMER2_CHAIN_DEFERRED);

			/*
			 * Now that we've popped back up we can do a secondary
			 * recursion on the deferred elements.
			 *
			 * NOTE: hammer2_flush() may replace scan.
			 */
			if (hammer2_debug & 0x0040)
				kprintf("deferred flush %p\n", scan);
			hammer2_chain_lock(scan, HAMMER2_RESOLVE_MAYBE);
			hammer2_chain_drop(scan);	/* ref from deferral */
			hammer2_flush(trans, &scan);
			hammer2_chain_unlock(scan);
		}

		/*
		 * [re]flush chain.
		 */
		info.diddeferral = 0;
		hammer2_flush_core(&info, &chain, 0);

		/*
		 * Only loop if deep recursions have been deferred.
		 */
		if (TAILQ_EMPTY(&info.flush_list))
			break;

		if (++loops % 1000 == 0) {
			kprintf("hammer2_flush: excessive loops on %p\n",
				chain);
			if (hammer2_debug & 0x100000)
				Debugger("hell4");
		}
	}
	hammer2_chain_drop(chain);
	*chainp = chain;
}
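
/*
 * Illustrative sketch (not compiled): the deferral mechanism above in
 * miniature.  Recursion depth is capped; nodes at the cap are queued and
 * re-flushed iteratively after the stack unwinds, so arbitrarily deep
 * topologies cannot blow out the kernel stack.  All types, fields, and
 * the depth limit below are hypothetical stand-ins for hammer2_chain_t,
 * flush_list, and HAMMER2_FLUSH_DEPTH_LIMIT.
 */
#if 0
#define TOY_DEPTH_LIMIT	10

struct toy_node;
TAILQ_HEAD(toy_list, toy_node);
struct toy_node {
	TAILQ_ENTRY(toy_node) entry;
	struct toy_node *child;
};

static void
toy_flush_core(struct toy_node *node, struct toy_list *defer, int depth)
{
	if (depth == TOY_DEPTH_LIMIT) {
		/* too deep, defer to the unwound loop in toy_flush() */
		TAILQ_INSERT_TAIL(defer, node, entry);
		return;
	}
	if (node->child)
		toy_flush_core(node->child, defer, depth + 1);
	/* ... flush node itself on the way back up ... */
}

static void
toy_flush(struct toy_node *node)
{
	struct toy_list defer;
	struct toy_node *scan;

	TAILQ_INIT(&defer);
	for (;;) {
		/* secondary recursion on deferred elements, stack popped */
		while ((scan = TAILQ_FIRST(&defer)) != NULL) {
			TAILQ_REMOVE(&defer, scan, entry);
			toy_flush(scan);
		}
		/* [re]flush from the top; may defer again */
		toy_flush_core(node, &defer, 0);
		if (TAILQ_EMPTY(&defer))
			break;
	}
}
#endif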

/*
 * This is the core of the chain flushing code.  The chain is locked by the
 * caller and must also have an extra ref on it by the caller, and remains
 * locked and will have an extra ref on return.  Upon return, the caller can
 * test the FLUSH_CREATE and FLUSH_DELETE bits to determine what action must
 * be taken on the parent.
 *
 * (1) Determine if this node is a candidate for the flush, return if it is
 *     not.  fchain and vchain are always candidates for the flush.
 *
 * (2) If we recurse too deep the chain is entered onto the deferral list and
 *     the current flush stack is aborted until after the deferral list is
 *     run.
 *
 * (3) Recursively flush live children (rbtree).  This can create deferrals.
 *     A successful flush clears the MODIFIED bit in the children.
 *
 * (4) Recursively flush deleted children (dbtree).  Deletions may be
 *     considered 'live' if the delete_tid is beyond the flush_tid.  If
 *     considered 'dead' the recursion is still needed in order to clean
 *     up the chain.  This can create deferrals.
 *
 *     A successful flush clears the MODIFIED bit in the children.
 *
 * (5) Calculate block table updates on chain based on the children scans
 *     in (3) and (4) by testing the FLUSH_CREATE and FLUSH_DELETE bits,
 *     modifying chain if necessary to perform the block table updates.
 *     Deletions must be removed from dbtree when removed from the
 *     chain's block table.
 *
 *     If 'chain' itself is marked DELETED but treated as live, the block
 *     table update(s) must be propagated to all contemporary chains.  In
 *     fact, all contemporary chains must be locked and updated uninterrupted
 *     to avoid lookup races.  Once MODIFIED and FLUSH_CREATE is cleared,
 *     a chain can be unloaded from memory with the expectation that it can
 *     be reloaded later via the block table at any time.
 *
 *			WARNING ON BREF MODIFY_TID/MIRROR_TID
 *
 * blockref.modify_tid and blockref.mirror_tid are consistent only within a
 * PFS.  This is why we cannot cache sync_tid in the transaction structure.
 * Instead we access it from the pmp.
 */
static void
hammer2_flush_core(hammer2_flush_info_t *info, hammer2_chain_t **chainp,
		   int deleting)
{
	hammer2_chain_t *chain = *chainp;
	hammer2_chain_t *saved_parent;
	hammer2_mount_t *hmp;
	hammer2_pfsmount_t *pmp;
	hammer2_chain_core_t *core;
	int diddeferral;
	int saved_domodify;

	hmp = chain->hmp;
	pmp = chain->pmp;
	core = chain->core;
	diddeferral = info->diddeferral;

	/*
	 * (1) Check if we even have any work to do.
	 *
	 * This bit of code is capable of short-cutting entire sub-trees
	 * if they have not been touched or if they have already been
	 * flushed.
	 */
	if (/*(chain->flags & HAMMER2_CHAIN_MODIFIED) == 0 &&*/
	    (chain->update_xlo >= info->sync_xid ||	/* already synced */
	     chain->update_xlo >= chain->update_xhi)) {	/* old/unchanged */
		/* update_xlo/_xhi already filters chain out, do not update */
		/* don't update bref.mirror_tid, pass2 is not called */
		return;
	}

	/*
	 * mirror_tid should not be forward-indexed
	 */
	KKASSERT(chain->bref.mirror_tid <= pmp->flush_tid);

	/*
	 * Ignore chains modified beyond the current flush point.  These
	 * will be treated as if they did not exist.  Subchains with lower
	 * modify_xid's will still be accessible via other parents.
	 *
	 * Do not update bref.mirror_tid here, it will interfere with
	 * synchronization.  e.g. inode flush tid 1, concurrent D-D tid 2,
	 * then later on inode flush tid 2.  If we were to set mirror_tid
	 * to 1 during inode flush tid 1 the blockrefs would only be partially
	 * updated (and likely panic).
	 *
	 * We must update chain->update_xlo here to prevent re-entry in this
	 * flush transaction.
	 *
	 * (vchain and fchain are exceptions since they cannot be duplicated)
	 */
	if (chain->modify_xid > info->sync_xid &&
	    chain != &hmp->fchain && chain != &hmp->vchain) {
		/* do not update bref.mirror_tid, pass2 ignores chain */
		/* chain->update_xlo = info->sync_xid; */
		return;
	}

	/*
	 * (2) Recurse downward and check recursion depth.
	 * (3) Flush live children
	 * (4) Flush deleted children
	 *
	 * We adjust update_xlo if not deferring chain to prevent re-entry
	 * in this flush cycle, but it must be set AFTER the flush in case
	 * a deeper flush hits the chain.  Otherwise the deeper flush cannot
	 * complete.  We re-check the condition after finishing the flushes.
	 *
	 * update_xhi was already checked and prevents initial recursions on
	 * subtrees which have not been modified.
	 */
	saved_parent = info->parent;
	saved_domodify = info->domodify;
	info->parent = chain;
	info->domodify = 0;

	if (chain->flags & HAMMER2_CHAIN_DEFERRED) {
		++info->diddeferral;
	} else if (info->depth == HAMMER2_FLUSH_DEPTH_LIMIT) {
		if ((chain->flags & HAMMER2_CHAIN_DEFERRED) == 0) {
			hammer2_chain_ref(chain);
			TAILQ_INSERT_TAIL(&info->flush_list,
					  chain, flush_node);
			atomic_set_int(&chain->flags,
				       HAMMER2_CHAIN_DEFERRED);
		}
		++info->diddeferral;
	} else {
		hammer2_chain_t *scan;

		/*
		 * The flush is queue-agnostic when running pass1, but order
		 * is important to catch any races where an existing
		 * flush-visible child is moved from rbtree->dbtree/dbq.
		 *
		 * New children added by concurrent operations are not visible
		 * to the flush anyway so we don't care about those races.
		 * However, the flush itself can move a child from dbq to
		 * dbtree (rare in pass1 but it is possible).
		 *
		 * pass1 can handle re-execution of a child.
		 */
		spin_lock(&core->cst.spin);
		KKASSERT(core->good == 0x1234 && core->sharecnt > 0);
		RB_SCAN(hammer2_chain_tree, &core->rbtree,
			NULL, hammer2_flush_pass1, info);
		RB_SCAN(hammer2_chain_tree, &core->dbtree,
			NULL, hammer2_flush_pass1, info);
		scan = TAILQ_FIRST(&core->dbq);
		while (scan) {
			KKASSERT(scan->flags & HAMMER2_CHAIN_ONDBQ);
			hammer2_flush_pass1(scan, info);
			if (scan->flags & HAMMER2_CHAIN_ONDBQ)
				scan = TAILQ_NEXT(scan, db_entry);
			else
				scan = TAILQ_FIRST(&core->dbq);
		}
		spin_unlock(&core->cst.spin);
	}

	/*
	 * Stop if deferred, do not update update_xlo.
	 */
	if (info->diddeferral) {
		goto done;
	}

	/*
	 * If a block table update is needed place the parent in a modified
	 * state, which might delete-duplicate it.
	 *
	 * - To prevent loops and other confusion, we synchronize update_xlo
	 *   for the original chain.
	 *
	 * - The original parent will not be used by the flush so we can
	 *   clear its MODIFIED bit.
	 */
	if (info->domodify) {
		hammer2_chain_modify(info->trans, &info->parent, 0);
		if (info->parent != chain) {
			/*
			 * chain	- old
			 * info->parent - new
			 *
			 * NOTE: bref.mirror_tid cannot be updated
			 *	 unless MODIFIED is cleared or already
			 *	 clear.
			 */
			chain->inode_reason += 0x10000000;
			info->parent->inode_reason += 0x100;
			KKASSERT(info->parent->core == chain->core);
			if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
				atomic_clear_int(&chain->flags,
						HAMMER2_CHAIN_MODIFIED);
				hammer2_pfs_memory_wakeup(pmp);
				hammer2_chain_drop(chain);
			}
#if 0
			if (chain->flags & HAMMER2_CHAIN_FLUSH_CREATE) {
				atomic_clear_int(&chain->flags,
						HAMMER2_CHAIN_FLUSH_CREATE);
				hammer2_chain_drop(chain);
			}
			if (info->parent->flags & HAMMER2_CHAIN_FLUSH_DELETE) {
				atomic_clear_int(&info->parent->flags,
						HAMMER2_CHAIN_FLUSH_DELETE);
				hammer2_chain_drop(info->parent);
			}
#endif
			if (chain->update_xlo < info->sync_xid)
				chain->update_xlo = info->sync_xid;
			KKASSERT(info->parent->update_xlo < info->sync_xid);
			hammer2_chain_drop(chain);
			hammer2_chain_ref(info->parent);
		}
		chain = info->parent;
	}

	/*
	 * If a blocktable update is needed determine if this is the last
	 * parent requiring modification (check all parents using the core).
	 *
	 * Set bit 1 (0x02) of domodify if this is the last parent,
	 * which will cause scan2 to clear FLUSH_CREATE and FLUSH_DELETE.
	 */
	if (1) {
		hammer2_chain_t *scan;

		spin_lock(&core->cst.spin);
		TAILQ_FOREACH(scan, &core->ownerq, core_entry) {
			/*
			 * Ignore the current parent being processed (we do
			 * not adjust update_xlo until after the fixup).
			 */
			if (scan == chain)
				continue;

			/*
			 * Ignore chains which have already been updated
			 * Ignore unmodified chains (lo >= hi).
			 */
			if ((scan->flags & HAMMER2_CHAIN_MODIFIED) == 0 &&
			    (scan->update_xlo >= info->sync_xid ||
			     scan->update_xlo >= scan->update_xhi)) {
				continue;
			}

			/*
			 * Cannot exhaust all parents if one is not visible
			 * to the flush.  The root chains are special-cased
			 * because they cannot really be delete-duplicated.
			 */
			if (scan != &scan->hmp->fchain &&
			    scan != &scan->hmp->vchain &&
			    scan->modify_xid > info->sync_xid) {
				break;
			}

			/*
			 * Fail if update_xlo has not been synchronized to
			 * at least our sync_xid on any modified parent chain.
			 */
			if (scan->update_xlo < info->sync_xid)
				break;
		}
		spin_unlock(&core->cst.spin);
		if (scan == NULL)
			info->domodify |= 2;
	}

	/*
	 * (5) Calculate block table updates or child cleanups.
	 *     (this whole operation has to be atomic)
	 *
	 * domodify 0x01 - block table updates
	 * 	    0x02 - child cleanups
	 *
	 *	pass2 - Process deletions from dbtree and dbq.
	 *	pass3 - Process insertions from rbtree, dbtree, and dbq.
	 *	pass4 - Cleanup child flags on the last parent and
	 *		Adjust queues on the live parent (deletions).
	 *	pass5 - Cleanup child flags on the last parent and
	 *		Adjust queues on the live parent (insertions).
	 *
	 *	Queue adjustments had to be separated into deletions and
	 *	insertions because both can occur on dbtree.
	 */
	if (info->domodify) {
		hammer2_chain_t *scan;

		spin_lock(&core->cst.spin);

		while ((info->domodify & 1) && info->parent) {
			/* PASS2 - Deletions */
			RB_SCAN(hammer2_chain_tree, &core->rbtree,
				NULL, hammer2_flush_pass2, info);
			RB_SCAN(hammer2_chain_tree, &core->dbtree,
				NULL, hammer2_flush_pass2, info);
			scan = TAILQ_FIRST(&core->dbq);
			TAILQ_FOREACH(scan, &core->dbq, db_entry) {
				KKASSERT(scan->flags & HAMMER2_CHAIN_ONDBQ);
				hammer2_flush_pass2(scan, info);
			}

			/* PASS3 - Insertions */
			RB_SCAN(hammer2_chain_tree, &core->rbtree,
				NULL, hammer2_flush_pass3, info);
			RB_SCAN(hammer2_chain_tree, &core->dbtree,
				NULL, hammer2_flush_pass3, info);
			TAILQ_FOREACH(scan, &core->dbq, db_entry) {
				KKASSERT(scan->flags & HAMMER2_CHAIN_ONDBQ);
				hammer2_flush_pass3(scan, info);
			}
			info->parent = TAILQ_NEXT(info->parent, core_entry);
			if (info->parent)
				kprintf("FLUSH SPECIAL UPDATE (%p) %p.%d %08x\n",
					chain, info->parent,
					info->parent->bref.type,
					info->parent->flags);
		}
		info->parent = chain;

		/* PASS4 - Cleanup */
		RB_SCAN(hammer2_chain_tree, &core->rbtree,
			NULL, hammer2_flush_pass4, info);
		scan = TAILQ_FIRST(&core->dbq);
		while (scan) {
			KKASSERT(scan->flags & HAMMER2_CHAIN_ONDBQ);
			hammer2_flush_pass4(scan, info);
			if (scan->flags & HAMMER2_CHAIN_ONDBQ)
				scan = TAILQ_NEXT(scan, db_entry);
			else
				scan = TAILQ_FIRST(&core->dbq);
		}
		RB_SCAN(hammer2_chain_tree, &core->dbtree,
			NULL, hammer2_flush_pass4, info);

		/* PASS5 - Cleanup */
		RB_SCAN(hammer2_chain_tree, &core->rbtree,
			NULL, hammer2_flush_pass5, info);
		scan = TAILQ_FIRST(&core->dbq);
		while (scan) {
			KKASSERT(scan->flags & HAMMER2_CHAIN_ONDBQ);
			hammer2_flush_pass5(scan, info);
			if (scan->flags & HAMMER2_CHAIN_ONDBQ)
				scan = TAILQ_NEXT(scan, db_entry);
			else
				scan = TAILQ_FIRST(&core->dbq);
		}
		RB_SCAN(hammer2_chain_tree, &core->dbtree,
			NULL, hammer2_flush_pass5, info);

		spin_unlock(&core->cst.spin);
	}

	/*
	 * Synchronize update_xlo to prevent reentrant block updates of this
	 * parent.
	 */
	chain->update_xlo = info->sync_xid;

	/*
	 * Skip the flush if the chain was not placed in a modified state
	 * and was not already in a modified state.
	 */
	if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0)
		goto done;

	/*
	 * FLUSH THE CHAIN (on the way back up the recursion)
	 *
	 * Chain is now deterministically being flushed and not being deferred.
	 * We've finished running the recursion and the blockref update.
	 *
	 * update bref.mirror_tid.  update_xlo has already been updated.
	 */
	chain->bref.mirror_tid = pmp->flush_tid;

	/*
	 * Dispose of the modified bit.  FLUSH_CREATE should already be
	 * set.
	 */
	KKASSERT((chain->flags & HAMMER2_CHAIN_FLUSH_CREATE) ||
		 chain == &hmp->vchain);
	atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
	hammer2_pfs_memory_wakeup(pmp);

	if ((chain->flags & HAMMER2_CHAIN_FLUSH_CREATE) ||
	    chain == &hmp->vchain ||
	    chain == &hmp->fchain) {
		/*
		 * Drop the ref from the MODIFIED bit we cleared,
		 * net -1 ref.
		 */
		hammer2_chain_drop(chain);
	} else {
		/*
		 * Drop the ref from the MODIFIED bit we cleared and
		 * set a ref for the FLUSH_CREATE bit we are setting.
		 * Net 0 refs.
		 */
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_FLUSH_CREATE);
	}

	/*
	 * Skip the actual flush operation if the chain has been deleted
	 * in our flush view.  There will be no block table entry that
	 * references it.
	 */
	if (h2ignore_deleted(info, chain))
		goto done;

	/*
	 * Issue flush.
	 *
	 * A DELETED node that reaches this point must be flushed for
	 * synchronization point consistency.
	 *
	 * Update bref.mirror_tid, clear MODIFIED, and set MOVED.
	 *
	 * The caller will update the parent's reference to this chain
	 * by testing MOVED as long as the modification was in-bounds.
	 *
	 * MOVED is never set on the volume root as there is no parent
	 * to adjust.
	 */
	if (hammer2_debug & 0x1000) {
		kprintf("Flush %p.%d %016jx/%d sync_xid=%08x data=%016jx\n",
			chain, chain->bref.type,
			chain->bref.key, chain->bref.keybits,
			info->sync_xid, chain->bref.data_off);
	}
	if (hammer2_debug & 0x2000) {
		Debugger("Flush hell");
	}

	/*
	 * If this is part of a recursive flush we can go ahead and write
	 * out the buffer cache buffer and pass a new bref back up the chain
	 * via the MOVED bit.
	 *
	 * Volume headers are NOT flushed here as they require special
	 * processing.
	 */
	switch(chain->bref.type) {
	case HAMMER2_BREF_TYPE_FREEMAP:
		KKASSERT(hmp->vchain.flags & HAMMER2_CHAIN_MODIFIED);
		hmp->voldata.freemap_tid = hmp->fchain.bref.mirror_tid;
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		/*
		 * The free block table is flushed by hammer2_vfs_sync()
		 * before it flushes vchain.  We must still hold fchain
		 * locked while copying voldata to volsync, however.
		 */
		hammer2_voldata_lock(hmp);
		hammer2_chain_lock(&hmp->fchain, HAMMER2_RESOLVE_ALWAYS);
#if 0
		if ((hmp->fchain.flags & HAMMER2_CHAIN_MODIFIED) ||
		    hmp->voldata.freemap_tid < info->trans->sync_tid) {
			/* this will modify vchain as a side effect */
			hammer2_chain_t *tmp = &hmp->fchain;
			hammer2_chain_flush(info->trans, &tmp);
			KKASSERT(tmp == &hmp->fchain);
		}
#endif

		/*
		 * There is no parent to our root vchain and fchain to
		 * synchronize the bref to, their updated mirror_tid's
		 * must be synchronized to the volume header.
		 */
		hmp->voldata.mirror_tid = chain->bref.mirror_tid;
		hmp->voldata.freemap_tid = hmp->fchain.bref.mirror_tid;
		kprintf("mirror_tid %016llx\n", chain->bref.mirror_tid);

		/*
		 * The volume header is flushed manually by the syncer, not
		 * here.  All we do here is adjust the crc's.
		 */
		KKASSERT(chain->data != NULL);
		KKASSERT(chain->dio == NULL);

		hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT1]=
			hammer2_icrc32(
				(char *)&hmp->voldata +
				 HAMMER2_VOLUME_ICRC1_OFF,
				HAMMER2_VOLUME_ICRC1_SIZE);
		hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT0]=
			hammer2_icrc32(
				(char *)&hmp->voldata +
				 HAMMER2_VOLUME_ICRC0_OFF,
				HAMMER2_VOLUME_ICRC0_SIZE);
		hmp->voldata.icrc_volheader =
			hammer2_icrc32(
				(char *)&hmp->voldata +
				 HAMMER2_VOLUME_ICRCVH_OFF,
				HAMMER2_VOLUME_ICRCVH_SIZE);
		hmp->volsync = hmp->voldata;
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_VOLUMESYNC);
		hammer2_chain_unlock(&hmp->fchain);
		hammer2_voldata_unlock(hmp);
		break;
	case HAMMER2_BREF_TYPE_DATA:
		/*
		 * Data elements have already been flushed via the logical
		 * file buffer cache.  Their hash was set in the bref by
		 * the vop_write code.
		 *
		 * Make sure any device buffer(s) have been flushed out here.
		 * (there aren't usually any to flush).
		 */
		break;
#if 0
	case HAMMER2_BREF_TYPE_INDIRECT:
		/*
		 * Indirect blocks may be in an INITIAL state.  Use the
		 * chain_lock() call to ensure that the buffer has been
		 * instantiated (even though it is already locked the buffer
		 * might not have been instantiated).
		 *
		 * Only write the buffer out if it is dirty, it is possible
		 * the operating system had already written out the buffer.
		 */
		hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
		KKASSERT(chain->dio != NULL);

		chain->data = NULL;
		hammer2_io_bqrelse(&chain->dio);
		hammer2_chain_unlock(chain);
		break;
#endif
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
		KKASSERT((chain->flags & HAMMER2_CHAIN_EMBEDDED) == 0);
		break;
	case HAMMER2_BREF_TYPE_INODE:
		if (chain->data->ipdata.op_flags & HAMMER2_OPFLAG_PFSROOT) {
			/*
			 * non-NULL pmp if mounted as a PFS.  We must sync
			 * fields cached in the pmp.
			 */
			hammer2_inode_data_t *ipdata;

			ipdata = &chain->data->ipdata;
			ipdata->pfs_inum = pmp->inode_tid;
		} else {
			/* can't be mounted as a PFS */
			KKASSERT((chain->flags & HAMMER2_CHAIN_PFSROOT) == 0);
		}
		KKASSERT((chain->flags & HAMMER2_CHAIN_EMBEDDED) == 0);
		break;
	default:
		KKASSERT(chain->flags & HAMMER2_CHAIN_EMBEDDED);
		panic("hammer2_flush_core: unsupported embedded bref %d",
		      chain->bref.type);
		/* NOT REACHED */
	}

	/*
	 * Final cleanup after flush
	 */
done:
	KKASSERT(chain->refs > 1);
	info->domodify = saved_domodify;
	info->parent = saved_parent;
	*chainp = chain;

	KKASSERT(chain->bref.mirror_tid <= chain->pmp->flush_tid);
}

/*
 * Flush helper pass1 (recursive)
 *
 * Flushes the children of the caller's chain (info->parent), restricted
 * by sync_xid.  Set info->domodify if the child's blockref must propagate
 * back up to the parent.
 *
 * Ripouts can move child from rbtree to dbtree or dbq but the caller's
 * flush scan order prevents any chains from being lost.  A child can be
 * executed more than once (update_xlo is used to prevent infinite recursions).
 *
 * WARNING! If we do not call hammer2_flush_core() we must update
 *	    bref.mirror_tid ourselves to indicate that the flush has
 *	    processed the child.
 *
 * WARNING! parent->core spinlock is held on entry and return.
 *
 * WARNING! Flushes do not cross PFS boundaries.  Specifically, a flush must
 *	    not cross a pfs-root boundary.
 */
static int
hammer2_flush_pass1(hammer2_chain_t *child, void *data)
{
	hammer2_flush_info_t *info = data;
	hammer2_trans_t *trans = info->trans;
	hammer2_chain_t *parent = info->parent;

	/*
	 * Child was modified in a later transaction, nothing to flush
	 * in this transaction.
	 *
	 * Remember that modifications generally delete-duplicate so if the
	 * sub-tree is dirty another child will get us there.  But not this
	 * one.
	 *
	 * (child can never be fchain or vchain so a special check isn't
	 *  needed).
	 */
	if (child->modify_xid > trans->sync_xid) {
		KKASSERT(child->delete_xid >= child->modify_xid);
		/*child->update_xlo = info->sync_xid;*/
		/* do not update mirror_tid, pass2 will ignore chain */
		return (0);
	}

	/*
	 * We must ref the child before unlocking the spinlock.
	 *
	 * The caller has added a ref to the parent so we can temporarily
	 * unlock it in order to lock the child.
	 */
	hammer2_chain_ref(child);
	spin_unlock(&parent->core->cst.spin);

	hammer2_chain_unlock(parent);
	hammer2_chain_lock(child, HAMMER2_RESOLVE_MAYBE);

	/*
	 * Never recurse across a mounted PFS boundary.
	 *
	 * Recurse and collect deferral data.  We only recursively sync
	 * (basically) if update_xlo has not been updated, indicating that
	 * the child has not already been processed.
	 */
	if ((child->flags & HAMMER2_CHAIN_PFSBOUNDARY) == 0 ||
	    child->pmp == NULL) {
		if ((child->flags & HAMMER2_CHAIN_MODIFIED) ||
		    (child->update_xlo < info->sync_xid &&
		     child->update_xlo < child->update_xhi)) {
			++info->depth;
			hammer2_flush_core(info, &child, 0); /* XXX deleting */
			--info->depth;
		}
	}

	/*
	 * Determine if domodify should be set.  Do not otherwise adjust
	 * the child or pass2 will get confused.
	 *
	 * Insertion:
	 *	- child is flagged as possibly needing block table insertion.
	 *	- child not deleted or deletion is beyond transaction id
	 *	- child created beyond parent synchronization point
	 *	- parent not deleted as-of this transaction
	 */
	if ((child->flags & HAMMER2_CHAIN_FLUSH_CREATE) &&
	    child->delete_xid > trans->sync_xid &&
	    child->modify_xid > parent->update_xlo &&
	    parent->delete_xid > trans->sync_xid) {
		info->domodify = 1;
	}

	/*
	 * Removal:
	 *	- child is flagged as possibly needing block table removal.
	 *	- child deleted before or during this transaction
	 *	- child created prior or during parent synchronization point
	 *	- parent not yet synchronized to child deletion
	 *	- parent not deleted as-of this transaction
	 */
	if ((child->flags & HAMMER2_CHAIN_FLUSH_DELETE) &&
	    child->delete_xid <= trans->sync_xid &&
	    child->modify_xid <= parent->update_xlo &&
	    child->delete_xid > parent->update_xlo &&
	    parent->delete_xid > trans->sync_xid) {
		info->domodify = 1;
	}

	/*
	 * Relock to continue the loop
	 */
	hammer2_chain_unlock(child);
	hammer2_chain_lock(parent, HAMMER2_RESOLVE_MAYBE);
	hammer2_chain_drop(child);
	KKASSERT(info->parent == parent);

	spin_lock(&parent->core->cst.spin);
	return (0);
}

/*
 * PASS2 - BLOCKTABLE DELETIONS
 */
static int
hammer2_flush_pass2(hammer2_chain_t *child, void *data)
{
	hammer2_flush_info_t *info = data;
	hammer2_chain_t *parent = info->parent;
	hammer2_mount_t *hmp = child->hmp;
	hammer2_trans_t *trans = info->trans;
	hammer2_blockref_t *base;
	int count;

	/*
	 * Prefilter - Ignore children not flagged as needing a parent
	 *	       blocktable update.
	 */
	if ((child->flags & HAMMER2_CHAIN_FLUSH_DELETE) == 0)
		return (0);

	/*
	 * Prefilter - Ignore children created after our flush_tid (not
	 *	       visible to our flush).
	 */
	if (child->modify_xid > trans->sync_xid) {
		KKASSERT(child->delete_xid >= child->modify_xid);
		return 0;
	}

	/*
	 * Prefilter - Don't bother updating the blockrefs for a deleted
	 *	       parent (from the flush's perspective).  Otherwise,
	 *	       we need to be COUNTEDBREFS synchronized for the
	 *	       hammer2_base_*() functions.
	 *
	 * NOTE: This test must match the similar one in flush_core.
	 */
	if (h2ignore_deleted(info, parent))
		return 0;

	/*
	 * Calculate blockmap pointer
	 */
	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		/*
		 * Access the inode's block array.  However, there is no
		 * block array if the inode is flagged DIRECTDATA.  The
		 * DIRECTDATA case typically only occurs when a hardlink has
		 * been shifted up the tree and the original inode gets
		 * replaced with an OBJTYPE_HARDLINK placeholder inode.
		 */
		if (parent->data &&
		    (parent->data->ipdata.op_flags &
		     HAMMER2_OPFLAG_DIRECTDATA) == 0) {
			base = &parent->data->ipdata.u.blockset.blockref[0];
		} else {
			base = NULL;
		}
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		if (parent->data)
			base = &parent->data->npdata[0];
		else
			base = NULL;
		count = parent->bytes / sizeof(hammer2_blockref_t);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		base = &hmp->voldata.sroot_blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP:
		base = &parent->data->npdata[0];
		count = HAMMER2_SET_COUNT;
		break;
	default:
		base = NULL;
		count = 0;
		panic("hammer2_flush_pass2: unrecognized blockref type: %d",
		      parent->bref.type);
	}

	/*
	 * Removal
	 *	- child is flagged for removal
	 *	- child deleted before or during this transaction
	 *	- child created prior or during parent synchronization point
	 *	- parent not yet synchronized to child's deletion
	 */
	if (child->delete_xid <= trans->sync_xid &&
	    child->modify_xid <= parent->update_xlo &&
	    child->delete_xid > parent->update_xlo) {
		/* can't assert BMAPPED because state adjustment may occur
		 * before we are done, and BMAPPED only applies to the live
		 * parent.
		 *KKASSERT(child->flags & HAMMER2_CHAIN_BMAPPED);*/
		if (base) {
			hammer2_rollup_stats(parent, child, -1);
			hammer2_base_delete(trans, parent, base, count,
					    &info->cache_index, child);
		}
	}

	return 0;
}

/*
 * PASS3 - BLOCKTABLE INSERTIONS
 */
static int
hammer2_flush_pass3(hammer2_chain_t *child, void *data)
{
	hammer2_flush_info_t *info = data;
	hammer2_chain_t *parent = info->parent;
	hammer2_mount_t *hmp = child->hmp;
	hammer2_trans_t *trans = info->trans;
	hammer2_blockref_t *base;
	int count;

	/*
	 * Prefilter - Ignore children not flagged as needing a parent
	 *	       blocktable update.
	 */
	if ((child->flags & HAMMER2_CHAIN_FLUSH_CREATE) == 0)
		return (0);

	/*
	 * Prefilter - Ignore children created after our flush_tid (not
	 *	       visible to our flush).
	 */
	if (child->modify_xid > trans->sync_xid) {
		KKASSERT(child->delete_xid >= child->modify_xid);
		return 0;
	}

	/*
	 * Prefilter - Don't bother updating the blockrefs for a deleted
	 *	       parent (from the flush's perspective).  Otherwise,
	 *	       we need to be COUNTEDBREFS synchronized for the
	 *	       hammer2_base_*() functions.
	 *
	 * NOTE: This test must match the similar one in flush_core.
	 */
	if (h2ignore_deleted(info, parent))
		return 0;

	/*
	 * Calculate blockmap pointer
	 */
	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		/*
		 * Access the inode's block array.  However, there is no
		 * block array if the inode is flagged DIRECTDATA.  The
		 * DIRECTDATA case typically only occurs when a hardlink has
		 * been shifted up the tree and the original inode gets
		 * replaced with an OBJTYPE_HARDLINK placeholder inode.
		 */
		if (parent->data &&
		    (parent->data->ipdata.op_flags &
		     HAMMER2_OPFLAG_DIRECTDATA) == 0) {
			base = &parent->data->ipdata.u.blockset.blockref[0];
		} else {
			base = NULL;
		}
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		if (parent->data)
			base = &parent->data->npdata[0];
		else
			base = NULL;
		count = parent->bytes / sizeof(hammer2_blockref_t);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		base = &hmp->voldata.sroot_blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP:
		base = &parent->data->npdata[0];
		count = HAMMER2_SET_COUNT;
		break;
	default:
		base = NULL;
		count = 0;
		panic("hammer2_flush_pass3: "
		      "unrecognized blockref type: %d",
		      parent->bref.type);
	}

	/*
	 * Insertion
	 *	- child is flagged as possibly needing block table insertion.
	 *	- child not deleted or deletion is beyond transaction id
	 *	- child created beyond parent synchronization point
	 */
	if (child->delete_xid > trans->sync_xid &&
	    child->modify_xid > parent->update_xlo) {
		if (base) {
			hammer2_rollup_stats(parent, child, 1);
			hammer2_base_insert(trans, parent, base, count,
					    &info->cache_index, child);
		}
	}

	return 0;
}
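
/*
 * Illustrative sketch (not compiled): pass2 and pass3 above duplicate the
 * blockmap-pointer calculation verbatim.  A hypothetical helper along
 * these lines could consolidate the switch; it is shown only to document
 * the shared logic and is not part of the driver.
 */
#if 0
static hammer2_blockref_t *
h2_base_and_count(hammer2_chain_t *parent, int *countp)
{
	hammer2_blockref_t *base;

	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		/* no block array if the inode is flagged DIRECTDATA */
		if (parent->data &&
		    (parent->data->ipdata.op_flags &
		     HAMMER2_OPFLAG_DIRECTDATA) == 0)
			base = &parent->data->ipdata.u.blockset.blockref[0];
		else
			base = NULL;
		*countp = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		base = parent->data ? &parent->data->npdata[0] : NULL;
		*countp = parent->bytes / sizeof(hammer2_blockref_t);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		base = &parent->hmp->voldata.sroot_blockset.blockref[0];
		*countp = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP:
		base = &parent->data->npdata[0];
		*countp = HAMMER2_SET_COUNT;
		break;
	default:
		base = NULL;
		*countp = 0;
		panic("h2_base_and_count: unrecognized blockref type: %d",
		      parent->bref.type);
	}
	return (base);
}
#endif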

/*
 * PASS4 - CLEANUP CHILDREN (non-recursive, but CAN be re-entrant)
 *
 * Adjust queues and set or clear BMAPPED appropriately if processing
 * the live parent.  pass4 handles deletions, pass5 handles insertions.
 * Separate passes are required because both deletions and insertions can
 * occur on dbtree.
 *
 * Cleanup FLUSH_CREATE/FLUSH_DELETE on the last parent.
 */
static int
hammer2_flush_pass4(hammer2_chain_t *child, void *data)
{
	hammer2_flush_info_t *info = data;
	hammer2_chain_t *parent = info->parent;
	hammer2_chain_core_t *above = child->above;
	hammer2_trans_t *trans = info->trans;

	/*
	 * Prefilter - Ignore children created after our flush_tid (not
	 *	       visible to our flush).
	 */
	if (child->modify_xid > trans->sync_xid) {
		KKASSERT(child->delete_xid >= child->modify_xid);
		return 0;
	}

	/*
	 * Ref and lock child for operation; the spinlock must be
	 * temporarily released.  Make sure child is referenced before
	 * we unlock.
	 */
	hammer2_chain_ref(child);
	spin_unlock(&above->cst.spin);
	hammer2_chain_lock(child, HAMMER2_RESOLVE_NEVER);
	KKASSERT(child->above == above);
	KKASSERT(parent->core == above);

	/*
	 * Adjust BMAPPED state and rbtree/queue only when we hit the
	 * actual live parent.
	 */
	if ((parent->flags & HAMMER2_CHAIN_DELETED) == 0) {
		spin_lock(&above->cst.spin);

		/*
		 * Deleting from blockmap, move child out of dbtree
		 * and clear BMAPPED.  Child should not be on RBTREE.
		 */
		if (child->delete_xid <= trans->sync_xid &&
		    child->modify_xid <= parent->update_xlo &&
		    child->delete_xid > parent->update_xlo &&
		    (child->flags & HAMMER2_CHAIN_BMAPPED)) {
			KKASSERT(child->flags & HAMMER2_CHAIN_ONDBTREE);
			RB_REMOVE(hammer2_chain_tree, &above->dbtree, child);
			atomic_clear_int(&child->flags, HAMMER2_CHAIN_ONDBTREE);
			atomic_clear_int(&child->flags, HAMMER2_CHAIN_BMAPPED);
		}

		/*
		 * Not on any list, place child on DBQ
		 */
		if ((child->flags & (HAMMER2_CHAIN_ONRBTREE |
				     HAMMER2_CHAIN_ONDBTREE |
				     HAMMER2_CHAIN_ONDBQ)) == 0) {
			KKASSERT((child->flags & HAMMER2_CHAIN_BMAPPED) == 0);
			TAILQ_INSERT_TAIL(&above->dbq, child, db_entry);
			atomic_set_int(&child->flags, HAMMER2_CHAIN_ONDBQ);
		}
		spin_unlock(&above->cst.spin);
	}

	/*
	 * Unlock the child.  This can wind up dropping the child's
	 * last ref, removing it from the parent's RB tree, and deallocating
	 * the structure.  The RB_SCAN() our caller is doing handles the
	 * situation.
	 */
	hammer2_chain_unlock(child);
	hammer2_chain_drop(child);
	spin_lock(&above->cst.spin);

	/*
	 * The parent may have been delete-duplicated.
	 */
	return (0);
}

static int
hammer2_flush_pass5(hammer2_chain_t *child, void *data)
{
	hammer2_flush_info_t *info = data;
	hammer2_chain_t *parent = info->parent;
	hammer2_chain_t *xchain;
	hammer2_chain_core_t *above = child->above;
	hammer2_trans_t *trans = info->trans;

	/*
	 * Prefilter - Ignore children created after our flush_tid (not
	 *	       visible to our flush).
	 */
	if (child->modify_xid > trans->sync_xid) {
		KKASSERT(child->delete_xid >= child->modify_xid);
		return 0;
	}

	/*
	 * Ref and lock child for operation; the spinlock must be
	 * temporarily released.  Make sure child is referenced before
	 * we unlock.
	 */
	hammer2_chain_ref(child);
	spin_unlock(&above->cst.spin);
	hammer2_chain_lock(child, HAMMER2_RESOLVE_NEVER);
	KKASSERT(child->above == above);
	KKASSERT(parent->core == above);

	/*
	 * Adjust BMAPPED state and rbtree/queue only when we hit the
	 * actual live parent.
	 */
	if ((parent->flags & HAMMER2_CHAIN_DELETED) == 0) {
		spin_lock(&above->cst.spin);

		/*
		 * Inserting into blockmap, place child in rbtree or dbtree.
		 */
		if (child->delete_xid > trans->sync_xid &&
		    child->modify_xid > parent->update_xlo &&
		    (child->flags & HAMMER2_CHAIN_BMAPPED) == 0) {
			if (child->flags & HAMMER2_CHAIN_ONDBQ) {
				TAILQ_REMOVE(&above->dbq, child, db_entry);
				atomic_clear_int(&child->flags,
						 HAMMER2_CHAIN_ONDBQ);
			}
			if ((child->flags & HAMMER2_CHAIN_DELETED) == 0 &&
			    (child->flags & HAMMER2_CHAIN_ONRBTREE) == 0) {
				KKASSERT((child->flags &
					  (HAMMER2_CHAIN_ONDBTREE |
					   HAMMER2_CHAIN_ONDBQ)) == 0);
				xchain = RB_INSERT(hammer2_chain_tree,
						   &above->rbtree, child);
				KKASSERT(xchain == NULL);
				atomic_set_int(&child->flags,
					       HAMMER2_CHAIN_ONRBTREE);
			} else
			if ((child->flags & HAMMER2_CHAIN_DELETED) &&
			    (child->flags & HAMMER2_CHAIN_ONDBTREE) == 0) {
				KKASSERT((child->flags &
					  (HAMMER2_CHAIN_ONRBTREE |
					   HAMMER2_CHAIN_ONDBQ)) == 0);
				xchain = RB_INSERT(hammer2_chain_tree,
						   &above->dbtree, child);
				KKASSERT(xchain == NULL);
				atomic_set_int(&child->flags,
					       HAMMER2_CHAIN_ONDBTREE);
			}
			atomic_set_int(&child->flags, HAMMER2_CHAIN_BMAPPED);
			KKASSERT(child->flags &
				 (HAMMER2_CHAIN_ONRBTREE |
				  HAMMER2_CHAIN_ONDBTREE |
				  HAMMER2_CHAIN_ONDBQ));
		}

		/*
		 * Not on any list, place child on DBQ
		 */
		if ((child->flags & (HAMMER2_CHAIN_ONRBTREE |
				     HAMMER2_CHAIN_ONDBTREE |
				     HAMMER2_CHAIN_ONDBQ)) == 0) {
			KKASSERT((child->flags & HAMMER2_CHAIN_BMAPPED) == 0);
			TAILQ_INSERT_TAIL(&above->dbq, child, db_entry);
			atomic_set_int(&child->flags, HAMMER2_CHAIN_ONDBQ);
		}
		spin_unlock(&above->cst.spin);
	}

	/*
	 * Cleanup flags on last parent iterated for flush.
	 */
	if (info->domodify & 2) {
		if (child->flags & HAMMER2_CHAIN_FLUSH_CREATE) {
			atomic_clear_int(&child->flags,
					 HAMMER2_CHAIN_FLUSH_CREATE);
			hammer2_chain_drop(child);
		}
		if ((child->flags & HAMMER2_CHAIN_FLUSH_DELETE) &&
		    child->delete_xid <= trans->sync_xid) {
			KKASSERT((parent->flags & HAMMER2_CHAIN_DELETED) ||
				 (child->flags & HAMMER2_CHAIN_ONDBTREE) == 0);
			/* XXX delete-duplicate chain insertion mech wrong */
			KKASSERT((parent->flags & HAMMER2_CHAIN_DELETED) ||
				 (child->flags & HAMMER2_CHAIN_BMAPPED) == 0);
			atomic_clear_int(&child->flags,
					 HAMMER2_CHAIN_FLUSH_DELETE);
			hammer2_chain_drop(child);
		}
	}

	/*
	 * Unlock the child.  This can wind up dropping the child's
	 * last ref, removing it from the parent's RB tree, and deallocating
	 * the structure.  The RB_SCAN() our caller is doing handles the
	 * situation.
	 */
	hammer2_chain_unlock(child);
	hammer2_chain_drop(child);
	spin_lock(&above->cst.spin);

	/*
	 * The parent may have been delete-duplicated.
	 */
	return (0);
}
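
/*
 * Summary of the child list membership managed by pass4/pass5 on the live
 * parent (an aid only; inferred from the code above):
 *
 *	rbtree - live (not deleted) children
 *	dbtree - deleted children still present in the block table (BMAPPED)
 *	dbq    - deleted children with no block table presence
 *
 * pass4 moves blockmap-deleted children out of dbtree and clears BMAPPED;
 * pass5 moves blockmap-inserted children out of dbq into rbtree or dbtree
 * and sets BMAPPED.  A child left on no list at all is parked on dbq.
 */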

static void
hammer2_rollup_stats(hammer2_chain_t *parent, hammer2_chain_t *child, int how)
{
#if 0
	hammer2_chain_t *grandp;
#endif

	parent->data_count += child->data_count;
	parent->inode_count += child->inode_count;
	child->data_count = 0;
	child->inode_count = 0;
	if (how < 0) {
		parent->data_count -= child->bytes;
		if (child->bref.type == HAMMER2_BREF_TYPE_INODE) {
			parent->inode_count -= 1;
#if 0
			/* XXX child->data may be NULL atm */
			parent->data_count -= child->data->ipdata.data_count;
			parent->inode_count -= child->data->ipdata.inode_count;
#endif
		}
	} else if (how > 0) {
		parent->data_count += child->bytes;
		if (child->bref.type == HAMMER2_BREF_TYPE_INODE) {
			parent->inode_count += 1;
#if 0
			/* XXX child->data may be NULL atm */
			parent->data_count += child->data->ipdata.data_count;
			parent->inode_count += child->data->ipdata.inode_count;
#endif
		}
	}
	if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) {
		parent->data->ipdata.data_count += parent->data_count;
		parent->data->ipdata.inode_count += parent->inode_count;
#if 0
		for (grandp = parent->above->first_parent;
		     grandp;
		     grandp = grandp->next_parent) {
			grandp->data_count += parent->data_count;
			grandp->inode_count += parent->inode_count;
		}
#endif
		parent->data_count = 0;
		parent->inode_count = 0;
	}
}
1657