1 /*
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 2022 Tomohiro Kusumi <tkusumi@netbsd.org>
5  * Copyright (c) 2011-2022 The DragonFly Project.  All rights reserved.
6  *
7  * This code is derived from software contributed to The DragonFly Project
8  * by Matthew Dillon <dillon@dragonflybsd.org>
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  *
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in
18  *    the documentation and/or other materials provided with the
19  *    distribution.
20  * 3. Neither the name of The DragonFly Project nor the names of its
21  *    contributors may be used to endorse or promote products derived
22  *    from this software without specific, prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
25  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
26  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
27  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
28  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
29  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
30  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
31  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
32  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
33  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
34  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35  * SUCH DAMAGE.
36  */
37 /*
38  *			TRANSACTION AND FLUSH HANDLING
39  *
40  * Deceptively simple but actually fairly difficult to implement properly is
41  * how I would describe it.
42  *
 * Flushing generally occurs bottom-up but requires a top-down scan to
 * locate chains with MODIFIED and/or UPDATE bits set.  The ONFLUSH flag
 * tells the scan where to recurse downward to find these chains.
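 *
 * A rough sketch of the flag interplay as implemented below:
 *
 *	MODIFIED - the chain's media data is dirty and must be written out.
 *	UPDATE	 - the chain's blockref must be updated in its parent.
 *	ONFLUSH	 - set on the ancestors of such chains, hinting to the
 *		   top-down scan where recursion is required.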
46  */
47 
48 /*
49 #include <sys/cdefs.h>
50 #include <sys/param.h>
51 #include <sys/systm.h>
52 #include <sys/types.h>
53 #include <sys/lock.h>
54 #include <sys/vnode.h>
55 #include <sys/buf.h>
56 */
57 
58 #include "hammer2.h"
59 
60 #define HAMMER2_FLUSH_DEPTH_LIMIT	60      /* stack recursion limit */
61 
62 
63 /*
64  * Recursively flush the specified chain.  The chain is locked and
65  * referenced by the caller and will remain so on return.  The chain
66  * will remain referenced throughout but can temporarily lose its
67  * lock during the recursion to avoid unnecessarily stalling user
68  * processes.
69  */
70 struct hammer2_flush_info {
	hammer2_chain_t *parent;		/* parent of chains being scanned */
	int		depth;			/* current recursion depth */
	int		error;			/* cumulative error */
	int		flags;			/* HAMMER2_FLUSH_* flags */
75 #ifdef HAMMER2_SCAN_DEBUG
76 	long		scan_count;
77 	long		scan_mod_count;
78 	long		scan_upd_count;
79 	long		scan_onf_count;
80 	long		scan_del_count;
81 	long		scan_btype[7];
82 #endif
83 };
84 
85 typedef struct hammer2_flush_info hammer2_flush_info_t;
86 
87 static int hammer2_flush_core(hammer2_flush_info_t *info,
88 				hammer2_chain_t *chain, int flags);
89 static int hammer2_flush_recurse(hammer2_chain_t *child, void *data);
90 
91 /*
92  * Any per-pfs transaction initialization goes here.
93  */
94 void
95 hammer2_trans_manage_init(hammer2_pfs_t *pmp)
96 {
97 }
98 
99 /*
100  * Transaction support for any modifying operation.  Transactions are used
101  * in the pmp layer by the frontend and in the spmp layer by the backend.
102  *
103  * 0			- Normal transaction.  Interlocks against just the
104  *			  COPYQ portion of an ISFLUSH transaction.
105  *
106  * TRANS_ISFLUSH	- Flush transaction.  Interlocks against other flush
107  *			  transactions.
108  *
109  *			  When COPYQ is also specified, waits for the count
110  *			  to drop to 1.
111  *
112  * TRANS_BUFCACHE	- Buffer cache transaction.  No interlock.
113  *
114  * TRANS_SIDEQ		- Run the sideq (only tested in trans_done())
115  *
 * Initializing a new transaction allocates a transaction ID.  This
 * function is passed the pmp the (cluster) transaction runs against.
 * (An older variant could instead be handed a NULL pmp and a non-NULL
 * hmp to indicate a transaction on a single media target; that mode
 * was used by the recovery code.)
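 *
 * Typical bracketing by a modifying frontend operation (illustrative
 * sketch only; see the actual callers for details):
 *
 *	hammer2_trans_init(pmp, 0);
 *	mtid = hammer2_trans_sub(pmp);
 *	... issue modifying XOP(s), stamped with mtid ...
 *	hammer2_trans_done(pmp, HAMMER2_TRANS_SIDEQ);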
120  */
121 void
122 hammer2_trans_init(hammer2_pfs_t *pmp, uint32_t flags)
123 {
124 	uint32_t oflags;
125 	uint32_t nflags;
126 	int dowait;
127 
128 	for (;;) {
129 		oflags = pmp->trans.flags;
130 		cpu_ccfence();
131 		dowait = 0;
132 
133 		if (flags & HAMMER2_TRANS_ISFLUSH) {
134 			/*
135 			 * Interlock against other flush transactions.
136 			 */
137 			if (oflags & HAMMER2_TRANS_ISFLUSH) {
138 				nflags = oflags | HAMMER2_TRANS_WAITING;
139 				dowait = 1;
140 			} else {
141 				nflags = (oflags | flags) + 1;
142 			}
143 		} else if (flags & HAMMER2_TRANS_BUFCACHE) {
144 			/*
145 			 * Requesting strategy transaction from buffer-cache,
146 			 * or a VM getpages/putpages through the buffer cache.
147 			 * We must allow such transactions in all situations
148 			 * to avoid deadlocks.
149 			 */
150 			nflags = (oflags | flags) + 1;
151 		} else {
152 			/*
153 			 * Normal transaction.  We do not interlock against
154 			 * BUFCACHE or ISFLUSH.
155 			 *
156 			 * Note that vnode locks may be held going into
157 			 * this call.
158 			 *
159 			 * NOTE: Remember that non-modifying operations
160 			 *	 such as read, stat, readdir, etc, do
161 			 *	 not use transactions.
162 			 */
163 			nflags = (oflags | flags) + 1;
164 		}
165 		if (dowait)
166 			tsleep_interlock(&pmp->trans.sync_wait, 0);
167 		if (atomic_cmpset_int(&pmp->trans.flags, oflags, nflags)) {
168 			if (dowait == 0)
169 				break;
170 			tsleep(&pmp->trans.sync_wait, PINTERLOCKED,
171 			       "h2trans", hz);
172 			/* retry */
173 		} else {
174 			cpu_pause();
175 			/* retry */
176 		}
177 		/* retry */
178 	}
179 
180 #if 0
181 	/*
182 	 * When entering a FLUSH transaction with COPYQ set, wait for the
183 	 * transaction count to drop to 1 (our flush transaction only)
184 	 * before proceeding.
185 	 *
186 	 * This waits for all non-flush transactions to complete and blocks
187 	 * new non-flush transactions from starting until COPYQ is cleared.
188 	 * (the flush will then proceed after clearing COPYQ).  This should
189 	 * be a very short stall on modifying operations.
190 	 */
191 	while ((flags & HAMMER2_TRANS_ISFLUSH) &&
192 	       (flags & HAMMER2_TRANS_COPYQ)) {
193 		oflags = pmp->trans.flags;
194 		cpu_ccfence();
195 		if ((oflags & HAMMER2_TRANS_MASK) == 1)
196 			break;
197 		nflags = oflags | HAMMER2_TRANS_WAITING;
198 		tsleep_interlock(&pmp->trans.sync_wait, 0);
199 		if (atomic_cmpset_int(&pmp->trans.flags, oflags, nflags)) {
200 			tsleep(&pmp->trans.sync_wait, PINTERLOCKED,
201 			       "h2trans2", hz);
202 		}
203 	}
204 #endif
205 }
206 
207 /*
 * Start a sub-transaction; there is no 'subdone' function.  This will
209  * issue a new modify_tid (mtid) for the current transaction, which is a
210  * CLC (cluster level change) id and not a per-node id.
211  *
212  * This function must be called for each XOP when multiple XOPs are run in
213  * sequence within a transaction.
214  *
215  * Callers typically update the inode with the transaction mtid manually
216  * to enforce sequencing.
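 *
 * Illustrative pattern when running multiple XOPs in one transaction:
 *
 *	mtid = hammer2_trans_sub(pmp);
 *	... run first XOP, stamping its modifications with mtid ...
 *	mtid = hammer2_trans_sub(pmp);
 *	... run next XOP with the new mtid ...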
217  */
218 hammer2_tid_t
219 hammer2_trans_sub(hammer2_pfs_t *pmp)
220 {
221 	hammer2_tid_t mtid;
222 
223 	mtid = atomic_fetchadd_64(&pmp->modify_tid, 1);
224 
225 	return (mtid);
226 }
227 
228 void
229 hammer2_trans_setflags(hammer2_pfs_t *pmp, uint32_t flags)
230 {
231 	atomic_set_int(&pmp->trans.flags, flags);
232 }
233 
234 /*
235  * Typically used to clear trans flags asynchronously.  If TRANS_WAITING
236  * is in the mask, and was previously set, this function will wake up
237  * any waiters.
238  */
239 void
240 hammer2_trans_clearflags(hammer2_pfs_t *pmp, uint32_t flags)
241 {
242 	uint32_t oflags;
243 	uint32_t nflags;
244 
245 	for (;;) {
246 		oflags = pmp->trans.flags;
247 		cpu_ccfence();
248 		nflags = oflags & ~flags;
249 		if (atomic_cmpset_int(&pmp->trans.flags, oflags, nflags)) {
250 			if ((oflags ^ nflags) & HAMMER2_TRANS_WAITING)
251 				wakeup(&pmp->trans.sync_wait);
252 			break;
253 		}
254 		cpu_pause();
255 		/* retry */
256 	}
257 }
258 
259 void
260 hammer2_trans_done(hammer2_pfs_t *pmp, uint32_t flags)
261 {
262 	uint32_t oflags;
263 	uint32_t nflags;
264 
265 #if 0
266 	/*
267 	 * Modifying ops on the front-end can cause dirty inodes to
268 	 * build up in the sideq.  We don't flush these on inactive/reclaim
269 	 * due to potential deadlocks, so we have to deal with them from
270 	 * inside other nominal modifying front-end transactions.
271 	 */
272 	if ((flags & HAMMER2_TRANS_SIDEQ) &&
273 	    pmp->sideq_count > hammer2_limit_dirty_inodes / 2 &&
274 	    pmp->sideq_count > (pmp->inum_count >> 3) &&
275 	    pmp->mp) {
276 		speedup_syncer(pmp->mp);
277 	}
278 #endif
279 
280 	/*
281 	 * Clean-up the transaction.  Wakeup any waiters when finishing
282 	 * a flush transaction or transitioning the non-flush transaction
283 	 * count from 2->1 while a flush transaction is pending.
284 	 */
285 	for (;;) {
286 		oflags = pmp->trans.flags;
287 		cpu_ccfence();
288 		KKASSERT(oflags & HAMMER2_TRANS_MASK);
289 
290 		nflags = (oflags - 1) & ~flags;
291 		if (flags & HAMMER2_TRANS_ISFLUSH) {
292 			nflags &= ~HAMMER2_TRANS_WAITING;
293 		}
294 		if ((oflags & (HAMMER2_TRANS_ISFLUSH|HAMMER2_TRANS_MASK)) ==
295 		    (HAMMER2_TRANS_ISFLUSH|2)) {
296 			nflags &= ~HAMMER2_TRANS_WAITING;
297 		}
298 		if (atomic_cmpset_int(&pmp->trans.flags, oflags, nflags)) {
299 			if ((oflags ^ nflags) & HAMMER2_TRANS_WAITING)
300 				wakeup(&pmp->trans.sync_wait);
301 			break;
302 		}
303 		cpu_pause();
304 		/* retry */
305 	}
306 }
307 
308 /*
309  * Obtain new, unique inode number (not serialized by caller).
310  */
311 hammer2_tid_t
312 hammer2_trans_newinum(hammer2_pfs_t *pmp)
313 {
314 	hammer2_tid_t tid;
315 
316 	tid = atomic_fetchadd_64(&pmp->inode_tid, 1);
317 
318 	return tid;
319 }
320 
321 /*
 * Assert that a strategy call is ok here.  Currently we allow strategy
 * calls in all situations, including during flushes.  Historically they
 * were only allowed from within a normal transaction.
325  */
326 void
327 hammer2_trans_assert_strategy(hammer2_pfs_t *pmp)
328 {
329 #if 0
330 	KKASSERT((pmp->trans.flags & HAMMER2_TRANS_ISFLUSH) == 0);
331 #endif
332 }
333 
334 /*
335  * Flush the chain and all modified sub-chains through the specified
336  * synchronization point, propagating blockref updates back up.  As
337  * part of this propagation, mirror_tid and inode/data usage statistics
 * propagate back upward.
339  *
340  * Returns a HAMMER2 error code, 0 if no error.  Note that I/O errors from
341  * buffers dirtied during the flush operation can occur later.
342  *
343  * modify_tid (clc - cluster level change) is not propagated.
344  *
345  * update_tid (clc) is used for validation and is not propagated by this
346  * function.
347  *
348  * This routine can be called from several places but the most important
349  * is from VFS_SYNC (frontend) via hammer2_xop_inode_flush (backend).
350  *
351  * chain is locked on call and will remain locked on return.  The chain's
352  * UPDATE flag indicates that its parent's block table (which is not yet
353  * part of the flush) should be updated.
354  *
355  * flags:
356  *	HAMMER2_FLUSH_TOP	Indicates that this is the top of the flush.
357  *				Is cleared for the recursion.
358  *
359  *	HAMMER2_FLUSH_ALL	Recurse everything
360  *
361  *	HAMMER2_FLUSH_INODE_STOP
362  *				Stop at PFS inode or normal inode boundary
363  */
364 int
365 hammer2_flush(hammer2_chain_t *chain, int flags)
366 {
367 	hammer2_flush_info_t info;
368 	hammer2_dev_t *hmp;
369 	int loops;
370 
371 	/*
372 	 * Execute the recursive flush and handle deferrals.
373 	 *
374 	 * Chains can be ridiculously long (thousands deep), so to
375 	 * avoid blowing out the kernel stack the recursive flush has a
376 	 * depth limit.  Elements at the limit are placed on a list
377 	 * for re-execution after the stack has been popped.
378 	 */
379 	bzero(&info, sizeof(info));
380 	info.flags = flags & ~HAMMER2_FLUSH_TOP;
381 
382 	/*
383 	 * Calculate parent (can be NULL), if not NULL the flush core
384 	 * expects the parent to be referenced so it can easily lock/unlock
385 	 * it without it getting ripped up.
386 	 */
387 	if ((info.parent = chain->parent) != NULL)
388 		hammer2_chain_ref(info.parent);
389 
390 	/*
391 	 * Extra ref needed because flush_core expects it when replacing
392 	 * chain.
393 	 */
394 	hammer2_chain_ref(chain);
395 	hmp = chain->hmp;
396 	loops = 0;
397 
398 	for (;;) {
399 		/*
400 		 * [re]flush chain as the deep recursion may have generated
401 		 * additional modifications.
402 		 */
403 		if (info.parent != chain->parent) {
404 			if (hammer2_debug & 0x0040) {
405 				kprintf("LOST CHILD4 %p->%p "
406 					"(actual parent %p)\n",
407 					info.parent, chain, chain->parent);
408 			}
409 			hammer2_chain_drop(info.parent);
410 			info.parent = chain->parent;
411 			hammer2_chain_ref(info.parent);
412 		}
413 		if (hammer2_flush_core(&info, chain, flags) == 0)
414 			break;
415 
416 		if (++loops % 1000 == 0) {
417 			kprintf("hammer2_flush: excessive loops on %p\n",
418 				chain);
419 			if (hammer2_debug & 0x100000)
420 				Debugger("hell4");
421 		}
422 	}
423 #ifdef HAMMER2_SCAN_DEBUG
	if (info.scan_count >= 10)
		kprintf("hammer2_flush: scan_count %ld (%ld,%ld,%ld,%ld) "
			"bt(%ld,%ld,%ld,%ld,%ld,%ld)\n",
			info.scan_count,
			info.scan_mod_count,
			info.scan_upd_count,
			info.scan_onf_count,
			info.scan_del_count,
			info.scan_btype[1],
			info.scan_btype[2],
			info.scan_btype[3],
			info.scan_btype[4],
			info.scan_btype[5],
			info.scan_btype[6]);
438 #endif
439 	hammer2_chain_drop(chain);
440 	if (info.parent)
441 		hammer2_chain_drop(info.parent);
442 	return (info.error);
443 }
444 
445 /*
446  * This is the core of the chain flushing code.  The chain is locked by the
447  * caller and must also have an extra ref on it by the caller, and remains
448  * locked and will have an extra ref on return.  info.parent is referenced
449  * but not locked.
450  *
451  * Upon return, the caller can test the UPDATE bit on the chain to determine
452  * if the parent needs updating.
453  *
454  * If non-zero is returned, the chain's parent changed during the flush and
455  * the caller must retry the operation.
456  *
457  * (1) Determine if this node is a candidate for the flush, return if it is
458  *     not.  fchain and vchain are always candidates for the flush.
459  *
 * (2) Recursing deeper than HAMMER2_FLUSH_DEPTH_LIMIT is treated as a
 *     fatal condition and panics.
463  *
464  * (3) Recursively flush live children (rbtree).  This can create deferrals.
465  *     A successful flush clears the MODIFIED and UPDATE bits on the children
466  *     and typically causes the parent to be marked MODIFIED as the children
467  *     update the parent's block table.  A parent might already be marked
 *     MODIFIED due to a deletion (whose blocktable update in the parent is
469  *     handled by the frontend), or if the parent itself is modified by the
470  *     frontend for other reasons.
471  *
472  * (4) Permanently disconnected sub-trees are cleaned up by the front-end.
473  *     Deleted-but-open inodes can still be individually flushed via the
474  *     filesystem syncer.
475  *
476  * (5) Delete parents on the way back up if they are normal indirect blocks
477  *     and have no children.
478  *
479  * (6) Note that an unmodified child may still need the block table in its
480  *     parent updated (e.g. rename/move).  The child will have UPDATE set
481  *     in this case.
482  *
483  *			WARNING ON BREF MODIFY_TID/MIRROR_TID
484  *
485  * blockref.modify_tid is consistent only within a PFS, and will not be
486  * consistent during synchronization.  mirror_tid is consistent across the
487  * block device regardless of the PFS.
488  */
489 static int
490 hammer2_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain,
491 		   int flags)
492 {
493 	hammer2_chain_t *parent;
494 	hammer2_dev_t *hmp;
495 	int save_error;
496 	int retry;
497 
498 	retry = 0;
499 
500 	/*
501 	 * (1) Optimize downward recursion to locate nodes needing action.
502 	 *     Nothing to do if none of these flags are set.
503 	 */
504 	if ((chain->flags & HAMMER2_CHAIN_FLUSH_MASK) == 0)
505 		return 0;
506 
507 	hmp = chain->hmp;
508 
509 	/*
510 	 * NOTE: parent can be NULL, usually due to destroy races.
511 	 */
512 	parent = info->parent;
513 	KKASSERT(chain->parent == parent);
514 
515 	/*
516 	 * Downward search recursion
517 	 *
518 	 * We must be careful on cold stops, which often occur on inode
519 	 * boundaries due to the way hammer2_vfs_sync() sequences the flush.
520 	 * Be sure to issue an appropriate chain_setflush()
521 	 */
522 	if ((chain->flags & HAMMER2_CHAIN_PFSBOUNDARY) &&
523 	    (flags & HAMMER2_FLUSH_ALL) == 0 &&
524 	    (flags & HAMMER2_FLUSH_TOP) == 0 &&
525 	    chain->pmp && chain->pmp->mp) {
526 		/*
527 		 * If FLUSH_ALL is not specified the caller does not want
528 		 * to recurse through PFS roots that have been mounted.
529 		 *
530 		 * (If the PFS has not been mounted there may not be
		 *  anything monitoring its chains and it's up to us
532 		 *  to flush it).
533 		 *
534 		 * The typical sequence is to flush dirty PFS's starting at
535 		 * their root downward, then flush the device root (vchain).
536 		 * It is this second flush that typically leaves out the
537 		 * ALL flag.
538 		 *
539 		 * However we must still process the PFSROOT chains for block
540 		 * table updates in their parent (which IS part of our flush).
541 		 *
542 		 * NOTE: The volume root, vchain, does not set PFSBOUNDARY.
543 		 *
		 * NOTE: We must re-set ONFLUSH in the parent so that a
		 *	 later flush revisits this chain (which we are
		 *	 skipping) if it still requires work.
546 		 */
547 		if (chain->flags & (HAMMER2_CHAIN_ONFLUSH |
548 				    HAMMER2_CHAIN_DESTROY |
549 				    HAMMER2_CHAIN_MODIFIED)) {
550 			hammer2_chain_setflush(parent);
551 		}
552 		goto done;
553 	} else if (chain->bref.type == HAMMER2_BREF_TYPE_INODE &&
554 		   (flags & HAMMER2_FLUSH_INODE_STOP) &&
555 		   (flags & HAMMER2_FLUSH_ALL) == 0 &&
556 		   (flags & HAMMER2_FLUSH_TOP) == 0 &&
557 		   chain->pmp && chain->pmp->mp) {
558 		/*
559 		 * When FLUSH_INODE_STOP is specified we are being asked not
560 		 * to include any inode changes for inodes we encounter,
561 		 * with the exception of the inode that the flush began with.
562 		 * So: INODE, INODE_STOP, and TOP==0 basically.
563 		 *
564 		 * Dirty inodes are flushed based on the hammer2_inode
565 		 * in-memory structure, issuing a chain_setflush() here
566 		 * will only cause unnecessary traversals of the topology.
567 		 */
568 		goto done;
569 #if 0
570 		/*
571 		 * If FLUSH_INODE_STOP is specified and both ALL and TOP
572 		 * are clear, we must not flush the chain.  The chain should
573 		 * have already been flushed and any further ONFLUSH/UPDATE
574 		 * setting will be related to the next flush.
575 		 *
		 * This feature allows us to flush inodes independently of
577 		 * each other and meta-data above the inodes separately.
578 		 */
579 		if (chain->flags & (HAMMER2_CHAIN_ONFLUSH |
580 				    HAMMER2_CHAIN_DESTROY |
581 				    HAMMER2_CHAIN_MODIFIED)) {
582 			if (parent)
583 				hammer2_chain_setflush(parent);
584 		}
585 #endif
586 	} else if (info->depth == HAMMER2_FLUSH_DEPTH_LIMIT) {
587 		/*
588 		 * Recursion depth reached.
589 		 */
590 		panic("hammer2: flush depth limit");
591 	} else if (chain->flags & (HAMMER2_CHAIN_ONFLUSH |
592 				   HAMMER2_CHAIN_DESTROY)) {
593 		/*
594 		 * Downward recursion search (actual flush occurs bottom-up).
		 * Pre-clear ONFLUSH.  It can get set again due to races or
596 		 * flush errors, which we want so the scan finds us again in
597 		 * the next flush.
598 		 *
599 		 * We must also recurse if DESTROY is set so we can finally
600 		 * get rid of the related children, otherwise the node will
601 		 * just get re-flushed on lastdrop.
602 		 *
603 		 * WARNING!  The recursion will unlock/relock info->parent
604 		 *	     (which is 'chain'), potentially allowing it
605 		 *	     to be ripped up.
606 		 */
607 		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONFLUSH);
608 		save_error = info->error;
609 		info->error = 0;
610 		info->parent = chain;
611 
612 		/*
613 		 * We may have to do this twice to catch any indirect
614 		 * block maintenance that occurs.
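		 *
		 * (e.g. hammer2_chain_indirect_maintenance() run on behalf
		 * of a child can delete or collapse indirect blocks under
		 * this chain, re-setting ONFLUSH; the second scan picks
		 * that up.)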
615 		 */
616 		hammer2_spin_ex(&chain->core.spin);
617 		RB_SCAN(hammer2_chain_tree, &chain->core.rbtree,
618 			NULL, hammer2_flush_recurse, info);
619 		if (chain->flags & HAMMER2_CHAIN_ONFLUSH) {
620 			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONFLUSH);
621 			RB_SCAN(hammer2_chain_tree, &chain->core.rbtree,
622 				NULL, hammer2_flush_recurse, info);
623 		}
624 		hammer2_spin_unex(&chain->core.spin);
625 		info->parent = parent;
626 
627 		/*
628 		 * Re-set the flush bits if the flush was incomplete or
629 		 * an error occurred.  If an error occurs it is typically
630 		 * an allocation error.  Errors do not cause deferrals.
631 		 */
632 		if (info->error)
633 			hammer2_chain_setflush(chain);
634 		info->error |= save_error;
635 
636 		/*
637 		 * If we lost the parent->chain association we have to
638 		 * stop processing this chain because it is no longer
639 		 * in this recursion.  If it moved, it will be handled
640 		 * by the ONFLUSH flag elsewhere.
641 		 */
642 		if (chain->parent != parent) {
643 			kprintf("LOST CHILD2 %p->%p (actual parent %p)\n",
644 				parent, chain, chain->parent);
645 			goto done;
646 		}
647 	}
648 
649 	/*
650 	 * Now we are in the bottom-up part of the recursion.
651 	 *
652 	 * We continue to try to update the chain on lower-level errors, but
653 	 * the flush code may decide not to flush the volume root.
654 	 *
655 	 * XXX should we continue to try to update the chain if an error
656 	 *     occurred?
657 	 */
658 
659 	/*
	 * Both parent and chain must be locked in order to flush the chain
	 * and to properly update the parent under certain conditions.
662 	 *
663 	 * In addition, we can't safely unlock/relock the chain once we
664 	 * start flushing the chain itself, which we would have to do later
665 	 * on in order to lock the parent if we didn't do that now.
666 	 */
667 	hammer2_chain_ref_hold(chain);
668 	hammer2_chain_unlock(chain);
669 	if (parent)
670 		hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
671 	hammer2_chain_lock(chain, HAMMER2_RESOLVE_MAYBE);
672 	hammer2_chain_drop_unhold(chain);
673 
674 	/*
675 	 * Can't process if we can't access their content.
676 	 */
677 	if ((parent && parent->error) || chain->error) {
678 		kprintf("hammer2: chain error during flush\n");
679 		info->error |= chain->error;
680 		if (parent) {
681 			info->error |= parent->error;
682 			hammer2_chain_unlock(parent);
683 		}
684 		goto done;
685 	}
686 
687 	if (chain->parent != parent) {
688 		if (hammer2_debug & 0x0040) {
689 			kprintf("LOST CHILD3 %p->%p (actual parent %p)\n",
690 				parent, chain, chain->parent);
691 		}
692 		KKASSERT(parent != NULL);
693 		hammer2_chain_unlock(parent);
694 		retry = 1;
695 		goto done;
696 	}
697 
698 	/*
699 	 * Propagate the DESTROY flag downwards.  This dummies up the flush
700 	 * code and tries to invalidate related buffer cache buffers to
701 	 * avoid the disk write.
702 	 */
703 	if (parent && (parent->flags & HAMMER2_CHAIN_DESTROY))
704 		atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY);
705 
706 	/*
707 	 * Dispose of the modified bit.
708 	 *
709 	 * If parent is present, the UPDATE bit should already be set.
710 	 * UPDATE should already be set.
711 	 * bref.mirror_tid should already be set.
712 	 */
713 	if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
714 		KKASSERT((chain->flags & HAMMER2_CHAIN_UPDATE) ||
715 			 chain->parent == NULL);
716 		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
717 		atomic_add_long(&hammer2_count_modified_chains, -1);
718 
719 		/*
720 		 * Manage threads waiting for excessive dirty memory to
721 		 * be retired.
722 		 */
723 		if (chain->pmp)
724 			hammer2_pfs_memory_wakeup(chain->pmp, -1);
725 
726 #if 0
727 		if ((chain->flags & HAMMER2_CHAIN_UPDATE) == 0 &&
728 		    chain != &hmp->vchain &&
729 		    chain != &hmp->fchain) {
730 			/*
731 			 * Set UPDATE bit indicating that the parent block
732 			 * table requires updating.
733 			 */
734 			atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
735 		}
736 #endif
737 
738 		/*
739 		 * Issue the flush.  This is indirect via the DIO.
740 		 *
741 		 * NOTE: A DELETED node that reaches this point must be
742 		 *	 flushed for synchronization point consistency.
743 		 *
744 		 * NOTE: Even though MODIFIED was already set, the related DIO
745 		 *	 might not be dirty due to a system buffer cache
746 		 *	 flush and must be set dirty if we are going to make
747 		 *	 further modifications to the buffer.  Chains with
748 		 *	 embedded data don't need this.
749 		 */
750 		if (hammer2_debug & 0x1000) {
751 			kprintf("Flush %p.%d %016jx/%d data=%016jx\n",
752 				chain, chain->bref.type,
753 				(uintmax_t)chain->bref.key,
754 				chain->bref.keybits,
755 				(uintmax_t)chain->bref.data_off);
756 		}
757 
758 		/*
759 		 * Update chain CRCs for flush.
760 		 *
761 		 * NOTE: Volume headers are NOT flushed here as they require
762 		 *	 special processing.
763 		 */
764 		switch(chain->bref.type) {
765 		case HAMMER2_BREF_TYPE_FREEMAP:
766 			/*
767 			 * Update the volume header's freemap_tid to the
768 			 * freemap's flushing mirror_tid.
769 			 *
770 			 * (note: embedded data, do not call setdirty)
771 			 */
772 			KKASSERT(hmp->vchain.flags & HAMMER2_CHAIN_MODIFIED);
773 			KKASSERT(chain == &hmp->fchain);
774 			hmp->voldata.freemap_tid = chain->bref.mirror_tid;
775 			if (hammer2_debug & 0x8000) {
776 				/* debug only, avoid syslogd loop */
777 				kprintf("sync freemap mirror_tid %08jx\n",
778 					(intmax_t)chain->bref.mirror_tid);
779 			}
780 
781 			/*
782 			 * The freemap can be flushed independently of the
783 			 * main topology, but for the case where it is
784 			 * flushed in the same transaction, and flushed
785 			 * before vchain (a case we want to allow for
786 			 * performance reasons), make sure modifications
787 			 * made during the flush under vchain use a new
788 			 * transaction id.
789 			 *
790 			 * Otherwise the mount recovery code will get confused.
791 			 */
792 			++hmp->voldata.mirror_tid;
793 			break;
794 		case HAMMER2_BREF_TYPE_VOLUME:
795 			/*
796 			 * The free block table is flushed by
797 			 * hammer2_vfs_sync() before it flushes vchain.
798 			 * We must still hold fchain locked while copying
799 			 * voldata to volsync, however.
800 			 *
			 * These do not error per se since their data does
802 			 * not need to be re-read from media on lock.
803 			 *
804 			 * (note: embedded data, do not call setdirty)
805 			 */
806 			hammer2_chain_lock(&hmp->fchain,
807 					   HAMMER2_RESOLVE_ALWAYS);
808 			hammer2_voldata_lock(hmp);
809 			if (hammer2_debug & 0x8000) {
810 				/* debug only, avoid syslogd loop */
811 				kprintf("sync volume  mirror_tid %08jx\n",
812 					(intmax_t)chain->bref.mirror_tid);
813 			}
814 
815 			/*
816 			 * Update the volume header's mirror_tid to the
817 			 * main topology's flushing mirror_tid.  It is
818 			 * possible that voldata.mirror_tid is already
819 			 * beyond bref.mirror_tid due to the bump we made
820 			 * above in BREF_TYPE_FREEMAP.
821 			 */
822 			if (hmp->voldata.mirror_tid < chain->bref.mirror_tid) {
823 				hmp->voldata.mirror_tid =
824 					chain->bref.mirror_tid;
825 			}
826 
827 			/*
828 			 * The volume header is flushed manually by the
829 			 * syncer, not here.  All we do here is adjust the
830 			 * crc's.
831 			 */
832 			KKASSERT(chain->data != NULL);
833 			KKASSERT(chain->dio == NULL);
834 
835 			hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT1]=
836 				hammer2_icrc32(
837 					(char *)&hmp->voldata +
838 					 HAMMER2_VOLUME_ICRC1_OFF,
839 					HAMMER2_VOLUME_ICRC1_SIZE);
840 			hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT0]=
841 				hammer2_icrc32(
842 					(char *)&hmp->voldata +
843 					 HAMMER2_VOLUME_ICRC0_OFF,
844 					HAMMER2_VOLUME_ICRC0_SIZE);
845 			hmp->voldata.icrc_volheader =
846 				hammer2_icrc32(
847 					(char *)&hmp->voldata +
848 					 HAMMER2_VOLUME_ICRCVH_OFF,
849 					HAMMER2_VOLUME_ICRCVH_SIZE);
850 
851 			if (hammer2_debug & 0x8000) {
852 				/* debug only, avoid syslogd loop */
853 				kprintf("syncvolhdr %016jx %016jx\n",
854 					hmp->voldata.mirror_tid,
855 					hmp->vchain.bref.mirror_tid);
856 			}
857 			hmp->volsync = hmp->voldata;
858 			atomic_set_int(&chain->flags, HAMMER2_CHAIN_VOLUMESYNC);
859 			hammer2_voldata_unlock(hmp);
860 			hammer2_chain_unlock(&hmp->fchain);
861 			break;
862 		case HAMMER2_BREF_TYPE_DATA:
863 			/*
864 			 * Data elements have already been flushed via the
865 			 * logical file buffer cache.  Their hash was set in
866 			 * the bref by the vop_write code.  Do not re-dirty.
867 			 *
868 			 * Make sure any device buffer(s) have been flushed
869 			 * out here (there aren't usually any to flush) XXX.
870 			 */
871 			break;
872 		case HAMMER2_BREF_TYPE_INDIRECT:
873 		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
874 		case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
875 			/*
876 			 * Buffer I/O will be cleaned up when the volume is
877 			 * flushed (but the kernel is free to flush it before
878 			 * then, as well).
879 			 */
880 			hammer2_chain_setcheck(chain, chain->data);
881 			break;
882 		case HAMMER2_BREF_TYPE_DIRENT:
883 			/*
884 			 * A directory entry can use the check area to store
			 * the filename for filenames <= 64 bytes; don't blow
886 			 * it up!
887 			 */
888 			if (chain->bytes)
889 				hammer2_chain_setcheck(chain, chain->data);
890 			break;
891 		case HAMMER2_BREF_TYPE_INODE:
892 			/*
893 			 * NOTE: We must call io_setdirty() to make any late
894 			 *	 changes to the inode data, the system might
895 			 *	 have already flushed the buffer.
896 			 */
897 			if (chain->data->ipdata.meta.op_flags &
898 			    HAMMER2_OPFLAG_PFSROOT) {
899 				/*
900 				 * non-NULL pmp if mounted as a PFS.  We must
901 				 * sync fields cached in the pmp? XXX
902 				 */
903 				hammer2_inode_data_t *ipdata;
904 
905 				hammer2_io_setdirty(chain->dio);
906 				ipdata = &chain->data->ipdata;
907 				if (chain->pmp) {
908 					ipdata->meta.pfs_inum =
909 						chain->pmp->inode_tid;
910 				}
911 			} else {
912 				/* can't be mounted as a PFS */
913 			}
914 
915 			hammer2_chain_setcheck(chain, chain->data);
916 			break;
917 		default:
918 			panic("hammer2_flush_core: unsupported "
919 			      "embedded bref %d",
920 			      chain->bref.type);
921 			/* NOT REACHED */
922 		}
923 
924 		/*
925 		 * If the chain was destroyed try to avoid unnecessary I/O
926 		 * that might not have yet occurred.  Remove the data range
		 * from dedup candidacy and attempt to invalidate the
		 * potentially dirty portion of the I/O buffer.
929 		 */
930 		if (chain->flags & HAMMER2_CHAIN_DESTROY) {
931 			hammer2_io_dedup_delete(hmp,
932 						chain->bref.type,
933 						chain->bref.data_off,
934 						chain->bytes);
935 #if 0
936 			hammer2_io_t *dio;
937 			if (chain->dio) {
938 				hammer2_io_inval(chain->dio,
939 						 chain->bref.data_off,
940 						 chain->bytes);
941 			} else if ((dio = hammer2_io_getquick(hmp,
942 						  chain->bref.data_off,
943 						  chain->bytes,
944 						  1)) != NULL) {
945 				hammer2_io_inval(dio,
946 						 chain->bref.data_off,
947 						 chain->bytes);
948 				hammer2_io_putblk(&dio);
949 			}
950 #endif
951 		}
952 	}
953 
954 	/*
955 	 * If UPDATE is set the parent block table may need to be updated.
956 	 * This can fail if the hammer2_chain_modify() fails.
957 	 *
958 	 * NOTE: UPDATE may be set on vchain or fchain in which case
959 	 *	 parent could be NULL, or on an inode that has not yet
960 	 *	 been inserted into the radix tree.  It's easiest to allow
961 	 *	 the case and test for NULL.  parent can also wind up being
962 	 *	 NULL due to a deletion so we need to handle the case anyway.
963 	 *
964 	 * NOTE: UPDATE can be set when chains are renamed into or out of
965 	 *	 an indirect block, without the chain itself being flagged
966 	 *	 MODIFIED.
967 	 *
968 	 * If no parent exists we can just clear the UPDATE bit.  If the
969 	 * chain gets reattached later on the bit will simply get set
970 	 * again.
971 	 */
972 	if ((chain->flags & HAMMER2_CHAIN_UPDATE) && parent == NULL)
973 		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
974 
975 	/*
976 	 * When flushing an inode outside of a FLUSH_FSSYNC we must NOT
977 	 * update the parent block table to point at the flushed inode.
978 	 * The block table should only ever be updated by the filesystem
979 	 * sync code.  If we do, inode<->inode dependencies (such as
980 	 * directory entries vs inode nlink count) can wind up not being
981 	 * flushed together and result in a broken topology if a crash/reboot
982 	 * occurs at the wrong time.
983 	 */
984 	if (chain->bref.type == HAMMER2_BREF_TYPE_INODE &&
985 	    (flags & HAMMER2_FLUSH_INODE_STOP) &&
986 	    (flags & HAMMER2_FLUSH_FSSYNC) == 0 &&
987 	    (flags & HAMMER2_FLUSH_ALL) == 0 &&
988 	    chain->pmp && chain->pmp->mp) {
989 #ifdef HAMMER2_DEBUG_SYNC
990 		kprintf("inum %ld do not update parent, non-fssync\n",
991 			(long)chain->bref.key);
992 #endif
993 		goto skipupdate;
994 	}
995 #ifdef HAMMER2_DEBUG_SYNC
996 	if (chain->bref.type == HAMMER2_BREF_TYPE_INODE)
997 		kprintf("inum %ld update parent\n", (long)chain->bref.key);
998 #endif
999 
1000 	/*
	 * The chain may need its blockrefs updated in the parent (the
	 * normal path).
1003 	 */
1004 	if (chain->flags & HAMMER2_CHAIN_UPDATE) {
1005 		hammer2_blockref_t *base;
1006 		int count;
1007 
1008 		/*
1009 		 * Clear UPDATE flag, mark parent modified, update its
1010 		 * modify_tid if necessary, and adjust the parent blockmap.
1011 		 */
1012 		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
1013 
1014 		/*
1015 		 * (optional code)
1016 		 *
1017 		 * Avoid actually modifying and updating the parent if it
1018 		 * was flagged for destruction.  This can greatly reduce
1019 		 * disk I/O in large tree removals because the
1020 		 * hammer2_io_setinval() call in the upward recursion
1021 		 * (see MODIFIED code above) can only handle a few cases.
1022 		 */
1023 		if (parent->flags & HAMMER2_CHAIN_DESTROY) {
1024 			if (parent->bref.modify_tid < chain->bref.modify_tid) {
1025 				parent->bref.modify_tid =
1026 					chain->bref.modify_tid;
1027 			}
1028 			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_BLKMAPPED |
1029 							HAMMER2_CHAIN_BLKMAPUPD);
1030 			goto skipupdate;
1031 		}
1032 
1033 		/*
1034 		 * The flusher is responsible for deleting empty indirect
1035 		 * blocks at this point.  If we don't do this, no major harm
1036 		 * will be done but the empty indirect blocks will stay in
		 * the topology and make it messy and inefficient.
1038 		 *
1039 		 * The flusher is also responsible for collapsing the
1040 		 * content of an indirect block into its parent whenever
1041 		 * possible (with some hysteresis).  Not doing this will also
1042 		 * not harm the topology, but would make it messy and
1043 		 * inefficient.
1044 		 */
1045 		if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
1046 			if (hammer2_chain_indirect_maintenance(parent, chain))
1047 				goto skipupdate;
1048 		}
1049 
1050 		/*
1051 		 * We are updating the parent's blockmap, the parent must
1052 		 * be set modified.  If this fails we re-set the UPDATE flag
1053 		 * in the child.
1054 		 *
1055 		 * NOTE! A modification error can be ENOSPC.  We still want
1056 		 *	 to flush modified chains recursively, not break out,
1057 		 *	 so we just skip the update in this situation and
1058 		 *	 continue.  That is, we still need to try to clean
1059 		 *	 out dirty chains and buffers.
1060 		 *
1061 		 *	 This may not help bulkfree though. XXX
1062 		 */
1063 		save_error = hammer2_chain_modify(parent, 0, 0, 0);
1064 		if (save_error) {
1065 			info->error |= save_error;
1066 			kprintf("hammer2_flush: %016jx.%02x error=%08x\n",
1067 				parent->bref.data_off, parent->bref.type,
1068 				save_error);
1069 			atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
1070 			goto skipupdate;
1071 		}
1072 		if (parent->bref.modify_tid < chain->bref.modify_tid)
1073 			parent->bref.modify_tid = chain->bref.modify_tid;
1074 
1075 		/*
1076 		 * Calculate blockmap pointer
1077 		 */
1078 		switch(parent->bref.type) {
1079 		case HAMMER2_BREF_TYPE_INODE:
1080 			/*
1081 			 * Access the inode's block array.  However, there is
1082 			 * no block array if the inode is flagged DIRECTDATA.
1083 			 */
1084 			if (parent->data &&
1085 			    (parent->data->ipdata.meta.op_flags &
1086 			     HAMMER2_OPFLAG_DIRECTDATA) == 0) {
1087 				base = &parent->data->
1088 					ipdata.u.blockset.blockref[0];
1089 			} else {
1090 				base = NULL;
1091 			}
1092 			count = HAMMER2_SET_COUNT;
1093 			break;
1094 		case HAMMER2_BREF_TYPE_INDIRECT:
1095 		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1096 			if (parent->data)
1097 				base = &parent->data->npdata[0];
1098 			else
1099 				base = NULL;
1100 			count = parent->bytes / sizeof(hammer2_blockref_t);
1101 			break;
1102 		case HAMMER2_BREF_TYPE_VOLUME:
1103 			base = &chain->hmp->voldata.sroot_blockset.blockref[0];
1104 			count = HAMMER2_SET_COUNT;
1105 			break;
1106 		case HAMMER2_BREF_TYPE_FREEMAP:
1107 			base = &parent->data->npdata[0];
1108 			count = HAMMER2_SET_COUNT;
1109 			break;
1110 		default:
1111 			base = NULL;
1112 			count = 0;
1113 			panic("hammer2_flush_core: "
1114 			      "unrecognized blockref type: %d",
1115 			      parent->bref.type);
1116 			break;
1117 		}
1118 
1119 		/*
1120 		 * Blocktable updates
1121 		 *
1122 		 * We synchronize pending statistics at this time.  Delta
1123 		 * adjustments designated for the current and upper level
1124 		 * are synchronized.
1125 		 */
1126 		if (base && (chain->flags & HAMMER2_CHAIN_BLKMAPUPD)) {
1127 			if (chain->flags & HAMMER2_CHAIN_BLKMAPPED) {
1128 				hammer2_spin_ex(&parent->core.spin);
1129 				hammer2_base_delete(parent, base, count, chain,
1130 						    NULL);
1131 				hammer2_spin_unex(&parent->core.spin);
1132 				/* base_delete clears both bits */
1133 			} else {
1134 				atomic_clear_int(&chain->flags,
1135 						 HAMMER2_CHAIN_BLKMAPUPD);
1136 			}
1137 		}
1138 		if (base && (chain->flags & HAMMER2_CHAIN_BLKMAPPED) == 0) {
1139 			hammer2_spin_ex(&parent->core.spin);
1140 			hammer2_base_insert(parent, base, count,
1141 					    chain, &chain->bref);
1142 			hammer2_spin_unex(&parent->core.spin);
1143 			/* base_insert sets BLKMAPPED */
1144 		}
1145 	}
1146 skipupdate:
1147 	if (parent)
1148 		hammer2_chain_unlock(parent);
1149 
1150 	/*
1151 	 * Final cleanup after flush
1152 	 */
1153 done:
1154 	KKASSERT(chain->refs > 0);
1155 
1156 	return retry;
1157 }
1158 
1159 /*
1160  * Flush recursion helper, called from flush_core, calls flush_core.
1161  *
 * Flushes the children of the caller's chain (info->parent).
1164  *
1165  * This function may set info->error as a side effect.
1166  *
1167  * WARNING! If we do not call hammer2_flush_core() we must update
1168  *	    bref.mirror_tid ourselves to indicate that the flush has
1169  *	    processed the child.
1170  *
1171  * WARNING! parent->core spinlock is held on entry and return.
1172  */
1173 static int
1174 hammer2_flush_recurse(hammer2_chain_t *child, void *data)
1175 {
1176 	hammer2_flush_info_t *info = data;
1177 	hammer2_chain_t *parent = info->parent;
1178 
1179 #ifdef HAMMER2_SCAN_DEBUG
1180 	++info->scan_count;
1181 	if (child->flags & HAMMER2_CHAIN_MODIFIED)
1182 		++info->scan_mod_count;
1183 	if (child->flags & HAMMER2_CHAIN_UPDATE)
1184 		++info->scan_upd_count;
1185 	if (child->flags & HAMMER2_CHAIN_ONFLUSH)
1186 		++info->scan_onf_count;
1187 #endif
1188 
1189 	/*
1190 	 * (child can never be fchain or vchain so a special check isn't
1191 	 *  needed).
1192 	 *
1193 	 * We must ref the child before unlocking the spinlock.
1194 	 *
1195 	 * The caller has added a ref to the parent so we can temporarily
1196 	 * unlock it in order to lock the child.  However, if it no longer
1197 	 * winds up being the child of the parent we must skip this child.
1198 	 *
1199 	 * NOTE! chain locking errors are fatal.  They are never out-of-space
1200 	 *	 errors.
1201 	 */
1202 	hammer2_chain_ref(child);
1203 	hammer2_spin_unex(&parent->core.spin);
1204 
1205 	hammer2_chain_ref_hold(parent);
1206 	hammer2_chain_unlock(parent);
1207 	hammer2_chain_lock(child, HAMMER2_RESOLVE_MAYBE);
1208 	if (child->parent != parent) {
1209 		kprintf("LOST CHILD1 %p->%p (actual parent %p)\n",
1210 			parent, child, child->parent);
1211 		goto done;
1212 	}
1213 	if (child->error) {
1214 		kprintf("CHILD ERROR DURING FLUSH LOCK %p->%p\n",
1215 			parent, child);
1216 		info->error |= child->error;
1217 		goto done;
1218 	}
1219 
1220 	/*
1221 	 * Must propagate the DESTROY flag downwards, otherwise the
1222 	 * parent could end up never being removed because it will
1223 	 * be requeued to the flusher if it survives this run due to
1224 	 * the flag.
1225 	 */
1226 	if (parent && (parent->flags & HAMMER2_CHAIN_DESTROY))
1227 		atomic_set_int(&child->flags, HAMMER2_CHAIN_DESTROY);
1228 #ifdef HAMMER2_SCAN_DEBUG
1229 	if (child->flags & HAMMER2_CHAIN_DESTROY)
1230 		++info->scan_del_count;
1231 #endif
1232 	/*
1233 	 * Special handling of the root inode.  Because the root inode
1234 	 * contains an index of all the inodes in the PFS in addition to
1235 	 * its normal directory entries, any flush that is not part of a
1236 	 * filesystem sync must only flush the directory entries, and not
1237 	 * anything else.
1238 	 *
1239 	 * The child might be an indirect block, but H2 guarantees that
1240 	 * the key-range will fully partition the inode index from the
1241 	 * directory entries so the case just works naturally.
1242 	 */
1243 	if ((parent->bref.flags & HAMMER2_BREF_FLAG_PFSROOT) &&
1244 	    (child->flags & HAMMER2_CHAIN_DESTROY) == 0 &&
1245 	    parent->bref.type == HAMMER2_BREF_TYPE_INODE &&
1246 	    (info->flags & HAMMER2_FLUSH_FSSYNC) == 0) {
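		/*
		 * Child keys lacking DIRHASH_VISIBLE belong to the inode
		 * index rather than to visible directory entries; skip
		 * them here, re-flagging the parent so a later filesystem
		 * sync picks them up.
		 */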
1247 		if ((child->bref.key & HAMMER2_DIRHASH_VISIBLE) == 0) {
1248 			if (child->flags & HAMMER2_CHAIN_FLUSH_MASK) {
1249 				hammer2_chain_setflush(parent);
1250 			}
1251 			goto done;
1252 		}
1253 	}
1254 
1255 	/*
1256 	 * Recurse and collect deferral data.  We're in the media flush,
1257 	 * this can cross PFS boundaries.
1258 	 */
1259 	if (child->flags & HAMMER2_CHAIN_FLUSH_MASK) {
1260 #ifdef HAMMER2_SCAN_DEBUG
1261 		if (child->bref.type < 7)
1262 			++info->scan_btype[child->bref.type];
1263 #endif
1264 		++info->depth;
1265 		hammer2_flush_core(info, child, info->flags);
1266 		--info->depth;
1267 	}
1268 
1269 done:
1270 	/*
1271 	 * Relock to continue the loop.
1272 	 */
1273 	hammer2_chain_unlock(child);
1274 	hammer2_chain_lock(parent, HAMMER2_RESOLVE_MAYBE);
1275 	hammer2_chain_drop_unhold(parent);
1276 	if (parent->error) {
1277 		kprintf("PARENT ERROR DURING FLUSH LOCK %p->%p\n",
1278 			parent, child);
1279 		info->error |= parent->error;
1280 	}
1281 	hammer2_chain_drop(child);
1282 	KKASSERT(info->parent == parent);
1283 	hammer2_spin_ex(&parent->core.spin);
1284 
1285 	return (0);
1286 }
1287 
1288 /*
1289  * flush helper (backend threaded)
1290  *
1291  * Flushes chain topology for the specified inode.
1292  *
1293  * HAMMER2_XOP_INODE_STOP	The flush recursion stops at inode boundaries.
1294  *				Inodes belonging to the same flush are flushed
1295  *				separately.
1296  *
1297  * chain->parent can be NULL, usually due to destroy races or detached inodes.
1298  *
1299  * Primarily called from vfs_sync().
1300  */
1301 void
1302 hammer2_xop_inode_flush(hammer2_xop_t *arg, void *scratch __unused, int clindex)
1303 {
1304 	hammer2_xop_flush_t *xop = &arg->xop_flush;
1305 	hammer2_chain_t *chain;
1306 	hammer2_inode_t *ip;
1307 	hammer2_dev_t *hmp;
1308 	hammer2_pfs_t *pmp;
1309 	hammer2_devvp_t *e;
1310 	struct vnode *devvp;
1311 	int flush_error = 0;
1312 	int fsync_error = 0;
1313 	int total_error = 0;
1314 	int j;
1315 	int xflags;
1316 	int ispfsroot = 0;
1317 
1318 	xflags = HAMMER2_FLUSH_TOP;
1319 	if (xop->head.flags & HAMMER2_XOP_INODE_STOP)
1320 		xflags |= HAMMER2_FLUSH_INODE_STOP;
1321 	if (xop->head.flags & HAMMER2_XOP_FSSYNC)
1322 		xflags |= HAMMER2_FLUSH_FSSYNC;
1323 
1324 	/*
1325 	 * Flush core chains
1326 	 */
1327 	ip = xop->head.ip1;
1328 	pmp = ip->pmp;
1329 	chain = hammer2_inode_chain(ip, clindex, HAMMER2_RESOLVE_ALWAYS);
1330 	if (chain) {
1331 		hmp = chain->hmp;
1332 		if (chain->flags & HAMMER2_CHAIN_FLUSH_MASK) {
1333 			/*
1334 			 * Due to flush partitioning the chain topology
1335 			 * above the inode's chain may no longer be flagged.
1336 			 * When asked to flush an inode, remark the topology
1337 			 * leading to that inode.
1338 			 */
1339 			if (chain->parent)
1340 				hammer2_chain_setflush(chain->parent);
1341 			hammer2_flush(chain, xflags);
1342 
1343 			/* XXX cluster */
1344 			if (ip == pmp->iroot && pmp != hmp->spmp) {
1345 				hammer2_spin_ex(&pmp->inum_spin);
1346 				pmp->pfs_iroot_blocksets[clindex] =
1347 					chain->data->ipdata.u.blockset;
1348 				hammer2_spin_unex(&pmp->inum_spin);
1349 			}
1350 
1351 #if 0
1352 			/*
			 * Propagate upwards but only cross an inode boundary
1354 			 * for inodes associated with the current filesystem
1355 			 * sync.
1356 			 */
1357 			if ((xop->head.flags & HAMMER2_XOP_PARENTONFLUSH) ||
1358 			    chain->bref.type != HAMMER2_BREF_TYPE_INODE) {
1359 				parent = chain->parent;
1360 				if (parent)
1361 					hammer2_chain_setflush(parent);
1362 			}
1363 #endif
1364 		}
1365 		if (chain->flags & HAMMER2_CHAIN_PFSBOUNDARY)
1366 			ispfsroot = 1;
1367 		hammer2_chain_unlock(chain);
1368 		hammer2_chain_drop(chain);
1369 		chain = NULL;
1370 	} else {
1371 		hmp = NULL;
1372 	}
1373 
1374 	/*
	 * Only flush the volume header if asked to, and only if the inode
	 * is also the PFS root.
1377 	 */
1378 	if ((xop->head.flags & HAMMER2_XOP_VOLHDR) == 0)
1379 		goto skip;
1380 	if (ispfsroot == 0)
1381 		goto skip;
1382 
1383 	/*
	 * Flush volume roots.  Avoid replication; we only want to
1385 	 * flush each hammer2_dev (hmp) once.
1386 	 */
1387 	for (j = clindex - 1; j >= 0; --j) {
1388 		if ((chain = ip->cluster.array[j].chain) != NULL) {
1389 			if (chain->hmp == hmp) {
1390 				chain = NULL;	/* safety */
1391 				goto skip;
1392 			}
1393 		}
1394 	}
1395 	chain = NULL;	/* safety */
1396 
1397 	/*
1398 	 * spmp transaction.  The super-root is never directly mounted so
1399 	 * there shouldn't be any vnodes, let alone any dirty vnodes
1400 	 * associated with it, so we shouldn't have to mess around with any
1401 	 * vnode flushes here.
1402 	 */
1403 	hammer2_trans_init(hmp->spmp, HAMMER2_TRANS_ISFLUSH);
1404 
1405 	/*
1406 	 * We must flush the superroot down to the PFS iroot.  Remember
1407 	 * that hammer2_chain_setflush() stops at inode boundaries, so
1408 	 * the pmp->iroot has been flushed and flagged down to the superroot,
1409 	 * but the volume root (vchain) probably has not yet been flagged.
1410 	 */
1411 	if (hmp->spmp->iroot) {
1412 		chain = hmp->spmp->iroot->cluster.array[0].chain;
1413 		if (chain) {
1414 			hammer2_chain_ref(chain);
1415 			hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
1416 			flush_error |=
1417 				hammer2_flush(chain,
1418 					      HAMMER2_FLUSH_TOP |
1419 					      HAMMER2_FLUSH_INODE_STOP |
1420 					      HAMMER2_FLUSH_FSSYNC);
1421 			hammer2_chain_unlock(chain);
1422 			hammer2_chain_drop(chain);
1423 		}
1424 	}
1425 
1426 	/*
1427 	 * Media mounts have two 'roots', vchain for the topology
1428 	 * and fchain for the free block table.  Flush both.
1429 	 *
1430 	 * Note that the topology and free block table are handled
1431 	 * independently, so the free block table can wind up being
1432 	 * ahead of the topology.  We depend on the bulk free scan
1433 	 * code to deal with any loose ends.
1434 	 *
1435 	 * vchain and fchain do not error on-lock since their data does
1436 	 * not have to be re-read from media.
1437 	 */
1438 	hammer2_chain_ref(&hmp->vchain);
1439 	hammer2_chain_lock(&hmp->vchain, HAMMER2_RESOLVE_ALWAYS);
1440 	hammer2_chain_ref(&hmp->fchain);
1441 	hammer2_chain_lock(&hmp->fchain, HAMMER2_RESOLVE_ALWAYS);
1442 	if (hmp->fchain.flags & HAMMER2_CHAIN_FLUSH_MASK) {
1443 		/*
1444 		 * This will also modify vchain as a side effect,
1445 		 * mark vchain as modified now.
1446 		 */
1447 		hammer2_voldata_modify(hmp);
1448 		chain = &hmp->fchain;
1449 		flush_error |= hammer2_flush(chain, HAMMER2_FLUSH_TOP);
1450 		KKASSERT(chain == &hmp->fchain);
1451 	}
1452 	hammer2_chain_unlock(&hmp->fchain);
1453 	hammer2_chain_unlock(&hmp->vchain);
1454 	hammer2_chain_drop(&hmp->fchain);
1455 	/* vchain dropped down below */
1456 
1457 	hammer2_chain_lock(&hmp->vchain, HAMMER2_RESOLVE_ALWAYS);
1458 	if (hmp->vchain.flags & HAMMER2_CHAIN_FLUSH_MASK) {
1459 		chain = &hmp->vchain;
1460 		flush_error |= hammer2_flush(chain, HAMMER2_FLUSH_TOP);
1461 		KKASSERT(chain == &hmp->vchain);
1462 	}
1463 	hammer2_chain_unlock(&hmp->vchain);
1464 	hammer2_chain_drop(&hmp->vchain);
1465 
1466 	/*
1467 	 * We can't safely flush the volume header until we have
1468 	 * flushed any device buffers which have built up.
1469 	 *
1470 	 * XXX this isn't being incremental
1471 	 */
1472 	TAILQ_FOREACH(e, &hmp->devvpl, entry) {
1473 		devvp = e->devvp;
1474 		KKASSERT(devvp);
1475 		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
1476 		fsync_error = VOP_FSYNC(devvp, MNT_WAIT, 0);
1477 		vn_unlock(devvp);
1478 		if (fsync_error || flush_error) {
1479 			kprintf("hammer2: sync error fsync=%d h2flush=0x%04x dev=%s\n",
1480 				fsync_error, flush_error, e->path);
1481 		}
1482 	}
1483 
1484 	/*
1485 	 * The flush code sets CHAIN_VOLUMESYNC to indicate that the
1486 	 * volume header needs synchronization via hmp->volsync.
1487 	 *
1488 	 * XXX synchronize the flag & data with only this flush XXX
1489 	 */
1490 	if (fsync_error == 0 && flush_error == 0 &&
1491 	    (hmp->vchain.flags & HAMMER2_CHAIN_VOLUMESYNC)) {
1492 		struct buf *bp;
1493 		int vol_error = 0;
1494 
1495 		/*
1496 		 * Synchronize the disk before flushing the volume
1497 		 * header.
1498 		 */
1499 		/*
1500 		bp = getpbuf(NULL);
1501 		bp->b_bio1.bio_offset = 0;
1502 		bp->b_bufsize = 0;
1503 		bp->b_bcount = 0;
1504 		bp->b_cmd = BUF_CMD_FLUSH;
1505 		bp->b_bio1.bio_done = biodone_sync;
1506 		bp->b_bio1.bio_flags |= BIO_SYNC;
1507 		vn_strategy(hmp->devvp, &bp->b_bio1);
1508 		fsync_error = biowait(&bp->b_bio1, "h2vol");
1509 		relpbuf(bp, NULL);
1510 		*/
1511 
1512 		/*
1513 		 * Then we can safely flush the version of the
1514 		 * volume header synchronized by the flush code.
1515 		 */
1516 		j = hmp->volhdrno + 1;
1517 		if (j < 0)
1518 			j = 0;
1519 		if (j >= HAMMER2_NUM_VOLHDRS)
1520 			j = 0;
1521 		if (j * HAMMER2_ZONE_BYTES64 + HAMMER2_SEGSIZE >
1522 		    hmp->volsync.volu_size) {
1523 			j = 0;
1524 		}
1525 		if (hammer2_debug & 0x8000) {
1526 			/* debug only, avoid syslogd loop */
1527 			kprintf("sync volhdr %d %jd\n",
1528 				j, (intmax_t)hmp->volsync.volu_size);
1529 		}
1530 		bp = getblkx(hmp->devvp, j * HAMMER2_ZONE_BYTES64,
1531 			    HAMMER2_PBUFSIZE, GETBLK_KVABIO, 0);
1532 		atomic_clear_int(&hmp->vchain.flags,
1533 				 HAMMER2_CHAIN_VOLUMESYNC);
1534 		bkvasync(bp);
1535 		bcopy(&hmp->volsync, bp->b_data, HAMMER2_PBUFSIZE);
1536 		vol_error = bwrite(bp);
1537 		hmp->volhdrno = j;
1538 		if (vol_error)
1539 			fsync_error = vol_error;
1540 	}
1541 	if (flush_error)
1542 		total_error = flush_error;
1543 	if (fsync_error)
1544 		total_error = hammer2_errno_to_error(fsync_error);
1545 
1546 	/* spmp trans */
1547 	hammer2_trans_done(hmp->spmp, HAMMER2_TRANS_ISFLUSH);
1548 skip:
1549 	hammer2_xop_feed(&xop->head, NULL, clindex, total_error);
1550 }
1551