1 /*
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 2022 Tomohiro Kusumi <tkusumi@netbsd.org>
5  * Copyright (c) 2011-2022 The DragonFly Project.  All rights reserved.
6  *
7  * This code is derived from software contributed to The DragonFly Project
8  * by Matthew Dillon <dillon@dragonflybsd.org>
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  *
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in
18  *    the documentation and/or other materials provided with the
19  *    distribution.
20  * 3. Neither the name of The DragonFly Project nor the names of its
21  *    contributors may be used to endorse or promote products derived
22  *    from this software without specific, prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
25  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
26  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
27  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
28  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
29  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
30  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
31  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
32  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
33  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
34  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35  * SUCH DAMAGE.
36  */
37 /*
38  *			TRANSACTION AND FLUSH HANDLING
39  *
40  * Deceptively simple but actually fairly difficult to implement properly is
41  * how I would describe it.
42  *
43  * Flushing generally occurs bottom-up but requires a top-down scan to
 * locate chains with MODIFIED and/or UPDATE bits set.  The ONFLUSH flag
 * tells the scan where to recurse downward to find these chains.
46  */
47 
48 /*
49 #include <sys/cdefs.h>
50 #include <sys/param.h>
51 #include <sys/systm.h>
52 #include <sys/types.h>
53 #include <sys/lock.h>
54 #include <sys/vnode.h>
55 #include <sys/buf.h>
56 */
57 
58 #include "hammer2.h"
59 
60 #define HAMMER2_FLUSH_DEPTH_LIMIT	60      /* stack recursion limit */
61 
62 
63 /*
64  * Recursively flush the specified chain.  The chain is locked and
65  * referenced by the caller and will remain so on return.  The chain
66  * will remain referenced throughout but can temporarily lose its
67  * lock during the recursion to avoid unnecessarily stalling user
68  * processes.
69  */
70 struct hammer2_flush_info {
71 	hammer2_chain_t *parent;
72 	int		depth;
73 	int		error;			/* cumulative error */
74 	int		flags;
75 #ifdef HAMMER2_SCAN_DEBUG
76 	long		scan_count;
77 	long		scan_mod_count;
78 	long		scan_upd_count;
79 	long		scan_onf_count;
80 	long		scan_del_count;
81 	long		scan_btype[7];
82 #endif
83 };
84 
85 typedef struct hammer2_flush_info hammer2_flush_info_t;
86 
87 static int hammer2_flush_core(hammer2_flush_info_t *info,
88 				hammer2_chain_t *chain, int flags);
89 static int hammer2_flush_recurse(hammer2_chain_t *child, void *data);
90 
91 /*
92  * Any per-pfs transaction initialization goes here.
93  */
94 void
95 hammer2_trans_manage_init(hammer2_pfs_t *pmp)
96 {
97 }
98 
99 /*
100  * Transaction support for any modifying operation.  Transactions are used
101  * in the pmp layer by the frontend and in the spmp layer by the backend.
102  *
103  * 0			- Normal transaction.  Interlocks against just the
104  *			  COPYQ portion of an ISFLUSH transaction.
105  *
106  * TRANS_ISFLUSH	- Flush transaction.  Interlocks against other flush
107  *			  transactions.
108  *
109  *			  When COPYQ is also specified, waits for the count
110  *			  to drop to 1.
111  *
112  * TRANS_BUFCACHE	- Buffer cache transaction.  No interlock.
113  *
114  * TRANS_SIDEQ		- Run the sideq (only tested in trans_done())
115  *
 * Initializing a new transaction allocates a transaction ID.  This
 * function is passed a pmp, indicating a cluster transaction.  (It could
 * historically be passed a NULL pmp and a non-NULL hmp instead, indicating
 * a transaction on a single media target; that mode was used by the
 * recovery code.)
120  */
121 void
122 hammer2_trans_init(hammer2_pfs_t *pmp, uint32_t flags)
123 {
124 	uint32_t oflags;
125 	uint32_t nflags;
126 	int dowait;
127 
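	/*
	 * The low bits of pmp->trans.flags (HAMMER2_TRANS_MASK) hold the
	 * count of active transactions; the remaining bits hold state
	 * flags such as ISFLUSH and WAITING.  Starting a transaction is
	 * thus expressed below as (oflags | flags) + 1: one atomic
	 * compare-and-set merges our flag bits and bumps the count.
	 */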
128 	for (;;) {
129 		oflags = pmp->trans.flags;
130 		cpu_ccfence();
131 		dowait = 0;
132 
133 		if (flags & HAMMER2_TRANS_ISFLUSH) {
134 			/*
135 			 * Interlock against other flush transactions.
136 			 */
137 			if (oflags & HAMMER2_TRANS_ISFLUSH) {
138 				nflags = oflags | HAMMER2_TRANS_WAITING;
139 				dowait = 1;
140 			} else {
141 				nflags = (oflags | flags) + 1;
142 			}
143 		} else if (flags & HAMMER2_TRANS_BUFCACHE) {
144 			/*
145 			 * Requesting strategy transaction from buffer-cache,
146 			 * or a VM getpages/putpages through the buffer cache.
147 			 * We must allow such transactions in all situations
148 			 * to avoid deadlocks.
149 			 */
150 			nflags = (oflags | flags) + 1;
151 		} else {
152 			/*
153 			 * Normal transaction.  We do not interlock against
154 			 * BUFCACHE or ISFLUSH.
155 			 *
156 			 * Note that vnode locks may be held going into
157 			 * this call.
158 			 *
159 			 * NOTE: Remember that non-modifying operations
160 			 *	 such as read, stat, readdir, etc, do
161 			 *	 not use transactions.
162 			 */
163 			nflags = (oflags | flags) + 1;
164 		}
165 		if (dowait)
166 			tsleep_interlock(&pmp->trans.sync_wait, 0);
167 		if (atomic_cmpset_int(&pmp->trans.flags, oflags, nflags)) {
168 			if (dowait == 0)
169 				break;
170 			tsleep(&pmp->trans.sync_wait, PINTERLOCKED,
171 			       "h2trans", hz);
172 			/* retry */
173 		} else {
174 			cpu_pause();
175 			/* retry */
176 		}
177 		/* retry */
178 	}
179 
180 #if 0
181 	/*
182 	 * When entering a FLUSH transaction with COPYQ set, wait for the
183 	 * transaction count to drop to 1 (our flush transaction only)
184 	 * before proceeding.
185 	 *
186 	 * This waits for all non-flush transactions to complete and blocks
187 	 * new non-flush transactions from starting until COPYQ is cleared.
188 	 * (the flush will then proceed after clearing COPYQ).  This should
189 	 * be a very short stall on modifying operations.
190 	 */
191 	while ((flags & HAMMER2_TRANS_ISFLUSH) &&
192 	       (flags & HAMMER2_TRANS_COPYQ)) {
193 		oflags = pmp->trans.flags;
194 		cpu_ccfence();
195 		if ((oflags & HAMMER2_TRANS_MASK) == 1)
196 			break;
197 		nflags = oflags | HAMMER2_TRANS_WAITING;
198 		tsleep_interlock(&pmp->trans.sync_wait, 0);
199 		if (atomic_cmpset_int(&pmp->trans.flags, oflags, nflags)) {
200 			tsleep(&pmp->trans.sync_wait, PINTERLOCKED,
201 			       "h2trans2", hz);
202 		}
203 	}
204 #endif
205 }
206 
207 /*
 * Start a sub-transaction; there is no 'subdone' function.  This will
209  * issue a new modify_tid (mtid) for the current transaction, which is a
210  * CLC (cluster level change) id and not a per-node id.
211  *
212  * This function must be called for each XOP when multiple XOPs are run in
213  * sequence within a transaction.
214  *
215  * Callers typically update the inode with the transaction mtid manually
216  * to enforce sequencing.
217  */
218 hammer2_tid_t
219 hammer2_trans_sub(hammer2_pfs_t *pmp)
220 {
221 	hammer2_tid_t mtid;
222 
223 	mtid = atomic_fetchadd_64(&pmp->modify_tid, 1);
224 
225 	return (mtid);
226 }
227 
228 void
229 hammer2_trans_setflags(hammer2_pfs_t *pmp, uint32_t flags)
230 {
231 	atomic_set_int(&pmp->trans.flags, flags);
232 }
233 
234 /*
235  * Typically used to clear trans flags asynchronously.  If TRANS_WAITING
236  * is in the mask, and was previously set, this function will wake up
237  * any waiters.
238  */
239 void
240 hammer2_trans_clearflags(hammer2_pfs_t *pmp, uint32_t flags)
241 {
242 	uint32_t oflags;
243 	uint32_t nflags;
244 
245 	for (;;) {
246 		oflags = pmp->trans.flags;
247 		cpu_ccfence();
248 		nflags = oflags & ~flags;
249 		if (atomic_cmpset_int(&pmp->trans.flags, oflags, nflags)) {
250 			if ((oflags ^ nflags) & HAMMER2_TRANS_WAITING)
251 				wakeup(&pmp->trans.sync_wait);
252 			break;
253 		}
254 		cpu_pause();
255 		/* retry */
256 	}
257 }
258 
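/*
 * Finish a transaction previously started with hammer2_trans_init(),
 * decrementing the active transaction count and clearing any flags
 * passed in.  Wakes up waiters when a flush transaction finishes, or
 * when the count transitions 2->1 while a flush transaction is pending
 * (the pending flush is then the only remaining transaction).
 */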
259 void
260 hammer2_trans_done(hammer2_pfs_t *pmp, uint32_t flags)
261 {
262 	uint32_t oflags;
263 	uint32_t nflags;
264 
265 #if 0
266 	/*
267 	 * Modifying ops on the front-end can cause dirty inodes to
268 	 * build up in the sideq.  We don't flush these on inactive/reclaim
269 	 * due to potential deadlocks, so we have to deal with them from
270 	 * inside other nominal modifying front-end transactions.
271 	 */
272 	if ((flags & HAMMER2_TRANS_SIDEQ) &&
273 	    pmp->sideq_count > hammer2_limit_dirty_inodes / 2 &&
274 	    pmp->sideq_count > (pmp->inum_count >> 3) &&
275 	    pmp->mp) {
276 		speedup_syncer(pmp->mp);
277 	}
278 #endif
279 
280 	/*
	 * Clean up the transaction.  Wake up any waiters when finishing
	 * a flush transaction or transitioning the non-flush transaction
283 	 * count from 2->1 while a flush transaction is pending.
284 	 */
285 	for (;;) {
286 		oflags = pmp->trans.flags;
287 		cpu_ccfence();
288 		KKASSERT(oflags & HAMMER2_TRANS_MASK);
289 
290 		nflags = (oflags - 1) & ~flags;
291 		if (flags & HAMMER2_TRANS_ISFLUSH) {
292 			nflags &= ~HAMMER2_TRANS_WAITING;
293 		}
294 		if ((oflags & (HAMMER2_TRANS_ISFLUSH|HAMMER2_TRANS_MASK)) ==
295 		    (HAMMER2_TRANS_ISFLUSH|2)) {
296 			nflags &= ~HAMMER2_TRANS_WAITING;
297 		}
298 		if (atomic_cmpset_int(&pmp->trans.flags, oflags, nflags)) {
299 			if ((oflags ^ nflags) & HAMMER2_TRANS_WAITING)
300 				wakeup(&pmp->trans.sync_wait);
301 			break;
302 		}
303 		cpu_pause();
304 		/* retry */
305 	}
306 }
307 
308 /*
 * Obtain a new, unique inode number (not serialized by the caller).
310  */
311 hammer2_tid_t
312 hammer2_trans_newinum(hammer2_pfs_t *pmp)
313 {
314 	hammer2_tid_t tid;
315 
316 	tid = atomic_fetchadd_64(&pmp->inode_tid, 1);
317 
318 	return tid;
319 }
320 
321 /*
 * Assert that a strategy call is ok here.  Currently we allow strategy
 * calls in all situations, including during flushes.  (Older versions
 * only allowed them from within a normal transaction.)
326 void
327 hammer2_trans_assert_strategy(hammer2_pfs_t *pmp)
328 {
329 #if 0
330 	KKASSERT((pmp->trans.flags & HAMMER2_TRANS_ISFLUSH) == 0);
331 #endif
332 }
333 
334 /*
335  * Flush the chain and all modified sub-chains through the specified
336  * synchronization point, propagating blockref updates back up.  As
337  * part of this propagation, mirror_tid and inode/data usage statistics
 * propagate back upward.
339  *
340  * Returns a HAMMER2 error code, 0 if no error.  Note that I/O errors from
341  * buffers dirtied during the flush operation can occur later.
342  *
343  * modify_tid (clc - cluster level change) is not propagated.
344  *
345  * update_tid (clc) is used for validation and is not propagated by this
346  * function.
347  *
348  * This routine can be called from several places but the most important
349  * is from VFS_SYNC (frontend) via hammer2_xop_inode_flush (backend).
350  *
351  * chain is locked on call and will remain locked on return.  The chain's
352  * UPDATE flag indicates that its parent's block table (which is not yet
353  * part of the flush) should be updated.
354  *
355  * flags:
356  *	HAMMER2_FLUSH_TOP	Indicates that this is the top of the flush.
357  *				Is cleared for the recursion.
358  *
359  *	HAMMER2_FLUSH_ALL	Recurse everything
360  *
361  *	HAMMER2_FLUSH_INODE_STOP
362  *				Stop at PFS inode or normal inode boundary
363  */
364 int
365 hammer2_flush(hammer2_chain_t *chain, int flags)
366 {
367 	hammer2_flush_info_t info;
368 	int loops;
369 
370 	/*
371 	 * Execute the recursive flush and handle deferrals.
372 	 *
373 	 * Chains can be ridiculously long (thousands deep), so to
374 	 * avoid blowing out the kernel stack the recursive flush has a
375 	 * depth limit.  Elements at the limit are placed on a list
376 	 * for re-execution after the stack has been popped.
377 	 */
378 	bzero(&info, sizeof(info));
379 	info.flags = flags & ~HAMMER2_FLUSH_TOP;
380 
381 	/*
382 	 * Calculate parent (can be NULL), if not NULL the flush core
383 	 * expects the parent to be referenced so it can easily lock/unlock
384 	 * it without it getting ripped up.
385 	 */
386 	if ((info.parent = chain->parent) != NULL)
387 		hammer2_chain_ref(info.parent);
388 
389 	/*
390 	 * Extra ref needed because flush_core expects it when replacing
391 	 * chain.
392 	 */
393 	hammer2_chain_ref(chain);
394 	loops = 0;
395 
396 	for (;;) {
397 		/*
398 		 * [re]flush chain as the deep recursion may have generated
399 		 * additional modifications.
400 		 */
401 		if (info.parent != chain->parent) {
402 			if (hammer2_debug & 0x0040) {
403 				kprintf("LOST CHILD4 %p->%p "
404 					"(actual parent %p)\n",
405 					info.parent, chain, chain->parent);
406 			}
407 			hammer2_chain_drop(info.parent);
408 			info.parent = chain->parent;
409 			hammer2_chain_ref(info.parent);
410 		}
411 		if (hammer2_flush_core(&info, chain, flags) == 0)
412 			break;
413 
414 		if (++loops % 1000 == 0) {
415 			kprintf("hammer2_flush: excessive loops on %p\n",
416 				chain);
417 			if (hammer2_debug & 0x100000)
418 				Debugger("hell4");
419 		}
420 	}
421 #ifdef HAMMER2_SCAN_DEBUG
	if (info.scan_count >= 10)
		kprintf("hammer2_flush: scan_count %ld (%ld,%ld,%ld,%ld) "
			"bt(%ld,%ld,%ld,%ld,%ld,%ld)\n",
			info.scan_count,
			info.scan_mod_count,
			info.scan_upd_count,
			info.scan_onf_count,
			info.scan_del_count,
			info.scan_btype[1],
			info.scan_btype[2],
			info.scan_btype[3],
			info.scan_btype[4],
			info.scan_btype[5],
			info.scan_btype[6]);
436 #endif
437 	hammer2_chain_drop(chain);
438 	if (info.parent)
439 		hammer2_chain_drop(info.parent);
440 	return (info.error);
441 }
442 
443 /*
 * This is the core of the chain flushing code.  The chain is locked and
 * carries an extra ref from the caller, and remains locked with that
 * extra ref on return.  info.parent is referenced but not locked.
448  *
449  * Upon return, the caller can test the UPDATE bit on the chain to determine
450  * if the parent needs updating.
451  *
452  * If non-zero is returned, the chain's parent changed during the flush and
453  * the caller must retry the operation.
454  *
455  * (1) Determine if this node is a candidate for the flush, return if it is
456  *     not.  fchain and vchain are always candidates for the flush.
457  *
 * (2) If we recurse too deep we hit HAMMER2_FLUSH_DEPTH_LIMIT.  Chains
 *     at the limit were historically entered onto a deferral list; the
 *     current code panics instead.
461  *
462  * (3) Recursively flush live children (rbtree).  This can create deferrals.
463  *     A successful flush clears the MODIFIED and UPDATE bits on the children
464  *     and typically causes the parent to be marked MODIFIED as the children
465  *     update the parent's block table.  A parent might already be marked
 *     MODIFIED due to a deletion (whose blocktable update in the parent is
467  *     handled by the frontend), or if the parent itself is modified by the
468  *     frontend for other reasons.
469  *
470  * (4) Permanently disconnected sub-trees are cleaned up by the front-end.
471  *     Deleted-but-open inodes can still be individually flushed via the
472  *     filesystem syncer.
473  *
474  * (5) Delete parents on the way back up if they are normal indirect blocks
475  *     and have no children.
476  *
477  * (6) Note that an unmodified child may still need the block table in its
478  *     parent updated (e.g. rename/move).  The child will have UPDATE set
479  *     in this case.
480  *
481  *			WARNING ON BREF MODIFY_TID/MIRROR_TID
482  *
483  * blockref.modify_tid is consistent only within a PFS, and will not be
484  * consistent during synchronization.  mirror_tid is consistent across the
485  * block device regardless of the PFS.
486  */
487 static int
488 hammer2_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain,
489 		   int flags)
490 {
491 	hammer2_chain_t *parent;
492 	hammer2_dev_t *hmp;
493 	int save_error;
494 	int retry;
495 
496 	retry = 0;
497 
498 	/*
499 	 * (1) Optimize downward recursion to locate nodes needing action.
500 	 *     Nothing to do if none of these flags are set.
501 	 */
502 	if ((chain->flags & HAMMER2_CHAIN_FLUSH_MASK) == 0)
503 		return 0;
504 
505 	hmp = chain->hmp;
506 
507 	/*
508 	 * NOTE: parent can be NULL, usually due to destroy races.
509 	 */
510 	parent = info->parent;
511 	KKASSERT(chain->parent == parent);
512 
513 	/*
514 	 * Downward search recursion
515 	 *
516 	 * We must be careful on cold stops, which often occur on inode
517 	 * boundaries due to the way hammer2_vfs_sync() sequences the flush.
	 * Be sure to issue an appropriate chain_setflush().
519 	 */
520 	if ((chain->flags & HAMMER2_CHAIN_PFSBOUNDARY) &&
521 	    (flags & HAMMER2_FLUSH_ALL) == 0 &&
522 	    (flags & HAMMER2_FLUSH_TOP) == 0 &&
523 	    chain->pmp && chain->pmp->mp) {
524 		/*
525 		 * If FLUSH_ALL is not specified the caller does not want
526 		 * to recurse through PFS roots that have been mounted.
527 		 *
528 		 * (If the PFS has not been mounted there may not be
		 *  anything monitoring its chains and it's up to us
530 		 *  to flush it).
531 		 *
532 		 * The typical sequence is to flush dirty PFS's starting at
533 		 * their root downward, then flush the device root (vchain).
534 		 * It is this second flush that typically leaves out the
535 		 * ALL flag.
536 		 *
537 		 * However we must still process the PFSROOT chains for block
538 		 * table updates in their parent (which IS part of our flush).
539 		 *
540 		 * NOTE: The volume root, vchain, does not set PFSBOUNDARY.
541 		 *
542 		 * NOTE: We must re-set ONFLUSH in the parent to retain if
543 		 *	 this chain (that we are skipping) requires work.
544 		 */
545 		if (chain->flags & (HAMMER2_CHAIN_ONFLUSH |
546 				    HAMMER2_CHAIN_DESTROY |
547 				    HAMMER2_CHAIN_MODIFIED)) {
548 			hammer2_chain_setflush(parent);
549 		}
550 		goto done;
551 	} else if (chain->bref.type == HAMMER2_BREF_TYPE_INODE &&
552 		   (flags & HAMMER2_FLUSH_INODE_STOP) &&
553 		   (flags & HAMMER2_FLUSH_ALL) == 0 &&
554 		   (flags & HAMMER2_FLUSH_TOP) == 0 &&
555 		   chain->pmp && chain->pmp->mp) {
556 		/*
557 		 * When FLUSH_INODE_STOP is specified we are being asked not
558 		 * to include any inode changes for inodes we encounter,
		 * with the exception of the inode that the flush began with.
		 * That is: an INODE chain, INODE_STOP set, and TOP clear.
561 		 *
562 		 * Dirty inodes are flushed based on the hammer2_inode
563 		 * in-memory structure, issuing a chain_setflush() here
564 		 * will only cause unnecessary traversals of the topology.
565 		 */
566 		goto done;
567 #if 0
568 		/*
569 		 * If FLUSH_INODE_STOP is specified and both ALL and TOP
570 		 * are clear, we must not flush the chain.  The chain should
571 		 * have already been flushed and any further ONFLUSH/UPDATE
572 		 * setting will be related to the next flush.
573 		 *
		 * This feature allows us to flush inodes independently of
575 		 * each other and meta-data above the inodes separately.
576 		 */
577 		if (chain->flags & (HAMMER2_CHAIN_ONFLUSH |
578 				    HAMMER2_CHAIN_DESTROY |
579 				    HAMMER2_CHAIN_MODIFIED)) {
580 			if (parent)
581 				hammer2_chain_setflush(parent);
582 		}
583 #endif
584 	} else if (info->depth == HAMMER2_FLUSH_DEPTH_LIMIT) {
585 		/*
586 		 * Recursion depth reached.
587 		 */
588 		panic("hammer2: flush depth limit");
589 	} else if (chain->flags & (HAMMER2_CHAIN_ONFLUSH |
590 				   HAMMER2_CHAIN_DESTROY)) {
591 		/*
592 		 * Downward recursion search (actual flush occurs bottom-up).
		 * Pre-clear ONFLUSH.  It can get set again due to races or
594 		 * flush errors, which we want so the scan finds us again in
595 		 * the next flush.
596 		 *
597 		 * We must also recurse if DESTROY is set so we can finally
598 		 * get rid of the related children, otherwise the node will
599 		 * just get re-flushed on lastdrop.
600 		 *
601 		 * WARNING!  The recursion will unlock/relock info->parent
602 		 *	     (which is 'chain'), potentially allowing it
603 		 *	     to be ripped up.
604 		 */
605 		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONFLUSH);
606 		save_error = info->error;
607 		info->error = 0;
608 		info->parent = chain;
609 
610 		/*
611 		 * We may have to do this twice to catch any indirect
612 		 * block maintenance that occurs.
613 		 */
614 		hammer2_spin_ex(&chain->core.spin);
615 		RB_SCAN(hammer2_chain_tree, &chain->core.rbtree,
616 			NULL, hammer2_flush_recurse, info);
617 		if (chain->flags & HAMMER2_CHAIN_ONFLUSH) {
618 			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONFLUSH);
619 			RB_SCAN(hammer2_chain_tree, &chain->core.rbtree,
620 				NULL, hammer2_flush_recurse, info);
621 		}
622 		hammer2_spin_unex(&chain->core.spin);
623 		info->parent = parent;
624 
625 		/*
626 		 * Re-set the flush bits if the flush was incomplete or
627 		 * an error occurred.  If an error occurs it is typically
628 		 * an allocation error.  Errors do not cause deferrals.
629 		 */
630 		if (info->error)
631 			hammer2_chain_setflush(chain);
632 		info->error |= save_error;
633 
634 		/*
635 		 * If we lost the parent->chain association we have to
636 		 * stop processing this chain because it is no longer
637 		 * in this recursion.  If it moved, it will be handled
638 		 * by the ONFLUSH flag elsewhere.
639 		 */
640 		if (chain->parent != parent) {
641 			kprintf("LOST CHILD2 %p->%p (actual parent %p)\n",
642 				parent, chain, chain->parent);
643 			goto done;
644 		}
645 	}
646 
647 	/*
648 	 * Now we are in the bottom-up part of the recursion.
649 	 *
650 	 * We continue to try to update the chain on lower-level errors, but
651 	 * the flush code may decide not to flush the volume root.
652 	 *
653 	 * XXX should we continue to try to update the chain if an error
654 	 *     occurred?
655 	 */
656 
657 	/*
	 * Both parent and chain must be locked in order to flush the
	 * chain and properly update the parent under certain conditions.
660 	 *
661 	 * In addition, we can't safely unlock/relock the chain once we
662 	 * start flushing the chain itself, which we would have to do later
663 	 * on in order to lock the parent if we didn't do that now.
664 	 */
665 	hammer2_chain_ref_hold(chain);
666 	hammer2_chain_unlock(chain);
667 	if (parent)
668 		hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
669 	hammer2_chain_lock(chain, HAMMER2_RESOLVE_MAYBE);
670 	hammer2_chain_drop_unhold(chain);
671 
672 	/*
	 * Can't process if we can't access the chain's or the parent's
	 * content.
674 	 */
675 	if ((parent && parent->error) || chain->error) {
676 		kprintf("hammer2: chain error during flush\n");
677 		info->error |= chain->error;
678 		if (parent) {
679 			info->error |= parent->error;
680 			hammer2_chain_unlock(parent);
681 		}
682 		goto done;
683 	}
684 
685 	if (chain->parent != parent) {
686 		if (hammer2_debug & 0x0040) {
687 			kprintf("LOST CHILD3 %p->%p (actual parent %p)\n",
688 				parent, chain, chain->parent);
689 		}
690 		KKASSERT(parent != NULL);
691 		hammer2_chain_unlock(parent);
692 		retry = 1;
693 		goto done;
694 	}
695 
696 	/*
697 	 * Propagate the DESTROY flag downwards.  This dummies up the flush
698 	 * code and tries to invalidate related buffer cache buffers to
699 	 * avoid the disk write.
700 	 */
701 	if (parent && (parent->flags & HAMMER2_CHAIN_DESTROY))
702 		atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY);
703 
704 	/*
705 	 * Dispose of the modified bit.
706 	 *
	 * If parent is present, the UPDATE bit should already be set,
	 * and bref.mirror_tid should already be set.
710 	 */
711 	if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
712 		KKASSERT((chain->flags & HAMMER2_CHAIN_UPDATE) ||
713 			 chain->parent == NULL);
714 		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
715 		atomic_add_long(&hammer2_count_modified_chains, -1);
716 
717 		/*
718 		 * Manage threads waiting for excessive dirty memory to
719 		 * be retired.
720 		 */
721 		if (chain->pmp)
722 			hammer2_pfs_memory_wakeup(chain->pmp, -1);
723 
724 #if 0
725 		if ((chain->flags & HAMMER2_CHAIN_UPDATE) == 0 &&
726 		    chain != &hmp->vchain &&
727 		    chain != &hmp->fchain) {
728 			/*
729 			 * Set UPDATE bit indicating that the parent block
730 			 * table requires updating.
731 			 */
732 			atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
733 		}
734 #endif
735 
736 		/*
737 		 * Issue the flush.  This is indirect via the DIO.
738 		 *
739 		 * NOTE: A DELETED node that reaches this point must be
740 		 *	 flushed for synchronization point consistency.
741 		 *
742 		 * NOTE: Even though MODIFIED was already set, the related DIO
743 		 *	 might not be dirty due to a system buffer cache
744 		 *	 flush and must be set dirty if we are going to make
745 		 *	 further modifications to the buffer.  Chains with
746 		 *	 embedded data don't need this.
747 		 */
748 		if (hammer2_debug & 0x1000) {
749 			kprintf("Flush %p.%d %016jx/%d data=%016jx\n",
750 				chain, chain->bref.type,
751 				(uintmax_t)chain->bref.key,
752 				chain->bref.keybits,
753 				(uintmax_t)chain->bref.data_off);
754 		}
755 
756 		/*
757 		 * Update chain CRCs for flush.
758 		 *
759 		 * NOTE: Volume headers are NOT flushed here as they require
760 		 *	 special processing.
761 		 */
762 		switch(chain->bref.type) {
763 		case HAMMER2_BREF_TYPE_FREEMAP:
764 			/*
765 			 * Update the volume header's freemap_tid to the
766 			 * freemap's flushing mirror_tid.
767 			 *
768 			 * (note: embedded data, do not call setdirty)
769 			 */
770 			KKASSERT(hmp->vchain.flags & HAMMER2_CHAIN_MODIFIED);
771 			KKASSERT(chain == &hmp->fchain);
772 			hmp->voldata.freemap_tid = chain->bref.mirror_tid;
773 			if (hammer2_debug & 0x8000) {
774 				/* debug only, avoid syslogd loop */
775 				kprintf("sync freemap mirror_tid %08jx\n",
776 					(intmax_t)chain->bref.mirror_tid);
777 			}
778 
779 			/*
780 			 * The freemap can be flushed independently of the
781 			 * main topology, but for the case where it is
782 			 * flushed in the same transaction, and flushed
783 			 * before vchain (a case we want to allow for
784 			 * performance reasons), make sure modifications
785 			 * made during the flush under vchain use a new
786 			 * transaction id.
787 			 *
788 			 * Otherwise the mount recovery code will get confused.
789 			 */
790 			++hmp->voldata.mirror_tid;
791 			break;
792 		case HAMMER2_BREF_TYPE_VOLUME:
793 			/*
794 			 * The free block table is flushed by
795 			 * hammer2_vfs_sync() before it flushes vchain.
796 			 * We must still hold fchain locked while copying
797 			 * voldata to volsync, however.
798 			 *
			 * These do not error per se since their data does
800 			 * not need to be re-read from media on lock.
801 			 *
802 			 * (note: embedded data, do not call setdirty)
803 			 */
804 			hammer2_chain_lock(&hmp->fchain,
805 					   HAMMER2_RESOLVE_ALWAYS);
806 			hammer2_voldata_lock(hmp);
807 			if (hammer2_debug & 0x8000) {
808 				/* debug only, avoid syslogd loop */
809 				kprintf("sync volume  mirror_tid %08jx\n",
810 					(intmax_t)chain->bref.mirror_tid);
811 			}
812 
813 			/*
814 			 * Update the volume header's mirror_tid to the
815 			 * main topology's flushing mirror_tid.  It is
816 			 * possible that voldata.mirror_tid is already
817 			 * beyond bref.mirror_tid due to the bump we made
818 			 * above in BREF_TYPE_FREEMAP.
819 			 */
820 			if (hmp->voldata.mirror_tid < chain->bref.mirror_tid) {
821 				hmp->voldata.mirror_tid =
822 					chain->bref.mirror_tid;
823 			}
824 
825 			/*
826 			 * The volume header is flushed manually by the
			 * syncer, not here.  All we do here is adjust the
			 * CRCs.
829 			 */
830 			KKASSERT(chain->data != NULL);
831 			KKASSERT(chain->dio == NULL);
832 
833 			hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT1]=
834 				hammer2_icrc32(
835 					(char *)&hmp->voldata +
836 					 HAMMER2_VOLUME_ICRC1_OFF,
837 					HAMMER2_VOLUME_ICRC1_SIZE);
838 			hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT0]=
839 				hammer2_icrc32(
840 					(char *)&hmp->voldata +
841 					 HAMMER2_VOLUME_ICRC0_OFF,
842 					HAMMER2_VOLUME_ICRC0_SIZE);
843 			hmp->voldata.icrc_volheader =
844 				hammer2_icrc32(
845 					(char *)&hmp->voldata +
846 					 HAMMER2_VOLUME_ICRCVH_OFF,
847 					HAMMER2_VOLUME_ICRCVH_SIZE);
848 
849 			if (hammer2_debug & 0x8000) {
850 				/* debug only, avoid syslogd loop */
851 				kprintf("syncvolhdr %016jx %016jx\n",
852 					hmp->voldata.mirror_tid,
853 					hmp->vchain.bref.mirror_tid);
854 			}
855 			hmp->volsync = hmp->voldata;
856 			atomic_set_int(&chain->flags, HAMMER2_CHAIN_VOLUMESYNC);
857 			hammer2_voldata_unlock(hmp);
858 			hammer2_chain_unlock(&hmp->fchain);
859 			break;
860 		case HAMMER2_BREF_TYPE_DATA:
861 			/*
862 			 * Data elements have already been flushed via the
863 			 * logical file buffer cache.  Their hash was set in
864 			 * the bref by the vop_write code.  Do not re-dirty.
865 			 *
866 			 * Make sure any device buffer(s) have been flushed
867 			 * out here (there aren't usually any to flush) XXX.
868 			 */
869 			break;
870 		case HAMMER2_BREF_TYPE_INDIRECT:
871 		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
872 		case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
873 			/*
874 			 * Buffer I/O will be cleaned up when the volume is
875 			 * flushed (but the kernel is free to flush it before
876 			 * then, as well).
877 			 */
878 			hammer2_chain_setcheck(chain, chain->data);
879 			break;
880 		case HAMMER2_BREF_TYPE_DIRENT:
881 			/*
882 			 * A directory entry can use the check area to store
883 			 * the filename for filenames <= 64 bytes, don't blow
884 			 * it up!
885 			 */
886 			if (chain->bytes)
887 				hammer2_chain_setcheck(chain, chain->data);
888 			break;
889 		case HAMMER2_BREF_TYPE_INODE:
890 			/*
891 			 * NOTE: We must call io_setdirty() to make any late
892 			 *	 changes to the inode data, the system might
893 			 *	 have already flushed the buffer.
894 			 */
895 			if (chain->data->ipdata.meta.op_flags &
896 			    HAMMER2_OPFLAG_PFSROOT) {
897 				/*
				 * pmp is non-NULL if mounted as a PFS.  Do
				 * we need to sync fields cached in the
				 * pmp? XXX
900 				 */
901 				hammer2_inode_data_t *ipdata;
902 
903 				hammer2_io_setdirty(chain->dio);
904 				ipdata = &chain->data->ipdata;
905 				if (chain->pmp) {
906 					ipdata->meta.pfs_inum =
907 						chain->pmp->inode_tid;
908 				}
909 			} else {
910 				/* can't be mounted as a PFS */
911 			}
912 
913 			hammer2_chain_setcheck(chain, chain->data);
914 			break;
915 		default:
916 			panic("hammer2_flush_core: unsupported "
917 			      "embedded bref %d",
918 			      chain->bref.type);
919 			/* NOT REACHED */
920 		}
921 
922 		/*
923 		 * If the chain was destroyed try to avoid unnecessary I/O
924 		 * that might not have yet occurred.  Remove the data range
		 * from dedup candidacy and attempt to invalidate the
		 * potentially dirty portion of the I/O buffer.
927 		 */
928 		if (chain->flags & HAMMER2_CHAIN_DESTROY) {
929 			hammer2_io_dedup_delete(hmp,
930 						chain->bref.type,
931 						chain->bref.data_off,
932 						chain->bytes);
933 #if 0
934 			hammer2_io_t *dio;
935 			if (chain->dio) {
936 				hammer2_io_inval(chain->dio,
937 						 chain->bref.data_off,
938 						 chain->bytes);
939 			} else if ((dio = hammer2_io_getquick(hmp,
940 						  chain->bref.data_off,
941 						  chain->bytes,
942 						  1)) != NULL) {
943 				hammer2_io_inval(dio,
944 						 chain->bref.data_off,
945 						 chain->bytes);
946 				hammer2_io_putblk(&dio);
947 			}
948 #endif
949 		}
950 	}
951 
952 	/*
953 	 * If UPDATE is set the parent block table may need to be updated.
	 * This can fail if hammer2_chain_modify() fails.
955 	 *
956 	 * NOTE: UPDATE may be set on vchain or fchain in which case
957 	 *	 parent could be NULL, or on an inode that has not yet
958 	 *	 been inserted into the radix tree.  It's easiest to allow
959 	 *	 the case and test for NULL.  parent can also wind up being
960 	 *	 NULL due to a deletion so we need to handle the case anyway.
961 	 *
962 	 * NOTE: UPDATE can be set when chains are renamed into or out of
963 	 *	 an indirect block, without the chain itself being flagged
964 	 *	 MODIFIED.
965 	 *
966 	 * If no parent exists we can just clear the UPDATE bit.  If the
967 	 * chain gets reattached later on the bit will simply get set
968 	 * again.
969 	 */
970 	if ((chain->flags & HAMMER2_CHAIN_UPDATE) && parent == NULL)
971 		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
972 
973 	/*
974 	 * When flushing an inode outside of a FLUSH_FSSYNC we must NOT
975 	 * update the parent block table to point at the flushed inode.
976 	 * The block table should only ever be updated by the filesystem
977 	 * sync code.  If we do, inode<->inode dependencies (such as
978 	 * directory entries vs inode nlink count) can wind up not being
979 	 * flushed together and result in a broken topology if a crash/reboot
980 	 * occurs at the wrong time.
981 	 */
982 	if (chain->bref.type == HAMMER2_BREF_TYPE_INODE &&
983 	    (flags & HAMMER2_FLUSH_INODE_STOP) &&
984 	    (flags & HAMMER2_FLUSH_FSSYNC) == 0 &&
985 	    (flags & HAMMER2_FLUSH_ALL) == 0 &&
986 	    chain->pmp && chain->pmp->mp) {
987 #ifdef HAMMER2_DEBUG_SYNC
988 		kprintf("inum %ld do not update parent, non-fssync\n",
989 			(long)chain->bref.key);
990 #endif
991 		goto skipupdate;
992 	}
993 #ifdef HAMMER2_DEBUG_SYNC
994 	if (chain->bref.type == HAMMER2_BREF_TYPE_INODE)
995 		kprintf("inum %ld update parent\n", (long)chain->bref.key);
996 #endif
997 
998 	/*
999 	 * The chain may need its blockrefs updated in the parent, normal
1000 	 * path.
1001 	 */
1002 	if (chain->flags & HAMMER2_CHAIN_UPDATE) {
1003 		hammer2_blockref_t *base;
1004 		int count;
1005 
1006 		/*
1007 		 * Clear UPDATE flag, mark parent modified, update its
1008 		 * modify_tid if necessary, and adjust the parent blockmap.
1009 		 */
1010 		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
1011 
1012 		/*
1013 		 * (optional code)
1014 		 *
1015 		 * Avoid actually modifying and updating the parent if it
1016 		 * was flagged for destruction.  This can greatly reduce
		 * disk I/O in large tree removals because the buffer
		 * invalidation in the upward recursion (see the DESTROY
		 * handling above) can only handle a few cases.
1020 		 */
1021 		if (parent->flags & HAMMER2_CHAIN_DESTROY) {
1022 			if (parent->bref.modify_tid < chain->bref.modify_tid) {
1023 				parent->bref.modify_tid =
1024 					chain->bref.modify_tid;
1025 			}
1026 			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_BLKMAPPED |
1027 							HAMMER2_CHAIN_BLKMAPUPD);
1028 			goto skipupdate;
1029 		}
1030 
1031 		/*
1032 		 * The flusher is responsible for deleting empty indirect
1033 		 * blocks at this point.  If we don't do this, no major harm
		 * will be done but the empty indirect blocks will stay in
		 * the topology and make it messy and inefficient.
1036 		 *
1037 		 * The flusher is also responsible for collapsing the
1038 		 * content of an indirect block into its parent whenever
1039 		 * possible (with some hysteresis).  Not doing this will also
1040 		 * not harm the topology, but would make it messy and
1041 		 * inefficient.
1042 		 */
1043 		if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
1044 			if (hammer2_chain_indirect_maintenance(parent, chain))
1045 				goto skipupdate;
1046 		}
1047 
1048 		/*
1049 		 * We are updating the parent's blockmap, the parent must
1050 		 * be set modified.  If this fails we re-set the UPDATE flag
1051 		 * in the child.
1052 		 *
1053 		 * NOTE! A modification error can be ENOSPC.  We still want
1054 		 *	 to flush modified chains recursively, not break out,
1055 		 *	 so we just skip the update in this situation and
1056 		 *	 continue.  That is, we still need to try to clean
1057 		 *	 out dirty chains and buffers.
1058 		 *
1059 		 *	 This may not help bulkfree though. XXX
1060 		 */
1061 		save_error = hammer2_chain_modify(parent, 0, 0, 0);
1062 		if (save_error) {
1063 			info->error |= save_error;
1064 			kprintf("hammer2_flush: %016jx.%02x error=%08x\n",
1065 				parent->bref.data_off, parent->bref.type,
1066 				save_error);
1067 			atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
1068 			goto skipupdate;
1069 		}
1070 		if (parent->bref.modify_tid < chain->bref.modify_tid)
1071 			parent->bref.modify_tid = chain->bref.modify_tid;
1072 
1073 		/*
1074 		 * Calculate blockmap pointer
1075 		 */
1076 		switch(parent->bref.type) {
1077 		case HAMMER2_BREF_TYPE_INODE:
1078 			/*
1079 			 * Access the inode's block array.  However, there is
1080 			 * no block array if the inode is flagged DIRECTDATA.
1081 			 */
1082 			if (parent->data &&
1083 			    (parent->data->ipdata.meta.op_flags &
1084 			     HAMMER2_OPFLAG_DIRECTDATA) == 0) {
1085 				base = &parent->data->
1086 					ipdata.u.blockset.blockref[0];
1087 			} else {
1088 				base = NULL;
1089 			}
1090 			count = HAMMER2_SET_COUNT;
1091 			break;
1092 		case HAMMER2_BREF_TYPE_INDIRECT:
1093 		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1094 			if (parent->data)
1095 				base = &parent->data->npdata[0];
1096 			else
1097 				base = NULL;
1098 			count = parent->bytes / sizeof(hammer2_blockref_t);
1099 			break;
1100 		case HAMMER2_BREF_TYPE_VOLUME:
1101 			base = &chain->hmp->voldata.sroot_blockset.blockref[0];
1102 			count = HAMMER2_SET_COUNT;
1103 			break;
1104 		case HAMMER2_BREF_TYPE_FREEMAP:
1105 			base = &parent->data->npdata[0];
1106 			count = HAMMER2_SET_COUNT;
1107 			break;
1108 		default:
1109 			base = NULL;
1110 			count = 0;
1111 			panic("hammer2_flush_core: "
1112 			      "unrecognized blockref type: %d",
1113 			      parent->bref.type);
1114 			break;
1115 		}
1116 
1117 		/*
1118 		 * Blocktable updates
1119 		 *
1120 		 * We synchronize pending statistics at this time.  Delta
1121 		 * adjustments designated for the current and upper level
1122 		 * are synchronized.
1123 		 */
1124 		if (base && (chain->flags & HAMMER2_CHAIN_BLKMAPUPD)) {
1125 			if (chain->flags & HAMMER2_CHAIN_BLKMAPPED) {
1126 				hammer2_spin_ex(&parent->core.spin);
1127 				hammer2_base_delete(parent, base, count, chain,
1128 						    NULL);
1129 				hammer2_spin_unex(&parent->core.spin);
1130 				/* base_delete clears both bits */
1131 			} else {
1132 				atomic_clear_int(&chain->flags,
1133 						 HAMMER2_CHAIN_BLKMAPUPD);
1134 			}
1135 		}
1136 		if (base && (chain->flags & HAMMER2_CHAIN_BLKMAPPED) == 0) {
1137 			hammer2_spin_ex(&parent->core.spin);
1138 			hammer2_base_insert(parent, base, count,
1139 					    chain, &chain->bref);
1140 			hammer2_spin_unex(&parent->core.spin);
1141 			/* base_insert sets BLKMAPPED */
1142 		}
1143 	}
1144 skipupdate:
1145 	if (parent)
1146 		hammer2_chain_unlock(parent);
1147 
1148 	/*
1149 	 * Final cleanup after flush
1150 	 */
1151 done:
1152 	KKASSERT(chain->refs > 0);
1153 
1154 	return retry;
1155 }
1156 
1157 /*
1158  * Flush recursion helper, called from flush_core, calls flush_core.
1159  *
 * Flushes the children of the caller's chain (info->parent).
1162  *
1163  * This function may set info->error as a side effect.
1164  *
1165  * WARNING! If we do not call hammer2_flush_core() we must update
1166  *	    bref.mirror_tid ourselves to indicate that the flush has
1167  *	    processed the child.
1168  *
1169  * WARNING! parent->core spinlock is held on entry and return.
1170  */
1171 static int
1172 hammer2_flush_recurse(hammer2_chain_t *child, void *data)
1173 {
1174 	hammer2_flush_info_t *info = data;
1175 	hammer2_chain_t *parent = info->parent;
1176 
1177 #ifdef HAMMER2_SCAN_DEBUG
1178 	++info->scan_count;
1179 	if (child->flags & HAMMER2_CHAIN_MODIFIED)
1180 		++info->scan_mod_count;
1181 	if (child->flags & HAMMER2_CHAIN_UPDATE)
1182 		++info->scan_upd_count;
1183 	if (child->flags & HAMMER2_CHAIN_ONFLUSH)
1184 		++info->scan_onf_count;
1185 #endif
1186 
1187 	/*
1188 	 * (child can never be fchain or vchain so a special check isn't
1189 	 *  needed).
1190 	 *
1191 	 * We must ref the child before unlocking the spinlock.
1192 	 *
1193 	 * The caller has added a ref to the parent so we can temporarily
1194 	 * unlock it in order to lock the child.  However, if it no longer
1195 	 * winds up being the child of the parent we must skip this child.
1196 	 *
1197 	 * NOTE! chain locking errors are fatal.  They are never out-of-space
1198 	 *	 errors.
1199 	 */
1200 	hammer2_chain_ref(child);
1201 	hammer2_spin_unex(&parent->core.spin);
1202 
1203 	hammer2_chain_ref_hold(parent);
1204 	hammer2_chain_unlock(parent);
1205 	hammer2_chain_lock(child, HAMMER2_RESOLVE_MAYBE);
1206 	if (child->parent != parent) {
1207 		kprintf("LOST CHILD1 %p->%p (actual parent %p)\n",
1208 			parent, child, child->parent);
1209 		goto done;
1210 	}
1211 	if (child->error) {
1212 		kprintf("CHILD ERROR DURING FLUSH LOCK %p->%p\n",
1213 			parent, child);
1214 		info->error |= child->error;
1215 		goto done;
1216 	}
1217 
1218 	/*
	 * Must propagate the DESTROY flag downwards, otherwise the
	 * parent could end up never being removed: a child surviving
	 * this run without the flag would be requeued to the flusher,
	 * keeping the parent alive.
1223 	 */
1224 	if (parent && (parent->flags & HAMMER2_CHAIN_DESTROY))
1225 		atomic_set_int(&child->flags, HAMMER2_CHAIN_DESTROY);
1226 #ifdef HAMMER2_SCAN_DEBUG
1227 	if (child->flags & HAMMER2_CHAIN_DESTROY)
1228 		++info->scan_del_count;
1229 #endif
1230 	/*
1231 	 * Special handling of the root inode.  Because the root inode
1232 	 * contains an index of all the inodes in the PFS in addition to
1233 	 * its normal directory entries, any flush that is not part of a
1234 	 * filesystem sync must only flush the directory entries, and not
1235 	 * anything else.
1236 	 *
1237 	 * The child might be an indirect block, but H2 guarantees that
1238 	 * the key-range will fully partition the inode index from the
1239 	 * directory entries so the case just works naturally.
1240 	 */
1241 	if ((parent->bref.flags & HAMMER2_BREF_FLAG_PFSROOT) &&
1242 	    (child->flags & HAMMER2_CHAIN_DESTROY) == 0 &&
1243 	    parent->bref.type == HAMMER2_BREF_TYPE_INODE &&
1244 	    (info->flags & HAMMER2_FLUSH_FSSYNC) == 0) {
1245 		if ((child->bref.key & HAMMER2_DIRHASH_VISIBLE) == 0) {
1246 			if (child->flags & HAMMER2_CHAIN_FLUSH_MASK) {
1247 				hammer2_chain_setflush(parent);
1248 			}
1249 			goto done;
1250 		}
1251 	}
1252 
1253 	/*
	 * Recurse and collect deferral data.  We're in the media flush,
	 * so this can cross PFS boundaries.
1256 	 */
1257 	if (child->flags & HAMMER2_CHAIN_FLUSH_MASK) {
1258 #ifdef HAMMER2_SCAN_DEBUG
1259 		if (child->bref.type < 7)
1260 			++info->scan_btype[child->bref.type];
1261 #endif
1262 		++info->depth;
1263 		hammer2_flush_core(info, child, info->flags);
1264 		--info->depth;
1265 	}
1266 
1267 done:
1268 	/*
1269 	 * Relock to continue the loop.
1270 	 */
1271 	hammer2_chain_unlock(child);
1272 	hammer2_chain_lock(parent, HAMMER2_RESOLVE_MAYBE);
1273 	hammer2_chain_drop_unhold(parent);
1274 	if (parent->error) {
1275 		kprintf("PARENT ERROR DURING FLUSH LOCK %p->%p\n",
1276 			parent, child);
1277 		info->error |= parent->error;
1278 	}
1279 	hammer2_chain_drop(child);
1280 	KKASSERT(info->parent == parent);
1281 	hammer2_spin_ex(&parent->core.spin);
1282 
1283 	return (0);
1284 }
1285 
1286 /*
1287  * flush helper (backend threaded)
1288  *
1289  * Flushes chain topology for the specified inode.
1290  *
1291  * HAMMER2_XOP_INODE_STOP	The flush recursion stops at inode boundaries.
1292  *				Inodes belonging to the same flush are flushed
1293  *				separately.
1294  *
1295  * chain->parent can be NULL, usually due to destroy races or detached inodes.
1296  *
1297  * Primarily called from vfs_sync().
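 *
 * A frontend dispatch looks roughly like this (sketch only; see the
 * filesystem sync code for the real sequence and error handling):
 *
 *	xop = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING | flags);
 *	hammer2_xop_start(&xop->head, &hammer2_inode_flush_desc);
 *	error = hammer2_xop_collect(&xop->head, HAMMER2_XOP_COLLECT_WAITALL);
 *	hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);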
1298  */
1299 void
1300 hammer2_xop_inode_flush(hammer2_xop_t *arg, void *scratch __unused, int clindex)
1301 {
1302 	hammer2_xop_flush_t *xop = &arg->xop_flush;
1303 	hammer2_chain_t *chain;
1304 	hammer2_inode_t *ip;
1305 	hammer2_dev_t *hmp;
1306 	hammer2_pfs_t *pmp;
1307 	hammer2_devvp_t *e;
1308 	struct m_vnode *devvp;
1309 	int flush_error = 0;
1310 	int fsync_error = 0;
1311 	int total_error = 0;
1312 	int j;
1313 	int xflags;
1314 	int ispfsroot = 0;
1315 
1316 	xflags = HAMMER2_FLUSH_TOP;
1317 	if (xop->head.flags & HAMMER2_XOP_INODE_STOP)
1318 		xflags |= HAMMER2_FLUSH_INODE_STOP;
1319 	if (xop->head.flags & HAMMER2_XOP_FSSYNC)
1320 		xflags |= HAMMER2_FLUSH_FSSYNC;
1321 
1322 	/*
1323 	 * Flush core chains
1324 	 */
1325 	ip = xop->head.ip1;
1326 	pmp = ip->pmp;
1327 	chain = hammer2_inode_chain(ip, clindex, HAMMER2_RESOLVE_ALWAYS);
1328 	if (chain) {
1329 		hmp = chain->hmp;
1330 		if (chain->flags & HAMMER2_CHAIN_FLUSH_MASK) {
1331 			/*
1332 			 * Due to flush partitioning the chain topology
1333 			 * above the inode's chain may no longer be flagged.
			 * When asked to flush an inode, re-mark the topology
1335 			 * leading to that inode.
1336 			 */
1337 			if (chain->parent)
1338 				hammer2_chain_setflush(chain->parent);
1339 			hammer2_flush(chain, xflags);
1340 
1341 			/* XXX cluster */
1342 			if (ip == pmp->iroot && pmp != hmp->spmp) {
1343 				hammer2_spin_ex(&pmp->blockset_spin);
1344 				pmp->pfs_iroot_blocksets[clindex] =
1345 					chain->data->ipdata.u.blockset;
1346 				hammer2_spin_unex(&pmp->blockset_spin);
1347 			}
1348 
1349 #if 0
1350 			/*
			 * Propagate upwards but only cross an inode boundary
1352 			 * for inodes associated with the current filesystem
1353 			 * sync.
1354 			 */
1355 			if ((xop->head.flags & HAMMER2_XOP_PARENTONFLUSH) ||
1356 			    chain->bref.type != HAMMER2_BREF_TYPE_INODE) {
1357 				parent = chain->parent;
1358 				if (parent)
1359 					hammer2_chain_setflush(parent);
1360 			}
1361 #endif
1362 		}
1363 		if (chain->flags & HAMMER2_CHAIN_PFSBOUNDARY)
1364 			ispfsroot = 1;
1365 		hammer2_chain_unlock(chain);
1366 		hammer2_chain_drop(chain);
1367 		chain = NULL;
1368 	} else {
1369 		hmp = NULL;
1370 	}
1371 
1372 	/*
	 * Only flush the volume header if asked to, and then only if the
	 * inode is also the PFS root.
1375 	 */
1376 	if ((xop->head.flags & HAMMER2_XOP_VOLHDR) == 0)
1377 		goto skip;
1378 	if (ispfsroot == 0)
1379 		goto skip;
1380 
1381 	/*
1382 	 * Flush volume roots.  Avoid replication, we only want to
1383 	 * flush each hammer2_dev (hmp) once.
1384 	 */
1385 	for (j = clindex - 1; j >= 0; --j) {
1386 		if ((chain = ip->cluster.array[j].chain) != NULL) {
1387 			if (chain->hmp == hmp) {
1388 				chain = NULL;	/* safety */
1389 				goto skip;
1390 			}
1391 		}
1392 	}
1393 	chain = NULL;	/* safety */
1394 
1395 	/*
1396 	 * spmp transaction.  The super-root is never directly mounted so
1397 	 * there shouldn't be any vnodes, let alone any dirty vnodes
1398 	 * associated with it, so we shouldn't have to mess around with any
1399 	 * vnode flushes here.
1400 	 */
1401 	hammer2_trans_init(hmp->spmp, HAMMER2_TRANS_ISFLUSH);
1402 
1403 	/*
1404 	 * We must flush the superroot down to the PFS iroot.  Remember
1405 	 * that hammer2_chain_setflush() stops at inode boundaries, so
1406 	 * the pmp->iroot has been flushed and flagged down to the superroot,
1407 	 * but the volume root (vchain) probably has not yet been flagged.
1408 	 */
1409 	if (hmp->spmp->iroot) {
1410 		chain = hmp->spmp->iroot->cluster.array[0].chain;
1411 		if (chain) {
1412 			hammer2_chain_ref(chain);
1413 			hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
1414 			flush_error |=
1415 				hammer2_flush(chain,
1416 					      HAMMER2_FLUSH_TOP |
1417 					      HAMMER2_FLUSH_INODE_STOP |
1418 					      HAMMER2_FLUSH_FSSYNC);
1419 			hammer2_chain_unlock(chain);
1420 			hammer2_chain_drop(chain);
1421 		}
1422 	}
1423 
1424 	/*
1425 	 * Media mounts have two 'roots', vchain for the topology
1426 	 * and fchain for the free block table.  Flush both.
1427 	 *
1428 	 * Note that the topology and free block table are handled
1429 	 * independently, so the free block table can wind up being
1430 	 * ahead of the topology.  We depend on the bulk free scan
1431 	 * code to deal with any loose ends.
1432 	 *
1433 	 * vchain and fchain do not error on-lock since their data does
1434 	 * not have to be re-read from media.
1435 	 */
1436 	hammer2_chain_ref(&hmp->vchain);
1437 	hammer2_chain_lock(&hmp->vchain, HAMMER2_RESOLVE_ALWAYS);
1438 	hammer2_chain_ref(&hmp->fchain);
1439 	hammer2_chain_lock(&hmp->fchain, HAMMER2_RESOLVE_ALWAYS);
1440 	if (hmp->fchain.flags & HAMMER2_CHAIN_FLUSH_MASK) {
1441 		/*
1442 		 * This will also modify vchain as a side effect,
1443 		 * mark vchain as modified now.
1444 		 */
1445 		hammer2_voldata_modify(hmp);
1446 		chain = &hmp->fchain;
1447 		flush_error |= hammer2_flush(chain, HAMMER2_FLUSH_TOP);
1448 		KKASSERT(chain == &hmp->fchain);
1449 	}
1450 	hammer2_chain_unlock(&hmp->fchain);
1451 	hammer2_chain_unlock(&hmp->vchain);
1452 	hammer2_chain_drop(&hmp->fchain);
1453 	/* vchain dropped down below */
1454 
1455 	hammer2_chain_lock(&hmp->vchain, HAMMER2_RESOLVE_ALWAYS);
1456 	if (hmp->vchain.flags & HAMMER2_CHAIN_FLUSH_MASK) {
1457 		chain = &hmp->vchain;
1458 		flush_error |= hammer2_flush(chain, HAMMER2_FLUSH_TOP);
1459 		KKASSERT(chain == &hmp->vchain);
1460 	}
1461 	hammer2_chain_unlock(&hmp->vchain);
1462 	hammer2_chain_drop(&hmp->vchain);
1463 
1464 	/*
1465 	 * We can't safely flush the volume header until we have
1466 	 * flushed any device buffers which have built up.
1467 	 *
1468 	 * XXX this isn't being incremental
1469 	 */
1470 	TAILQ_FOREACH(e, &hmp->devvpl, entry) {
1471 		devvp = e->devvp;
1472 		KKASSERT(devvp);
1473 		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
1474 		fsync_error = VOP_FSYNC(devvp, MNT_WAIT, 0);
1475 		vn_unlock(devvp);
1476 		if (fsync_error || flush_error) {
1477 			kprintf("hammer2: sync error fsync=%d h2flush=0x%04x dev=%s\n",
1478 				fsync_error, flush_error, e->path);
1479 		}
1480 	}
1481 
1482 	/*
1483 	 * The flush code sets CHAIN_VOLUMESYNC to indicate that the
1484 	 * volume header needs synchronization via hmp->volsync.
1485 	 *
1486 	 * XXX synchronize the flag & data with only this flush XXX
1487 	 */
1488 	if (fsync_error == 0 && flush_error == 0 &&
1489 	    (hmp->vchain.flags & HAMMER2_CHAIN_VOLUMESYNC)) {
1490 		struct m_buf *bp;
1491 		int vol_error = 0;
1492 
1493 		/*
1494 		 * Synchronize the disk before flushing the volume
1495 		 * header.
1496 		 */
1497 		/*
1498 		bp = getpbuf(NULL);
1499 		bp->b_bio1.bio_offset = 0;
1500 		bp->b_bufsize = 0;
1501 		bp->b_bcount = 0;
1502 		bp->b_cmd = BUF_CMD_FLUSH;
1503 		bp->b_bio1.bio_done = biodone_sync;
1504 		bp->b_bio1.bio_flags |= BIO_SYNC;
1505 		vn_strategy(hmp->devvp, &bp->b_bio1);
1506 		fsync_error = biowait(&bp->b_bio1, "h2vol");
1507 		relpbuf(bp, NULL);
1508 		*/
1509 
1510 		/*
1511 		 * Then we can safely flush the version of the
1512 		 * volume header synchronized by the flush code.
1513 		 */
1514 		j = hmp->volhdrno + 1;
1515 		if (j < 0)
1516 			j = 0;
1517 		if (j >= HAMMER2_NUM_VOLHDRS)
1518 			j = 0;
1519 		if (j * HAMMER2_ZONE_BYTES64 + HAMMER2_SEGSIZE >
1520 		    hmp->volsync.volu_size) {
1521 			j = 0;
1522 		}
1523 		if (hammer2_debug & 0x8000) {
1524 			/* debug only, avoid syslogd loop */
1525 			kprintf("sync volhdr %d %jd\n",
1526 				j, (intmax_t)hmp->volsync.volu_size);
1527 		}
1528 		bp = getblkx(hmp->devvp, j * HAMMER2_ZONE_BYTES64,
1529 			    HAMMER2_VOLUME_BYTES, GETBLK_KVABIO, 0);
1530 		atomic_clear_int(&hmp->vchain.flags,
1531 				 HAMMER2_CHAIN_VOLUMESYNC);
1532 		bkvasync(bp);
1533 		bcopy(&hmp->volsync, bp->b_data, HAMMER2_VOLUME_BYTES);
1534 		vol_error = bwrite(bp);
1535 		hmp->volhdrno = j;
1536 		if (vol_error)
1537 			fsync_error = vol_error;
1538 	}
1539 	if (flush_error)
1540 		total_error = flush_error;
1541 	if (fsync_error)
1542 		total_error = hammer2_errno_to_error(fsync_error);
1543 
1544 	/* spmp trans */
1545 	hammer2_trans_done(hmp->spmp, HAMMER2_TRANS_ISFLUSH);
1546 skip:
1547 	hammer2_xop_feed(&xop->head, NULL, clindex, total_error);
1548 }
1549