132b800e6SMatthew Dillon /* 28138a154SMatthew Dillon * Copyright (c) 2011-2014 The DragonFly Project. All rights reserved. 332b800e6SMatthew Dillon * 432b800e6SMatthew Dillon * This code is derived from software contributed to The DragonFly Project 532b800e6SMatthew Dillon * by Matthew Dillon <dillon@dragonflybsd.org> 632b800e6SMatthew Dillon * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org> 732b800e6SMatthew Dillon * 832b800e6SMatthew Dillon * Redistribution and use in source and binary forms, with or without 932b800e6SMatthew Dillon * modification, are permitted provided that the following conditions 1032b800e6SMatthew Dillon * are met: 1132b800e6SMatthew Dillon * 1232b800e6SMatthew Dillon * 1. Redistributions of source code must retain the above copyright 1332b800e6SMatthew Dillon * notice, this list of conditions and the following disclaimer. 1432b800e6SMatthew Dillon * 2. Redistributions in binary form must reproduce the above copyright 1532b800e6SMatthew Dillon * notice, this list of conditions and the following disclaimer in 1632b800e6SMatthew Dillon * the documentation and/or other materials provided with the 1732b800e6SMatthew Dillon * distribution. 1832b800e6SMatthew Dillon * 3. Neither the name of The DragonFly Project nor the names of its 1932b800e6SMatthew Dillon * contributors may be used to endorse or promote products derived 2032b800e6SMatthew Dillon * from this software without specific, prior written permission. 2132b800e6SMatthew Dillon * 2232b800e6SMatthew Dillon * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 2332b800e6SMatthew Dillon * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 2432b800e6SMatthew Dillon * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 2532b800e6SMatthew Dillon * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE 2632b800e6SMatthew Dillon * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 2732b800e6SMatthew Dillon * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 2832b800e6SMatthew Dillon * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 2932b800e6SMatthew Dillon * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 3032b800e6SMatthew Dillon * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 3132b800e6SMatthew Dillon * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 3232b800e6SMatthew Dillon * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 3332b800e6SMatthew Dillon * SUCH DAMAGE. 3432b800e6SMatthew Dillon */ 3550456506SMatthew Dillon /* 3650456506SMatthew Dillon * TRANSACTION AND FLUSH HANDLING 3750456506SMatthew Dillon * 3850456506SMatthew Dillon * Deceptively simple but actually fairly difficult to implement properly is 3950456506SMatthew Dillon * how I would describe it. 4050456506SMatthew Dillon * 41da6f36f4SMatthew Dillon * Flushing generally occurs bottom-up but requires a top-down scan to 42da6f36f4SMatthew Dillon * locate chains with MODIFIED and/or UPDATE bits set. The ONFLUSH flag 43da6f36f4SMatthew Dillon * tells how to recurse downward to find these chains. 
4450456506SMatthew Dillon */ 4550456506SMatthew Dillon 4632b800e6SMatthew Dillon #include <sys/cdefs.h> 4732b800e6SMatthew Dillon #include <sys/param.h> 4832b800e6SMatthew Dillon #include <sys/systm.h> 4932b800e6SMatthew Dillon #include <sys/types.h> 5032b800e6SMatthew Dillon #include <sys/lock.h> 5132b800e6SMatthew Dillon #include <sys/uuid.h> 5232b800e6SMatthew Dillon 5332b800e6SMatthew Dillon #include "hammer2.h" 5432b800e6SMatthew Dillon 55925e4ad1SMatthew Dillon #define FLUSH_DEBUG 0 56925e4ad1SMatthew Dillon 57a71db85dSMatthew Dillon #define HAMMER2_FLUSH_DEPTH_LIMIT 10 /* stack recursion limit */ 58a71db85dSMatthew Dillon 59a71db85dSMatthew Dillon 6032b800e6SMatthew Dillon /* 6132b800e6SMatthew Dillon * Recursively flush the specified chain. The chain is locked and 6232b800e6SMatthew Dillon * referenced by the caller and will remain so on return. The chain 6332b800e6SMatthew Dillon * will remain referenced throughout but can temporarily lose its 6432b800e6SMatthew Dillon * lock during the recursion to avoid unnecessarily stalling user 6532b800e6SMatthew Dillon * processes. 
6632b800e6SMatthew Dillon */ 6732b800e6SMatthew Dillon struct hammer2_flush_info { 680dea3156SMatthew Dillon hammer2_chain_t *parent; 690dea3156SMatthew Dillon hammer2_trans_t *trans; 7032b800e6SMatthew Dillon int depth; 710dea3156SMatthew Dillon int diddeferral; 721897c66eSMatthew Dillon int cache_index; 73da6f36f4SMatthew Dillon struct h2_flush_list flushq; 7450456506SMatthew Dillon hammer2_xid_t sync_xid; /* memory synchronization point */ 75e513e77eSMatthew Dillon hammer2_tid_t mirror_tid; /* avoid digging through hmp */ 76e513e77eSMatthew Dillon hammer2_tid_t modify_tid; 77850687d2SMatthew Dillon hammer2_chain_t *debug; 7832b800e6SMatthew Dillon }; 7932b800e6SMatthew Dillon 8032b800e6SMatthew Dillon typedef struct hammer2_flush_info hammer2_flush_info_t; 8132b800e6SMatthew Dillon 828138a154SMatthew Dillon static void hammer2_flush_core(hammer2_flush_info_t *info, 83da6f36f4SMatthew Dillon hammer2_chain_t *chain, int deleting); 84da6f36f4SMatthew Dillon static int hammer2_flush_recurse(hammer2_chain_t *child, void *data); 8593f3933aSMatthew Dillon 8632b800e6SMatthew Dillon /* 8750456506SMatthew Dillon * For now use a global transaction manager. What we ultimately want to do 8850456506SMatthew Dillon * is give each non-overlapping hmp/pmp group its own transaction manager. 8950456506SMatthew Dillon * 9050456506SMatthew Dillon * Transactions govern XID tracking on the physical media (the hmp), but they 9150456506SMatthew Dillon * also govern TID tracking which is per-PFS and thus might cross multiple 92506bd6d1SMatthew Dillon * hmp's. So we can't just stuff tmanage into hammer2_dev or 93506bd6d1SMatthew Dillon * hammer2_pfs. 
9450456506SMatthew Dillon */ 9550456506SMatthew Dillon void 96*9450e866SMatthew Dillon hammer2_trans_manage_init(hammer2_trans_manage_t *tman) 9750456506SMatthew Dillon { 98*9450e866SMatthew Dillon lockinit(&tman->translk, "h2trans", 0, 0); 99*9450e866SMatthew Dillon TAILQ_INIT(&tman->transq); 100*9450e866SMatthew Dillon tman->flush_xid = 1; 101*9450e866SMatthew Dillon tman->alloc_xid = tman->flush_xid + 1; 10250456506SMatthew Dillon } 10350456506SMatthew Dillon 10450456506SMatthew Dillon hammer2_xid_t 105*9450e866SMatthew Dillon hammer2_trans_newxid(hammer2_pfs_t *pmp) 10650456506SMatthew Dillon { 10750456506SMatthew Dillon hammer2_xid_t xid; 10850456506SMatthew Dillon 10950456506SMatthew Dillon for (;;) { 110*9450e866SMatthew Dillon xid = atomic_fetchadd_int(&pmp->tmanage.alloc_xid, 1); 11150456506SMatthew Dillon if (xid) 11250456506SMatthew Dillon break; 11350456506SMatthew Dillon } 11450456506SMatthew Dillon return xid; 11550456506SMatthew Dillon } 11650456506SMatthew Dillon 11750456506SMatthew Dillon /* 1180dea3156SMatthew Dillon * Transaction support functions for writing to the filesystem. 1190dea3156SMatthew Dillon * 12010136ab6SMatthew Dillon * Initializing a new transaction allocates a transaction ID. Typically 12110136ab6SMatthew Dillon * passed a pmp (hmp passed as NULL), indicating a cluster transaction. Can 12210136ab6SMatthew Dillon * be passed a NULL pmp and non-NULL hmp to indicate a transaction on a single 12310136ab6SMatthew Dillon * media target. The latter mode is used by the recovery code. 12410136ab6SMatthew Dillon * 125623d43d4SMatthew Dillon * TWO TRANSACTION IDs can run concurrently, where one is a flush and the 126623d43d4SMatthew Dillon * other is a set of any number of concurrent filesystem operations. We 127623d43d4SMatthew Dillon * can either have <running_fs_ops> + <waiting_flush> + <blocked_fs_ops> 128623d43d4SMatthew Dillon * or we can have <running_flush> + <concurrent_fs_ops>. 
 *
 * During a flush, new fs_ops are only blocked until the fs_ops prior to
 * the flush complete.  The new fs_ops can then run concurrent with the flush.
 *
 * Buffer-cache transactions operate as fs_ops but never block.  A
 * buffer-cache flush will run either before or after the current pending
 * flush depending on its state.
 */
void
hammer2_trans_init(hammer2_trans_t *trans, hammer2_pfs_t *pmp, int flags)
{
	hammer2_trans_manage_t *tman;
	hammer2_trans_t *head;

	tman = &pmp->tmanage;

	/* Caller-supplied trans is zeroed and filled in under translk */
	bzero(trans, sizeof(*trans));
	trans->pmp = pmp;
	trans->flags = flags;
	trans->td = curthread;

	lockmgr(&tman->translk, LK_EXCLUSIVE);

	if (flags & HAMMER2_TRANS_ISFLUSH) {
		/*
		 * If multiple flushes are trying to run we have to
		 * wait until it is our turn.  All flushes are serialized.
		 *
		 * We queue ourselves and then wait to become the head
		 * of the queue, allowing all prior flushes to complete.
		 *
		 * Multiple normal transactions can share the current
		 * transaction id but a flush transaction needs its own
		 * unique TID for proper block table update accounting.
		 */
		++tman->flushcnt;
		++pmp->modify_tid;
		tman->flush_xid = hammer2_trans_newxid(pmp);
		trans->sync_xid = tman->flush_xid;
		trans->modify_tid = pmp->modify_tid;
		TAILQ_INSERT_TAIL(&tman->transq, trans, entry);
		if (TAILQ_FIRST(&tman->transq) != trans) {
			trans->blocked = 1;
			while (trans->blocked) {
				/* woken by hammer2_trans_done() */
				lksleep(&trans->sync_xid, &tman->translk,
					0, "h2multf", hz);
			}
		}
	} else if (tman->flushcnt == 0) {
		/*
		 * No flushes are pending, we can go.  Use prior flush_xid + 1.
		 *
		 * WARNING!  Also see hammer2_chain_setflush()
		 */
		TAILQ_INSERT_TAIL(&tman->transq, trans, entry);
		trans->sync_xid = tman->flush_xid + 1;

		/* XXX improve/optimize inode allocation */
	} else if (trans->flags & HAMMER2_TRANS_BUFCACHE) {
		/*
		 * A buffer cache transaction is requested while a flush
		 * is in progress.  The flush's PREFLUSH flag must be set
		 * in this situation.
		 *
		 * The buffer cache flush takes on the main flush's
		 * transaction id.
		 */
		TAILQ_FOREACH(head, &tman->transq, entry) {
			if (head->flags & HAMMER2_TRANS_ISFLUSH)
				break;
		}
		KKASSERT(head);
		KKASSERT(head->flags & HAMMER2_TRANS_PREFLUSH);
		trans->flags |= HAMMER2_TRANS_PREFLUSH;
		/* insert directly behind the flush we piggy-back on */
		TAILQ_INSERT_AFTER(&tman->transq, head, trans, entry);
		trans->sync_xid = head->sync_xid;
		trans->modify_tid = head->modify_tid;
		trans->flags |= HAMMER2_TRANS_CONCURRENT;
		/* not allowed to block */
	} else {
		/*
		 * A normal transaction is requested while a flush is in
		 * progress.  We insert after the current flush and may
		 * block.
		 *
		 * WARNING!  Also see hammer2_chain_setflush()
		 */
		TAILQ_FOREACH(head, &tman->transq, entry) {
			if (head->flags & HAMMER2_TRANS_ISFLUSH)
				break;
		}
		KKASSERT(head);
		TAILQ_INSERT_AFTER(&tman->transq, head, trans, entry);
		trans->sync_xid = head->sync_xid + 1;
		trans->flags |= HAMMER2_TRANS_CONCURRENT;

		/*
		 * XXX for now we must block new transactions, synchronous
		 * flush mode is on by default.
		 *
		 * If synchronous flush mode is enabled concurrent
		 * frontend transactions during the flush are not
		 * allowed (except we don't have a choice for buffer
		 * cache ops).
		 */
		if (hammer2_synchronous_flush > 0 ||
		    TAILQ_FIRST(&tman->transq) != head) {
			trans->blocked = 1;
			while (trans->blocked) {
				lksleep(&trans->sync_xid, &tman->translk,
					0, "h2multf", hz);
			}
		}
	}
	if (flags & HAMMER2_TRANS_NEWINODE) {
		if (pmp->spmp_hmp) {
			/*
			 * Super-root transaction, all new inodes have an
			 * inode number of 1.  Normal pfs inode cache
			 * semantics are not used.
			 */
			trans->inode_tid = 1;
		} else {
			/*
			 * Normal transaction
			 */
			if (pmp->inode_tid < HAMMER2_INODE_START)
				pmp->inode_tid = HAMMER2_INODE_START;
			trans->inode_tid = pmp->inode_tid++;
		}
	}

	lockmgr(&tman->translk, LK_RELEASE);
}

/*
 * Debug assertion for strategy (buffer-cache) operations: if any flush
 * transaction is in progress, the leading flush on the queue must have
 * its PREFLUSH flag set (i.e. buffer-cache ops may legally piggy-back).
 */
void
hammer2_trans_assert_strategy(hammer2_pfs_t *pmp)
{
	hammer2_trans_manage_t *tman;
	hammer2_trans_t *head;

	tman = &pmp->tmanage;
	lockmgr(&tman->translk, LK_EXCLUSIVE);
	if (tman->flushcnt) {
		TAILQ_FOREACH(head, &tman->transq, entry) {
			if (head->flags & HAMMER2_TRANS_ISFLUSH)
				break;
		}
		KKASSERT(head);
		KKASSERT(head->flags & HAMMER2_TRANS_PREFLUSH);
	}
	lockmgr(&tman->translk, LK_RELEASE);
}

/*
 * Terminate a transaction: dequeue it and unblock any transactions that
 * were waiting on it.
 */
void
hammer2_trans_done(hammer2_trans_t *trans)
{
	hammer2_trans_manage_t *tman;
	hammer2_trans_t *head;
	hammer2_trans_t *scan;

	tman = &trans->pmp->tmanage;

	/*
	 * Remove.
	 */
	lockmgr(&tman->translk, LK_EXCLUSIVE);
	TAILQ_REMOVE(&tman->transq, trans, entry);
	head = TAILQ_FIRST(&tman->transq);

	/*
	 * Adjust flushcnt if this was a flush, clear TRANS_CONCURRENT
	 * up through the next flush.  (If the head is a flush then we
	 * stop there, unlike the unblock code following this section).
	 */
	if (trans->flags & HAMMER2_TRANS_ISFLUSH) {
		--tman->flushcnt;
		scan = head;
		while (scan && (scan->flags & HAMMER2_TRANS_ISFLUSH) == 0) {
			atomic_clear_int(&scan->flags,
					 HAMMER2_TRANS_CONCURRENT);
			scan = TAILQ_NEXT(scan, entry);
		}
	}

	/*
	 * Unblock the head of the queue and any additional transactions
	 * up to the next flush.  The head can be a flush and it will be
	 * unblocked along with the non-flush transactions following it
	 * (which are allowed to run concurrently with it).
	 *
	 * In synchronous flush mode we stop if the head transaction is
	 * a flush.
	 */
	if (head && head->blocked) {
		head->blocked = 0;
		wakeup(&head->sync_xid);

		if (hammer2_synchronous_flush > 0)
			scan = head;
		else
			scan = TAILQ_NEXT(head, entry);
		while (scan && (scan->flags & HAMMER2_TRANS_ISFLUSH) == 0) {
			if (scan->blocked) {
				scan->blocked = 0;
				wakeup(&scan->sync_xid);
			}
			scan = TAILQ_NEXT(scan, entry);
		}
	}
	lockmgr(&tman->translk, LK_RELEASE);
}

/*
 * Flush the chain and all modified sub-chains through the specified
 * synchronization point, propagating parent chain modifications, modify_tid,
 * and mirror_tid updates back up as needed.
 *
 * Caller must have interlocked against any non-flush-related modifying
 * operations in progress whose XXX values are less than or equal
 * to the passed sync_xid.
 *
 * Caller must have already vetted synchronization points to ensure they
 * are properly flushed.  Only snapshots and cluster flushes can create
 * these sorts of synchronization points.
 *
 * This routine can be called from several places but the most important
 * is from VFS_SYNC.
 *
 * chain is locked on call and will remain locked on return.  The chain's
 * UPDATE flag indicates that its parent's block table (which is not yet
 * part of the flush) should be updated.  The chain may be replaced by
 * the call if it was modified.
 */
void
hammer2_flush(hammer2_trans_t *trans, hammer2_chain_t *chain, int istop)
{
	hammer2_chain_t *scan;
	hammer2_flush_info_t info;
	int loops;

	/*
	 * Execute the recursive flush and handle deferrals.
	 *
	 * Chains can be ridiculously long (thousands deep), so to
	 * avoid blowing out the kernel stack the recursive flush has a
	 * depth limit.  Elements at the limit are placed on a list
	 * for re-execution after the stack has been popped.
	 */
	bzero(&info, sizeof(info));
	TAILQ_INIT(&info.flushq);
	info.trans = trans;
	info.sync_xid = trans->sync_xid;
	info.cache_index = -1;

	/*
	 * Calculate parent (can be NULL), if not NULL the flush core
	 * expects the parent to be referenced so it can easily lock/unlock
	 * it without it getting ripped up.
	 */
	if ((info.parent = chain->parent) != NULL)
		hammer2_chain_ref(info.parent);

	/*
	 * Extra ref needed because flush_core expects it when replacing
	 * chain.
	 */
	hammer2_chain_ref(chain);
	loops = 0;

	for (;;) {
		/*
		 * Unwind deep recursions which had been deferred.  This
		 * can leave the FLUSH_* bits set for these chains, which
		 * will be handled when we [re]flush chain after the unwind.
		 */
		while ((scan = TAILQ_FIRST(&info.flushq)) != NULL) {
			KKASSERT(scan->flags & HAMMER2_CHAIN_DEFERRED);
			TAILQ_REMOVE(&info.flushq, scan, flush_node);
			atomic_clear_int(&scan->flags, HAMMER2_CHAIN_DEFERRED);

			/*
			 * Now that we've popped back up we can do a secondary
			 * recursion on the deferred elements.
			 *
			 * NOTE: hammer2_flush() may replace scan.
			 */
			if (hammer2_debug & 0x0040)
				kprintf("deferred flush %p\n", scan);
			hammer2_chain_lock(scan, HAMMER2_RESOLVE_MAYBE);
			hammer2_flush(trans, scan, 0);
			hammer2_chain_unlock(scan);
			hammer2_chain_drop(scan);	/* ref from deferral */
		}

		/*
		 * [re]flush chain.
		 */
		info.diddeferral = 0;
		hammer2_flush_core(&info, chain, istop);

		/*
		 * Only loop if deep recursions have been deferred.
		 */
		if (TAILQ_EMPTY(&info.flushq))
			break;

		if (++loops % 1000 == 0) {
			kprintf("hammer2_flush: excessive loops on %p\n",
				chain);
			if (hammer2_debug & 0x100000)
				Debugger("hell4");
		}
	}
	/* drop our extra ref; parent ref taken above (if any) */
	hammer2_chain_drop(chain);
	if (info.parent)
		hammer2_chain_drop(info.parent);
}

/*
 * This is the core of the chain flushing code.  The chain is locked by the
 * caller and must also have an extra ref on it by the caller, and remains
 * locked and will have an extra ref on return.  Upon return, the caller can
 * test the UPDATE bit on the child to determine if the parent needs updating.
453a7720be7SMatthew Dillon * 4548138a154SMatthew Dillon * (1) Determine if this node is a candidate for the flush, return if it is 4558138a154SMatthew Dillon * not. fchain and vchain are always candidates for the flush. 4560dea3156SMatthew Dillon * 4578138a154SMatthew Dillon * (2) If we recurse too deep the chain is entered onto the deferral list and 4588138a154SMatthew Dillon * the current flush stack is aborted until after the deferral list is 4598138a154SMatthew Dillon * run. 4608138a154SMatthew Dillon * 4618138a154SMatthew Dillon * (3) Recursively flush live children (rbtree). This can create deferrals. 462da6f36f4SMatthew Dillon * A successful flush clears the MODIFIED and UPDATE bits on the children 463da6f36f4SMatthew Dillon * and typically causes the parent to be marked MODIFIED as the children 464da6f36f4SMatthew Dillon * update the parent's block table. A parent might already be marked 465da6f36f4SMatthew Dillon * MODIFIED due to a deletion (whos blocktable update in the parent is 466da6f36f4SMatthew Dillon * handled by the frontend), or if the parent itself is modified by the 467da6f36f4SMatthew Dillon * frontend for other reasons. 4688138a154SMatthew Dillon * 469da6f36f4SMatthew Dillon * (4) Permanently disconnected sub-trees are cleaned up by the front-end. 470da6f36f4SMatthew Dillon * Deleted-but-open inodes can still be individually flushed via the 471da6f36f4SMatthew Dillon * filesystem syncer. 4728138a154SMatthew Dillon * 473da6f36f4SMatthew Dillon * (5) Note that an unmodified child may still need the block table in its 474da6f36f4SMatthew Dillon * parent updated (e.g. rename/move). The child will have UPDATE set 475da6f36f4SMatthew Dillon * in this case. 4768138a154SMatthew Dillon * 47750456506SMatthew Dillon * WARNING ON BREF MODIFY_TID/MIRROR_TID 478925e4ad1SMatthew Dillon * 479e513e77eSMatthew Dillon * blockref.modify_tid is consistent only within a PFS, and will not be 480e513e77eSMatthew Dillon * consistent during synchronization. 
mirror_tid is consistent across the 481e513e77eSMatthew Dillon * block device regardless of the PFS. 482476d2aadSMatthew Dillon */ 48332b800e6SMatthew Dillon static void 484da6f36f4SMatthew Dillon hammer2_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain, 485*9450e866SMatthew Dillon int istop) 48632b800e6SMatthew Dillon { 487da6f36f4SMatthew Dillon hammer2_chain_t *parent; 488506bd6d1SMatthew Dillon hammer2_dev_t *hmp; 489925e4ad1SMatthew Dillon int diddeferral; 490da6f36f4SMatthew Dillon 491da6f36f4SMatthew Dillon /* 492da6f36f4SMatthew Dillon * (1) Optimize downward recursion to locate nodes needing action. 493da6f36f4SMatthew Dillon * Nothing to do if none of these flags are set. 494da6f36f4SMatthew Dillon */ 495850687d2SMatthew Dillon if ((chain->flags & HAMMER2_CHAIN_FLUSH_MASK) == 0) { 496850687d2SMatthew Dillon if (hammer2_debug & 0x200) { 497850687d2SMatthew Dillon if (info->debug == NULL) 498850687d2SMatthew Dillon info->debug = chain; 499850687d2SMatthew Dillon } else { 500da6f36f4SMatthew Dillon return; 501850687d2SMatthew Dillon } 502850687d2SMatthew Dillon } 50332b800e6SMatthew Dillon 504a5913bdfSMatthew Dillon hmp = chain->hmp; 505925e4ad1SMatthew Dillon diddeferral = info->diddeferral; 506da6f36f4SMatthew Dillon parent = info->parent; /* can be NULL */ 507925e4ad1SMatthew Dillon 5080924b3f8SMatthew Dillon /* 509da6f36f4SMatthew Dillon * Downward search recursion 510ea155208SMatthew Dillon */ 511925e4ad1SMatthew Dillon if (chain->flags & HAMMER2_CHAIN_DEFERRED) { 512da6f36f4SMatthew Dillon /* 513da6f36f4SMatthew Dillon * Already deferred. 514da6f36f4SMatthew Dillon */ 515925e4ad1SMatthew Dillon ++info->diddeferral; 516925e4ad1SMatthew Dillon } else if (info->depth == HAMMER2_FLUSH_DEPTH_LIMIT) { 517da6f36f4SMatthew Dillon /* 518da6f36f4SMatthew Dillon * Recursion depth reached. 
519da6f36f4SMatthew Dillon */ 5200dea3156SMatthew Dillon hammer2_chain_ref(chain); 521da6f36f4SMatthew Dillon TAILQ_INSERT_TAIL(&info->flushq, chain, flush_node); 522da6f36f4SMatthew Dillon atomic_set_int(&chain->flags, HAMMER2_CHAIN_DEFERRED); 523925e4ad1SMatthew Dillon ++info->diddeferral; 524*9450e866SMatthew Dillon } else if ((chain->flags & HAMMER2_CHAIN_PFSBOUNDARY) && istop == 0) { 525*9450e866SMatthew Dillon /* 526*9450e866SMatthew Dillon * We do not recurse through PFSROOTs. PFSROOT flushes are 527*9450e866SMatthew Dillon * handled by the related pmp's (whether mounted or not, 528*9450e866SMatthew Dillon * including during recovery). 529*9450e866SMatthew Dillon * 530*9450e866SMatthew Dillon * But we must still process the PFSROOT chains for block 531*9450e866SMatthew Dillon * table updates in their parent (which IS part of our flush). 532*9450e866SMatthew Dillon * 533*9450e866SMatthew Dillon * Note that the volume root, vchain, does not set this flag. 534*9450e866SMatthew Dillon */ 535*9450e866SMatthew Dillon ; 536da6f36f4SMatthew Dillon } else if (chain->flags & HAMMER2_CHAIN_ONFLUSH) { 5378138a154SMatthew Dillon /* 538da6f36f4SMatthew Dillon * Downward recursion search (actual flush occurs bottom-up). 539da6f36f4SMatthew Dillon * pre-clear ONFLUSH. It can get set again due to races, 540da6f36f4SMatthew Dillon * which we want so the scan finds us again in the next flush. 541*9450e866SMatthew Dillon * These races can also include 542*9450e866SMatthew Dillon * 543*9450e866SMatthew Dillon * Flush recursions stop at PFSROOT boundaries. Each PFS 544*9450e866SMatthew Dillon * must be individually flushed and then the root must 545*9450e866SMatthew Dillon * be flushed. 
5468138a154SMatthew Dillon */ 547da6f36f4SMatthew Dillon atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONFLUSH); 5488138a154SMatthew Dillon info->parent = chain; 54994491fa0SMatthew Dillon hammer2_spin_ex(&chain->core.spin); 550da6f36f4SMatthew Dillon RB_SCAN(hammer2_chain_tree, &chain->core.rbtree, 551da6f36f4SMatthew Dillon NULL, hammer2_flush_recurse, info); 55294491fa0SMatthew Dillon hammer2_spin_unex(&chain->core.spin); 553da6f36f4SMatthew Dillon info->parent = parent; 554da6f36f4SMatthew Dillon if (info->diddeferral) 555da6f36f4SMatthew Dillon hammer2_chain_setflush(info->trans, chain); 5568138a154SMatthew Dillon } 5570924b3f8SMatthew Dillon 55832b800e6SMatthew Dillon /* 559da6f36f4SMatthew Dillon * Now we are in the bottom-up part of the recursion. 560da6f36f4SMatthew Dillon * 561da6f36f4SMatthew Dillon * Do not update chain if lower layers were deferred. 5628138a154SMatthew Dillon */ 563da6f36f4SMatthew Dillon if (info->diddeferral) 5648138a154SMatthew Dillon goto done; 5658138a154SMatthew Dillon 5668138a154SMatthew Dillon /* 567da6f36f4SMatthew Dillon * Propagate the DESTROY flag downwards. This dummies up the flush 568da6f36f4SMatthew Dillon * code and tries to invalidate related buffer cache buffers to 569da6f36f4SMatthew Dillon * avoid the disk write. 570623d43d4SMatthew Dillon */ 571da6f36f4SMatthew Dillon if (parent && (parent->flags & HAMMER2_CHAIN_DESTROY)) 572da6f36f4SMatthew Dillon atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY); 573623d43d4SMatthew Dillon 574623d43d4SMatthew Dillon /* 575da6f36f4SMatthew Dillon * Chain was already modified or has become modified, flush it out. 
576da6f36f4SMatthew Dillon */ 577da6f36f4SMatthew Dillon again: 578850687d2SMatthew Dillon if ((hammer2_debug & 0x200) && 579850687d2SMatthew Dillon info->debug && 580850687d2SMatthew Dillon (chain->flags & (HAMMER2_CHAIN_MODIFIED | HAMMER2_CHAIN_UPDATE))) { 581850687d2SMatthew Dillon hammer2_chain_t *scan = chain; 582850687d2SMatthew Dillon 583850687d2SMatthew Dillon kprintf("DISCONNECTED FLUSH %p->%p\n", info->debug, chain); 584850687d2SMatthew Dillon while (scan) { 585850687d2SMatthew Dillon kprintf(" chain %p [%08x] bref=%016jx:%02x\n", 586850687d2SMatthew Dillon scan, scan->flags, 587850687d2SMatthew Dillon scan->bref.key, scan->bref.type); 588850687d2SMatthew Dillon if (scan == info->debug) 589850687d2SMatthew Dillon break; 590850687d2SMatthew Dillon scan = scan->parent; 591850687d2SMatthew Dillon } 592850687d2SMatthew Dillon } 593850687d2SMatthew Dillon 594da6f36f4SMatthew Dillon if (chain->flags & HAMMER2_CHAIN_MODIFIED) { 595da6f36f4SMatthew Dillon /* 596e513e77eSMatthew Dillon * Dispose of the modified bit. 597e513e77eSMatthew Dillon * 598e513e77eSMatthew Dillon * UPDATE should already be set. 599e513e77eSMatthew Dillon * bref.mirror_tid should already be set. 60032b800e6SMatthew Dillon */ 601da6f36f4SMatthew Dillon KKASSERT((chain->flags & HAMMER2_CHAIN_UPDATE) || 60250456506SMatthew Dillon chain == &hmp->vchain); 6030dea3156SMatthew Dillon atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED); 6048db69c9fSMatthew Dillon 6058db69c9fSMatthew Dillon /* 606e513e77eSMatthew Dillon * Manage threads waiting for excessive dirty memory to 607e513e77eSMatthew Dillon * be retired. 
6088db69c9fSMatthew Dillon */ 609e513e77eSMatthew Dillon if (chain->pmp) 610e513e77eSMatthew Dillon hammer2_pfs_memory_wakeup(chain->pmp); 6118138a154SMatthew Dillon 612da6f36f4SMatthew Dillon if ((chain->flags & HAMMER2_CHAIN_UPDATE) || 6138138a154SMatthew Dillon chain == &hmp->vchain || 6148138a154SMatthew Dillon chain == &hmp->fchain) { 6158138a154SMatthew Dillon /* 6168138a154SMatthew Dillon * Drop the ref from the MODIFIED bit we cleared, 6178138a154SMatthew Dillon * net -1 ref. 6188138a154SMatthew Dillon */ 6190dea3156SMatthew Dillon hammer2_chain_drop(chain); 6208138a154SMatthew Dillon } else { 6218138a154SMatthew Dillon /* 6228138a154SMatthew Dillon * Drop the ref from the MODIFIED bit we cleared and 623da6f36f4SMatthew Dillon * set a ref for the UPDATE bit we are setting. Net 624da6f36f4SMatthew Dillon * 0 refs. 6258138a154SMatthew Dillon */ 626da6f36f4SMatthew Dillon atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE); 6270dea3156SMatthew Dillon } 6280dea3156SMatthew Dillon 6290dea3156SMatthew Dillon /* 630a71db85dSMatthew Dillon * Issue the flush. This is indirect via the DIO. 6310dea3156SMatthew Dillon * 632a71db85dSMatthew Dillon * NOTE: A DELETED node that reaches this point must be 633a71db85dSMatthew Dillon * flushed for synchronization point consistency. 634a71db85dSMatthew Dillon * 635a71db85dSMatthew Dillon * NOTE: Even though MODIFIED was already set, the related DIO 636a71db85dSMatthew Dillon * might not be dirty due to a system buffer cache 637a71db85dSMatthew Dillon * flush and must be set dirty if we are going to make 638a71db85dSMatthew Dillon * further modifications to the buffer. Chains with 639a71db85dSMatthew Dillon * embedded data don't need this. 
6400dea3156SMatthew Dillon */ 641a7720be7SMatthew Dillon if (hammer2_debug & 0x1000) { 642da6f36f4SMatthew Dillon kprintf("Flush %p.%d %016jx/%d sync_xid=%08x " 643da6f36f4SMatthew Dillon "data=%016jx\n", 644a7720be7SMatthew Dillon chain, chain->bref.type, 645a7720be7SMatthew Dillon chain->bref.key, chain->bref.keybits, 646da6f36f4SMatthew Dillon info->sync_xid, 647da6f36f4SMatthew Dillon chain->bref.data_off); 648a7720be7SMatthew Dillon } 649a7720be7SMatthew Dillon if (hammer2_debug & 0x2000) { 650a7720be7SMatthew Dillon Debugger("Flush hell"); 651a7720be7SMatthew Dillon } 65210136ab6SMatthew Dillon 65332b800e6SMatthew Dillon /* 654da6f36f4SMatthew Dillon * Update chain CRCs for flush. 65532b800e6SMatthew Dillon * 656da6f36f4SMatthew Dillon * NOTE: Volume headers are NOT flushed here as they require 657da6f36f4SMatthew Dillon * special processing. 65832b800e6SMatthew Dillon */ 65932b800e6SMatthew Dillon switch(chain->bref.type) { 6601a7cfe5aSMatthew Dillon case HAMMER2_BREF_TYPE_FREEMAP: 661a71db85dSMatthew Dillon /* 662e513e77eSMatthew Dillon * Update the volume header's freemap_tid to the 663e513e77eSMatthew Dillon * freemap's flushing mirror_tid. 
664e513e77eSMatthew Dillon * 665a71db85dSMatthew Dillon * (note: embedded data, do not call setdirty) 666a71db85dSMatthew Dillon */ 66750456506SMatthew Dillon KKASSERT(hmp->vchain.flags & HAMMER2_CHAIN_MODIFIED); 668e513e77eSMatthew Dillon KKASSERT(chain == &hmp->fchain); 669e513e77eSMatthew Dillon hmp->voldata.freemap_tid = chain->bref.mirror_tid; 670e513e77eSMatthew Dillon kprintf("sync freemap mirror_tid %08jx\n", 671e513e77eSMatthew Dillon (intmax_t)chain->bref.mirror_tid); 672e513e77eSMatthew Dillon 673e513e77eSMatthew Dillon /* 674e513e77eSMatthew Dillon * The freemap can be flushed independently of the 675e513e77eSMatthew Dillon * main topology, but for the case where it is 676e513e77eSMatthew Dillon * flushed in the same transaction, and flushed 677e513e77eSMatthew Dillon * before vchain (a case we want to allow for 678e513e77eSMatthew Dillon * performance reasons), make sure modifications 679e513e77eSMatthew Dillon * made during the flush under vchain use a new 680e513e77eSMatthew Dillon * transaction id. 681e513e77eSMatthew Dillon * 682e513e77eSMatthew Dillon * Otherwise the mount recovery code will get confused. 683e513e77eSMatthew Dillon */ 684e513e77eSMatthew Dillon ++hmp->voldata.mirror_tid; 6851a7cfe5aSMatthew Dillon break; 68632b800e6SMatthew Dillon case HAMMER2_BREF_TYPE_VOLUME: 68732b800e6SMatthew Dillon /* 688e513e77eSMatthew Dillon * The free block table is flushed by 689e513e77eSMatthew Dillon * hammer2_vfs_sync() before it flushes vchain. 690e513e77eSMatthew Dillon * We must still hold fchain locked while copying 691e513e77eSMatthew Dillon * voldata to volsync, however. 
692a71db85dSMatthew Dillon * 693a71db85dSMatthew Dillon * (note: embedded data, do not call setdirty) 6941a7cfe5aSMatthew Dillon */ 69550456506SMatthew Dillon hammer2_voldata_lock(hmp); 696da6f36f4SMatthew Dillon hammer2_chain_lock(&hmp->fchain, 697da6f36f4SMatthew Dillon HAMMER2_RESOLVE_ALWAYS); 698e513e77eSMatthew Dillon kprintf("sync volume mirror_tid %08jx\n", 699da6f36f4SMatthew Dillon (intmax_t)chain->bref.mirror_tid); 7001a7cfe5aSMatthew Dillon 7011a7cfe5aSMatthew Dillon /* 702e513e77eSMatthew Dillon * Update the volume header's mirror_tid to the 703e513e77eSMatthew Dillon * main topology's flushing mirror_tid. It is 704e513e77eSMatthew Dillon * possible that voldata.mirror_tid is already 705e513e77eSMatthew Dillon * beyond bref.mirror_tid due to the bump we made 706e513e77eSMatthew Dillon * above in BREF_TYPE_FREEMAP. 707e513e77eSMatthew Dillon */ 708e513e77eSMatthew Dillon if (hmp->voldata.mirror_tid < chain->bref.mirror_tid) { 709e513e77eSMatthew Dillon hmp->voldata.mirror_tid = 710e513e77eSMatthew Dillon chain->bref.mirror_tid; 711e513e77eSMatthew Dillon } 712e513e77eSMatthew Dillon 713e513e77eSMatthew Dillon /* 714da6f36f4SMatthew Dillon * The volume header is flushed manually by the 715da6f36f4SMatthew Dillon * syncer, not here. All we do here is adjust the 716da6f36f4SMatthew Dillon * crc's. 
71732b800e6SMatthew Dillon */ 71832b800e6SMatthew Dillon KKASSERT(chain->data != NULL); 719fdf62707SMatthew Dillon KKASSERT(chain->dio == NULL); 72032b800e6SMatthew Dillon 72132b800e6SMatthew Dillon hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT1]= 72232b800e6SMatthew Dillon hammer2_icrc32( 72332b800e6SMatthew Dillon (char *)&hmp->voldata + 72432b800e6SMatthew Dillon HAMMER2_VOLUME_ICRC1_OFF, 72532b800e6SMatthew Dillon HAMMER2_VOLUME_ICRC1_SIZE); 72632b800e6SMatthew Dillon hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT0]= 72732b800e6SMatthew Dillon hammer2_icrc32( 72832b800e6SMatthew Dillon (char *)&hmp->voldata + 72932b800e6SMatthew Dillon HAMMER2_VOLUME_ICRC0_OFF, 73032b800e6SMatthew Dillon HAMMER2_VOLUME_ICRC0_SIZE); 73132b800e6SMatthew Dillon hmp->voldata.icrc_volheader = 73232b800e6SMatthew Dillon hammer2_icrc32( 73332b800e6SMatthew Dillon (char *)&hmp->voldata + 73432b800e6SMatthew Dillon HAMMER2_VOLUME_ICRCVH_OFF, 73532b800e6SMatthew Dillon HAMMER2_VOLUME_ICRCVH_SIZE); 736e513e77eSMatthew Dillon 737e513e77eSMatthew Dillon kprintf("syncvolhdr %016jx %016jx\n", 738e513e77eSMatthew Dillon hmp->voldata.mirror_tid, 739e513e77eSMatthew Dillon hmp->vchain.bref.mirror_tid); 74032b800e6SMatthew Dillon hmp->volsync = hmp->voldata; 7410dea3156SMatthew Dillon atomic_set_int(&chain->flags, HAMMER2_CHAIN_VOLUMESYNC); 74293f3933aSMatthew Dillon hammer2_chain_unlock(&hmp->fchain); 74350456506SMatthew Dillon hammer2_voldata_unlock(hmp); 74432b800e6SMatthew Dillon break; 74532b800e6SMatthew Dillon case HAMMER2_BREF_TYPE_DATA: 74632b800e6SMatthew Dillon /* 747da6f36f4SMatthew Dillon * Data elements have already been flushed via the 748da6f36f4SMatthew Dillon * logical file buffer cache. Their hash was set in 749a71db85dSMatthew Dillon * the bref by the vop_write code. Do not re-dirty. 75032b800e6SMatthew Dillon * 751da6f36f4SMatthew Dillon * Make sure any device buffer(s) have been flushed 752da6f36f4SMatthew Dillon * out here (there aren't usually any to flush) XXX. 
75332b800e6SMatthew Dillon */ 75432b800e6SMatthew Dillon break; 755512beabdSMatthew Dillon case HAMMER2_BREF_TYPE_INDIRECT: 7561a7cfe5aSMatthew Dillon case HAMMER2_BREF_TYPE_FREEMAP_NODE: 75791caa51cSMatthew Dillon case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 758da6f36f4SMatthew Dillon /* 759da6f36f4SMatthew Dillon * Buffer I/O will be cleaned up when the volume is 760da6f36f4SMatthew Dillon * flushed (but the kernel is free to flush it before 761da6f36f4SMatthew Dillon * then, as well). 762da6f36f4SMatthew Dillon */ 76350456506SMatthew Dillon KKASSERT((chain->flags & HAMMER2_CHAIN_EMBEDDED) == 0); 764a71db85dSMatthew Dillon hammer2_chain_setcheck(chain, chain->data); 76550456506SMatthew Dillon break; 76691caa51cSMatthew Dillon case HAMMER2_BREF_TYPE_INODE: 767a71db85dSMatthew Dillon /* 768a71db85dSMatthew Dillon * NOTE: We must call io_setdirty() to make any late 769a71db85dSMatthew Dillon * changes to the inode data, the system might 770a71db85dSMatthew Dillon * have already flushed the buffer. 771a71db85dSMatthew Dillon */ 772da6f36f4SMatthew Dillon if (chain->data->ipdata.op_flags & 773da6f36f4SMatthew Dillon HAMMER2_OPFLAG_PFSROOT) { 774837bd39bSMatthew Dillon /* 775da6f36f4SMatthew Dillon * non-NULL pmp if mounted as a PFS. We must 77618e8ab5fSMatthew Dillon * sync fields cached in the pmp? XXX 777837bd39bSMatthew Dillon */ 778837bd39bSMatthew Dillon hammer2_inode_data_t *ipdata; 779837bd39bSMatthew Dillon 780a71db85dSMatthew Dillon hammer2_io_setdirty(chain->dio); 781837bd39bSMatthew Dillon ipdata = &chain->data->ipdata; 782e513e77eSMatthew Dillon if (chain->pmp) { 783e513e77eSMatthew Dillon ipdata->pfs_inum = 784e513e77eSMatthew Dillon chain->pmp->inode_tid; 785e513e77eSMatthew Dillon } 78650456506SMatthew Dillon } else { 78750456506SMatthew Dillon /* can't be mounted as a PFS */ 78850456506SMatthew Dillon } 789b3659de2SMatthew Dillon 790b3659de2SMatthew Dillon /* 791b3659de2SMatthew Dillon * Update inode statistics. 
Pending stats in chain 792b3659de2SMatthew Dillon * are cleared out on UPDATE so expect that bit to 793b3659de2SMatthew Dillon * be set here too or the statistics will not be 794b3659de2SMatthew Dillon * rolled-up properly. 7950cc33e20SMatthew Dillon * 7960cc33e20SMatthew Dillon * (note: rollup data does not effect modify_tid 7970cc33e20SMatthew Dillon * based synchronization checks and can be 7980cc33e20SMatthew Dillon * different). 799b3659de2SMatthew Dillon */ 800a71db85dSMatthew Dillon if (chain->data_count || chain->inode_count) { 801b3659de2SMatthew Dillon hammer2_inode_data_t *ipdata; 802b3659de2SMatthew Dillon 803b3659de2SMatthew Dillon KKASSERT(chain->flags & HAMMER2_CHAIN_UPDATE); 804a71db85dSMatthew Dillon hammer2_io_setdirty(chain->dio); 805b3659de2SMatthew Dillon ipdata = &chain->data->ipdata; 806b3659de2SMatthew Dillon ipdata->data_count += chain->data_count; 807b3659de2SMatthew Dillon ipdata->inode_count += chain->inode_count; 808b3659de2SMatthew Dillon } 809512beabdSMatthew Dillon KKASSERT((chain->flags & HAMMER2_CHAIN_EMBEDDED) == 0); 810a71db85dSMatthew Dillon hammer2_chain_setcheck(chain, chain->data); 8111a7cfe5aSMatthew Dillon break; 81232b800e6SMatthew Dillon default: 81391caa51cSMatthew Dillon KKASSERT(chain->flags & HAMMER2_CHAIN_EMBEDDED); 814da6f36f4SMatthew Dillon panic("hammer2_flush_core: unsupported " 815da6f36f4SMatthew Dillon "embedded bref %d", 81691caa51cSMatthew Dillon chain->bref.type); 81791caa51cSMatthew Dillon /* NOT REACHED */ 81832b800e6SMatthew Dillon } 81932b800e6SMatthew Dillon 82032b800e6SMatthew Dillon /* 821da6f36f4SMatthew Dillon * If the chain was destroyed try to avoid unnecessary I/O. 822da6f36f4SMatthew Dillon * (this only really works if the DIO system buffer is the 823da6f36f4SMatthew Dillon * same size as chain->bytes). 
824da6f36f4SMatthew Dillon */ 82505dd26e4SMatthew Dillon if ((chain->flags & HAMMER2_CHAIN_DESTROY) && chain->dio) { 826da6f36f4SMatthew Dillon hammer2_io_setinval(chain->dio, chain->bytes); 827da6f36f4SMatthew Dillon } 828da6f36f4SMatthew Dillon } 829da6f36f4SMatthew Dillon 830da6f36f4SMatthew Dillon /* 831da6f36f4SMatthew Dillon * If UPDATE is set the parent block table may need to be updated. 832da6f36f4SMatthew Dillon * 833da6f36f4SMatthew Dillon * NOTE: UPDATE may be set on vchain or fchain in which case 834da6f36f4SMatthew Dillon * parent could be NULL. It's easiest to allow the case 835da6f36f4SMatthew Dillon * and test for NULL. parent can also wind up being NULL 836da6f36f4SMatthew Dillon * due to a deletion so we need to handle the case anyway. 837da6f36f4SMatthew Dillon * 838da6f36f4SMatthew Dillon * If no parent exists we can just clear the UPDATE bit. If the 839da6f36f4SMatthew Dillon * chain gets reattached later on the bit will simply get set 840da6f36f4SMatthew Dillon * again. 841da6f36f4SMatthew Dillon */ 842da6f36f4SMatthew Dillon if ((chain->flags & HAMMER2_CHAIN_UPDATE) && parent == NULL) { 843da6f36f4SMatthew Dillon atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE); 844da6f36f4SMatthew Dillon hammer2_chain_drop(chain); 845da6f36f4SMatthew Dillon } 846da6f36f4SMatthew Dillon 847da6f36f4SMatthew Dillon /* 848da6f36f4SMatthew Dillon * The chain may need its blockrefs updated in the parent. This 849da6f36f4SMatthew Dillon * requires some fancy footwork. 850da6f36f4SMatthew Dillon */ 851da6f36f4SMatthew Dillon if (chain->flags & HAMMER2_CHAIN_UPDATE) { 852da6f36f4SMatthew Dillon hammer2_blockref_t *base; 853da6f36f4SMatthew Dillon int count; 854da6f36f4SMatthew Dillon 855da6f36f4SMatthew Dillon /* 856da6f36f4SMatthew Dillon * Both parent and chain must be locked. This requires 857da6f36f4SMatthew Dillon * temporarily unlocking the chain. 
We have to deal with 858da6f36f4SMatthew Dillon * the case where the chain might be reparented or modified 859da6f36f4SMatthew Dillon * while it was unlocked. 860da6f36f4SMatthew Dillon */ 861da6f36f4SMatthew Dillon hammer2_chain_unlock(chain); 862da6f36f4SMatthew Dillon hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS); 863da6f36f4SMatthew Dillon hammer2_chain_lock(chain, HAMMER2_RESOLVE_MAYBE); 864da6f36f4SMatthew Dillon if (chain->parent != parent) { 865da6f36f4SMatthew Dillon kprintf("PARENT MISMATCH ch=%p p=%p/%p\n", chain, chain->parent, parent); 866da6f36f4SMatthew Dillon hammer2_chain_unlock(parent); 867da6f36f4SMatthew Dillon goto done; 868da6f36f4SMatthew Dillon } 869da6f36f4SMatthew Dillon 870da6f36f4SMatthew Dillon /* 871da6f36f4SMatthew Dillon * Check race condition. If someone got in and modified 872da6f36f4SMatthew Dillon * it again while it was unlocked, we have to loop up. 873da6f36f4SMatthew Dillon */ 874da6f36f4SMatthew Dillon if (chain->flags & HAMMER2_CHAIN_MODIFIED) { 875da6f36f4SMatthew Dillon hammer2_chain_unlock(parent); 876da6f36f4SMatthew Dillon kprintf("hammer2_flush: chain %p flush-mod race\n", 877da6f36f4SMatthew Dillon chain); 878da6f36f4SMatthew Dillon goto again; 879da6f36f4SMatthew Dillon } 880da6f36f4SMatthew Dillon 881da6f36f4SMatthew Dillon /* 882da6f36f4SMatthew Dillon * Clear UPDATE flag 883da6f36f4SMatthew Dillon */ 884da6f36f4SMatthew Dillon if (chain->flags & HAMMER2_CHAIN_UPDATE) { 885da6f36f4SMatthew Dillon atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE); 886da6f36f4SMatthew Dillon hammer2_chain_drop(chain); 887da6f36f4SMatthew Dillon } 888da6f36f4SMatthew Dillon hammer2_chain_modify(info->trans, parent, 0); 889da6f36f4SMatthew Dillon 890da6f36f4SMatthew Dillon /* 891da6f36f4SMatthew Dillon * Calculate blockmap pointer 892da6f36f4SMatthew Dillon */ 893da6f36f4SMatthew Dillon switch(parent->bref.type) { 894da6f36f4SMatthew Dillon case HAMMER2_BREF_TYPE_INODE: 895da6f36f4SMatthew Dillon /* 896da6f36f4SMatthew Dillon 
* Access the inode's block array. However, there is 897da6f36f4SMatthew Dillon * no block array if the inode is flagged DIRECTDATA. 898da6f36f4SMatthew Dillon */ 899da6f36f4SMatthew Dillon if (parent->data && 900da6f36f4SMatthew Dillon (parent->data->ipdata.op_flags & 901da6f36f4SMatthew Dillon HAMMER2_OPFLAG_DIRECTDATA) == 0) { 902da6f36f4SMatthew Dillon base = &parent->data-> 903da6f36f4SMatthew Dillon ipdata.u.blockset.blockref[0]; 904da6f36f4SMatthew Dillon } else { 905da6f36f4SMatthew Dillon base = NULL; 906da6f36f4SMatthew Dillon } 907da6f36f4SMatthew Dillon count = HAMMER2_SET_COUNT; 908da6f36f4SMatthew Dillon break; 909da6f36f4SMatthew Dillon case HAMMER2_BREF_TYPE_INDIRECT: 910da6f36f4SMatthew Dillon case HAMMER2_BREF_TYPE_FREEMAP_NODE: 911da6f36f4SMatthew Dillon if (parent->data) 912da6f36f4SMatthew Dillon base = &parent->data->npdata[0]; 913da6f36f4SMatthew Dillon else 914da6f36f4SMatthew Dillon base = NULL; 915da6f36f4SMatthew Dillon count = parent->bytes / sizeof(hammer2_blockref_t); 916da6f36f4SMatthew Dillon break; 917da6f36f4SMatthew Dillon case HAMMER2_BREF_TYPE_VOLUME: 918da6f36f4SMatthew Dillon base = &chain->hmp->voldata.sroot_blockset.blockref[0]; 919da6f36f4SMatthew Dillon count = HAMMER2_SET_COUNT; 920da6f36f4SMatthew Dillon break; 921da6f36f4SMatthew Dillon case HAMMER2_BREF_TYPE_FREEMAP: 922da6f36f4SMatthew Dillon base = &parent->data->npdata[0]; 923da6f36f4SMatthew Dillon count = HAMMER2_SET_COUNT; 924da6f36f4SMatthew Dillon break; 925da6f36f4SMatthew Dillon default: 926da6f36f4SMatthew Dillon base = NULL; 927da6f36f4SMatthew Dillon count = 0; 928da6f36f4SMatthew Dillon panic("hammer2_flush_core: " 929da6f36f4SMatthew Dillon "unrecognized blockref type: %d", 930da6f36f4SMatthew Dillon parent->bref.type); 931da6f36f4SMatthew Dillon } 932da6f36f4SMatthew Dillon 933da6f36f4SMatthew Dillon /* 934da6f36f4SMatthew Dillon * Blocktable updates 935b3659de2SMatthew Dillon * 936b3659de2SMatthew Dillon * We synchronize pending statistics at this time. 
Delta 937b3659de2SMatthew Dillon * adjustments designated for the current and upper level 938b3659de2SMatthew Dillon * are synchronized. 939da6f36f4SMatthew Dillon */ 940da6f36f4SMatthew Dillon if (base && (chain->flags & HAMMER2_CHAIN_BMAPUPD)) { 941da6f36f4SMatthew Dillon if (chain->flags & HAMMER2_CHAIN_BMAPPED) { 9420cc33e20SMatthew Dillon hammer2_spin_ex(&parent->core.spin); 943da6f36f4SMatthew Dillon hammer2_base_delete(info->trans, parent, 944da6f36f4SMatthew Dillon base, count, 945da6f36f4SMatthew Dillon &info->cache_index, chain); 9460cc33e20SMatthew Dillon hammer2_spin_unex(&parent->core.spin); 947b3659de2SMatthew Dillon /* base_delete clears both bits */ 948b3659de2SMatthew Dillon } else { 949b3659de2SMatthew Dillon atomic_clear_int(&chain->flags, 950b3659de2SMatthew Dillon HAMMER2_CHAIN_BMAPUPD); 951da6f36f4SMatthew Dillon } 952da6f36f4SMatthew Dillon } 953da6f36f4SMatthew Dillon if (base && (chain->flags & HAMMER2_CHAIN_BMAPPED) == 0) { 954b3659de2SMatthew Dillon parent->data_count += chain->data_count + 955b3659de2SMatthew Dillon chain->data_count_up; 956b3659de2SMatthew Dillon parent->inode_count += chain->inode_count + 957b3659de2SMatthew Dillon chain->inode_count_up; 958b3659de2SMatthew Dillon chain->data_count = 0; 959b3659de2SMatthew Dillon chain->inode_count = 0; 960b3659de2SMatthew Dillon chain->data_count_up = 0; 961b3659de2SMatthew Dillon chain->inode_count_up = 0; 9620cc33e20SMatthew Dillon hammer2_spin_ex(&parent->core.spin); 963da6f36f4SMatthew Dillon hammer2_base_insert(info->trans, parent, 964da6f36f4SMatthew Dillon base, count, 965da6f36f4SMatthew Dillon &info->cache_index, chain); 9660cc33e20SMatthew Dillon hammer2_spin_unex(&parent->core.spin); 967b3659de2SMatthew Dillon /* base_insert sets BMAPPED */ 968da6f36f4SMatthew Dillon } 969da6f36f4SMatthew Dillon hammer2_chain_unlock(parent); 970da6f36f4SMatthew Dillon } 971da6f36f4SMatthew Dillon 972da6f36f4SMatthew Dillon /* 9738138a154SMatthew Dillon * Final cleanup after flush 
9748138a154SMatthew Dillon */ 9758138a154SMatthew Dillon done: 976e513e77eSMatthew Dillon KKASSERT(chain->refs > 0); 977850687d2SMatthew Dillon if (hammer2_debug & 0x200) { 978850687d2SMatthew Dillon if (info->debug == chain) 979850687d2SMatthew Dillon info->debug = NULL; 980850687d2SMatthew Dillon } 9818138a154SMatthew Dillon } 9828138a154SMatthew Dillon 9838138a154SMatthew Dillon /* 984da6f36f4SMatthew Dillon * Flush recursion helper, called from flush_core, calls flush_core. 9850dea3156SMatthew Dillon * 9868138a154SMatthew Dillon * Flushes the children of the caller's chain (info->parent), restricted 9878138a154SMatthew Dillon * by sync_tid. Set info->domodify if the child's blockref must propagate 9888138a154SMatthew Dillon * back up to the parent. 9890dea3156SMatthew Dillon * 9908138a154SMatthew Dillon * Ripouts can move child from rbtree to dbtree or dbq but the caller's 9918138a154SMatthew Dillon * flush scan order prevents any chains from being lost. A child can be 992da6f36f4SMatthew Dillon * executes more than once. 993ea155208SMatthew Dillon * 9948138a154SMatthew Dillon * WARNING! If we do not call hammer2_flush_core() we must update 9958138a154SMatthew Dillon * bref.mirror_tid ourselves to indicate that the flush has 9968138a154SMatthew Dillon * processed the child. 997925e4ad1SMatthew Dillon * 9988138a154SMatthew Dillon * WARNING! parent->core spinlock is held on entry and return. 99932b800e6SMatthew Dillon */ 10000dea3156SMatthew Dillon static int 1001da6f36f4SMatthew Dillon hammer2_flush_recurse(hammer2_chain_t *child, void *data) 100232b800e6SMatthew Dillon { 10030dea3156SMatthew Dillon hammer2_flush_info_t *info = data; 1004da6f36f4SMatthew Dillon /*hammer2_trans_t *trans = info->trans;*/ 10050dea3156SMatthew Dillon hammer2_chain_t *parent = info->parent; 1006925e4ad1SMatthew Dillon 10070dea3156SMatthew Dillon /* 100810136ab6SMatthew Dillon * (child can never be fchain or vchain so a special check isn't 100910136ab6SMatthew Dillon * needed). 
1010da6f36f4SMatthew Dillon * 1011a4dc31e0SMatthew Dillon * We must ref the child before unlocking the spinlock. 1012a4dc31e0SMatthew Dillon * 1013a4dc31e0SMatthew Dillon * The caller has added a ref to the parent so we can temporarily 1014a4dc31e0SMatthew Dillon * unlock it in order to lock the child. 1015a4dc31e0SMatthew Dillon */ 1016ea155208SMatthew Dillon hammer2_chain_ref(child); 101794491fa0SMatthew Dillon hammer2_spin_unex(&parent->core.spin); 10180dea3156SMatthew Dillon 10190dea3156SMatthew Dillon hammer2_chain_unlock(parent); 10200dea3156SMatthew Dillon hammer2_chain_lock(child, HAMMER2_RESOLVE_MAYBE); 10210dea3156SMatthew Dillon 102203faa7d5SMatthew Dillon /* 1023e513e77eSMatthew Dillon * Recurse and collect deferral data. We're in the media flush, 1024e513e77eSMatthew Dillon * this can cross PFS boundaries. 102503faa7d5SMatthew Dillon */ 1026da6f36f4SMatthew Dillon if (child->flags & HAMMER2_CHAIN_FLUSH_MASK) { 10270dea3156SMatthew Dillon ++info->depth; 1028*9450e866SMatthew Dillon hammer2_flush_core(info, child, 0); 10290dea3156SMatthew Dillon --info->depth; 1030850687d2SMatthew Dillon } else if (hammer2_debug & 0x200) { 1031850687d2SMatthew Dillon if (info->debug == NULL) 1032850687d2SMatthew Dillon info->debug = child; 1033850687d2SMatthew Dillon ++info->depth; 1034*9450e866SMatthew Dillon hammer2_flush_core(info, child, 0); 1035850687d2SMatthew Dillon --info->depth; 1036850687d2SMatthew Dillon if (info->debug == child) 1037850687d2SMatthew Dillon info->debug = NULL; 10388138a154SMatthew Dillon } 10390dea3156SMatthew Dillon 1040a4dc31e0SMatthew Dillon /* 1041a4dc31e0SMatthew Dillon * Relock to continue the loop 1042a4dc31e0SMatthew Dillon */ 1043a4dc31e0SMatthew Dillon hammer2_chain_unlock(child); 1044ea155208SMatthew Dillon hammer2_chain_lock(parent, HAMMER2_RESOLVE_MAYBE); 1045a4dc31e0SMatthew Dillon hammer2_chain_drop(child); 1046a4dc31e0SMatthew Dillon KKASSERT(info->parent == parent); 104794491fa0SMatthew Dillon 
hammer2_spin_ex(&parent->core.spin); 10480dea3156SMatthew Dillon 10490dea3156SMatthew Dillon return (0); 10500dea3156SMatthew Dillon } 1051