xref: /dragonfly/sys/vfs/hammer/hammer_rebalance.c (revision 6e875644)
11775b6a0SMatthew Dillon /*
21775b6a0SMatthew Dillon  * Copyright (c) 2009 The DragonFly Project.  All rights reserved.
31775b6a0SMatthew Dillon  *
41775b6a0SMatthew Dillon  * This code is derived from software contributed to The DragonFly Project
51775b6a0SMatthew Dillon  * by Matthew Dillon <dillon@backplane.com>
61775b6a0SMatthew Dillon  *
71775b6a0SMatthew Dillon  * Redistribution and use in source and binary forms, with or without
81775b6a0SMatthew Dillon  * modification, are permitted provided that the following conditions
91775b6a0SMatthew Dillon  * are met:
101775b6a0SMatthew Dillon  *
111775b6a0SMatthew Dillon  * 1. Redistributions of source code must retain the above copyright
121775b6a0SMatthew Dillon  *    notice, this list of conditions and the following disclaimer.
131775b6a0SMatthew Dillon  * 2. Redistributions in binary form must reproduce the above copyright
141775b6a0SMatthew Dillon  *    notice, this list of conditions and the following disclaimer in
151775b6a0SMatthew Dillon  *    the documentation and/or other materials provided with the
161775b6a0SMatthew Dillon  *    distribution.
171775b6a0SMatthew Dillon  * 3. Neither the name of The DragonFly Project nor the names of its
181775b6a0SMatthew Dillon  *    contributors may be used to endorse or promote products derived
191775b6a0SMatthew Dillon  *    from this software without specific, prior written permission.
201775b6a0SMatthew Dillon  *
211775b6a0SMatthew Dillon  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
221775b6a0SMatthew Dillon  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
231775b6a0SMatthew Dillon  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
241775b6a0SMatthew Dillon  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
251775b6a0SMatthew Dillon  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
261775b6a0SMatthew Dillon  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
271775b6a0SMatthew Dillon  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
281775b6a0SMatthew Dillon  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
291775b6a0SMatthew Dillon  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
301775b6a0SMatthew Dillon  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
311775b6a0SMatthew Dillon  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
321775b6a0SMatthew Dillon  * SUCH DAMAGE.
331775b6a0SMatthew Dillon  */
341775b6a0SMatthew Dillon 
351775b6a0SMatthew Dillon #include "hammer.h"
361775b6a0SMatthew Dillon 
/*
 * Internal helpers for the rebalance ioctl, defined below.
 */
static int rebalance_node(struct hammer_ioc_rebalance *rebal,
			hammer_cursor_t cursor, hammer_node_lock_t lcache);
static void rebalance_closeout(hammer_node_lock_t base_item, int base_count,
			hammer_btree_elm_t elm);
static void rebalance_parent_ptrs(hammer_node_lock_t base_item, int index,
			hammer_node_lock_t item, hammer_node_lock_t chld_item);
431775b6a0SMatthew Dillon 
441775b6a0SMatthew Dillon /*
451775b6a0SMatthew Dillon  * Iterate through the specified range of object ids and rebalance B-Tree
461775b6a0SMatthew Dillon  * leaf and internal nodes we encounter.  A forwards iteration is used.
471775b6a0SMatthew Dillon  *
481775b6a0SMatthew Dillon  * All leafs are at the same depth.  We use the b-tree scan code loosely
491775b6a0SMatthew Dillon  * to position ourselves and create degenerate cases to skip indices
501775b6a0SMatthew Dillon  * that we have rebalanced in bulk.
511775b6a0SMatthew Dillon  */
521775b6a0SMatthew Dillon 
/*
 * Backend for the HAMMER rebalance ioctl.
 *
 * Validates the requested key range, then scans the B-Tree forwards and
 * rebalances each internal node on the trailing (upward) visit via
 * rebalance_node().  Progress is recorded in rebal->key_cur so an
 * interrupted or retried operation resumes where it left off.
 *
 * Returns 0 on success or a kernel errno (EINVAL on a bad key range,
 * EROFS if the filesystem goes read-only mid-scan, etc).  EDEADLK from
 * the scan is handled internally by retrying; EINTR is converted into
 * the HAMMER_IOC_HEAD_INTR flag.
 */
int
hammer_ioc_rebalance(hammer_transaction_t trans, hammer_inode_t ip,
		 struct hammer_ioc_rebalance *rebal)
{
	struct hammer_cursor cursor;
	struct hammer_node_lock lcache;
	hammer_btree_leaf_elm_t elm;
	int error;
	int seq;
	uint32_t key_end_localization;

	/*
	 * Sanity-check the key range supplied by userland.  Pseudo-fs
	 * localization bits must not be set by the caller.
	 */
	if ((rebal->key_beg.localization | rebal->key_end.localization) &
	    HAMMER_LOCALIZE_PSEUDOFS_MASK) {
		return(EINVAL);
	}
	if (rebal->key_beg.localization > rebal->key_end.localization)
		return(EINVAL);
	if (rebal->key_beg.localization == rebal->key_end.localization) {
		if (rebal->key_beg.obj_id > rebal->key_end.obj_id)
			return(EINVAL);
		/* key-space limitations - no check needed */
	}

	/*
	 * Clamp the requested per-node saturation into
	 * [HAMMER_BTREE_INT_ELMS/2, HAMMER_BTREE_INT_ELMS].
	 */
	if (rebal->saturation < HAMMER_BTREE_INT_ELMS / 2)
		rebal->saturation = HAMMER_BTREE_INT_ELMS / 2;
	if (rebal->saturation > HAMMER_BTREE_INT_ELMS)
		rebal->saturation = HAMMER_BTREE_INT_ELMS;

	/*
	 * Ioctl caller has only set localization type to rebalance.
	 * Initialize cursor key localization with ip localization.
	 */
	rebal->key_cur = rebal->key_beg;
	rebal->key_cur.localization &= HAMMER_LOCALIZE_MASK;
	if (rebal->allpfs == 0)
		rebal->key_cur.localization |= ip->obj_localization;

	/*
	 * End key: restrict to this inode's PFS unless allpfs was
	 * requested, in which case scan out to the maximum PFS id.
	 */
	key_end_localization = rebal->key_end.localization;
	key_end_localization &= HAMMER_LOCALIZE_MASK;
	if (rebal->allpfs == 0)
		key_end_localization |= ip->obj_localization;
	else
		key_end_localization |= pfs_to_lo(HAMMER_MAX_PFSID);

	/* Pre-allocated node-lock cache, released at 'failed'. */
	hammer_btree_lcache_init(trans->hmp, &lcache, 2);

	seq = trans->hmp->flusher.done;

	/*
	 * Scan forwards.  Retries typically occur if a deadlock is detected.
	 */
retry:
	error = hammer_init_cursor(trans, &cursor, NULL, NULL);
	if (error) {
		hammer_done_cursor(&cursor);
		goto failed;
	}
	cursor.key_beg = rebal->key_cur;
	cursor.key_end = rebal->key_end;
	cursor.key_end.localization = key_end_localization;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
	cursor.flags |= HAMMER_CURSOR_BACKEND;

	/*
	 * Cause internal nodes to be returned on the way up.  Internal nodes
	 * are not returned on the way down so we can create a degenerate
	 * case to handle internal nodes as a trailing function of their
	 * sub-trees.
	 *
	 * Note that by not setting INSERTING or PRUNING no boundary
	 * corrections will be made and a sync lock is not needed for the
	 * B-Tree scan itself.
	 */
	cursor.flags |= HAMMER_CURSOR_REBLOCKING;

	error = hammer_btree_first(&cursor);

	while (error == 0) {
		/*
		 * Rebalancing can be hard on the memory allocator, make
		 * sure there is enough free memory before doing it.
		 * The cursor must be unlocked while blocking on the VM
		 * system.
		 */
		if (vm_test_nominal()) {
			hammer_unlock_cursor(&cursor);
			vm_wait_nominal();
			hammer_lock_cursor(&cursor);
		}

		/*
		 * Filesystem went read-only during rebalancing
		 */
		if (trans->hmp->ronly) {
			error = EROFS;
			break;
		}

		/*
		 * We only care about internal nodes visited for the last
		 * time on the way up... that is, a trailing scan of the
		 * internal node after all of its children have been recursed
		 * through.
		 */
		if (cursor.node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) {
			/*
			 * Leave cursor.index alone, we want to recurse
			 * through all children of the internal node before
			 * visiting it.
			 *
			 * Process the internal node on the way up after
			 * the last child's sub-tree has been balanced.
			 */
			if (cursor.index == cursor.node->ondisk->count - 1) {
				hammer_sync_lock_sh(trans);
				error = rebalance_node(rebal, &cursor, &lcache);
				hammer_sync_unlock(trans);
			}
		} else {
			/*
			 * We don't need to iterate through all the leaf
			 * elements, we only care about the parent (internal)
			 * node.
			 */
			cursor.index = cursor.node->ondisk->count - 1;
		}
		if (error)
			break;

		/*
		 * Update returned scan position and do a flush if
		 * necessary.
		 *
		 * WARNING: We extract the base using the leaf element
		 *	    type but this could be an internal node.  The
		 *	    base is the same either way.
		 *
		 *	    However, due to the rebalancing operation the
		 *	    cursor position may have exceeded the right-hand
		 *	    boundary.
		 *
		 * WARNING: See warnings in hammer_unlock_cursor()
		 *	    function.
		 */
		elm = &cursor.node->ondisk->elms[cursor.index].leaf;
		rebal->key_cur = elm->base;
		++rebal->stat_ncount;

		/*
		 * Throttle against the flusher: block while meta-data or
		 * UNDO space is overcommitted, kicking off flush cycles.
		 */
		while (hammer_flusher_meta_halflimit(trans->hmp) ||
		       hammer_flusher_undo_exhausted(trans, 2)) {
			hammer_unlock_cursor(&cursor);
			hammer_flusher_wait(trans->hmp, seq);
			hammer_lock_cursor(&cursor);
			seq = hammer_flusher_async_one(trans->hmp);
		}

		/*
		 * Before iterating check if the rebalance operation caused
		 * the cursor to index past the right-hand boundary and make
		 * sure to stop if it does.  Otherwise the iteration may
		 * panic e.g. due to the key maxing out its fields and no
		 * longer being within the strict bounds of the root node.
		 */
		if (hammer_btree_cmp(&rebal->key_cur, &cursor.key_end) > 0) {
			rebal->key_cur = cursor.key_end;
			break;
		}

		/*
		 * Iterate, stop if a signal was received.
		 */
		if ((error = hammer_signal_check(trans->hmp)) != 0)
			break;
		error = hammer_btree_iterate(&cursor);
	}
	/* ENOENT simply means the iteration ran off the end of the range. */
	if (error == ENOENT)
		error = 0;
	hammer_done_cursor(&cursor);
	if (error == EDEADLK) {
		++rebal->stat_collisions;
		goto retry;
	}
	if (error == EINTR) {
		rebal->head.flags |= HAMMER_IOC_HEAD_INTR;
		error = 0;
	}
failed:
	/* Strip the PFS bits before handing key_cur back to userland. */
	rebal->key_cur.localization &= HAMMER_LOCALIZE_MASK;
	hammer_btree_lcache_free(trans->hmp, &lcache);
	return(error);
}
2411775b6a0SMatthew Dillon 
2421775b6a0SMatthew Dillon /*
2431775b6a0SMatthew Dillon  * Rebalance an internal node, called via a trailing upward recursion.
2441775b6a0SMatthew Dillon  * All the children have already been individually rebalanced.
2451775b6a0SMatthew Dillon  *
2461775b6a0SMatthew Dillon  * To rebalance we scan the elements in the children and pack them,
2471775b6a0SMatthew Dillon  * so we actually need to lock the children and the children's children.
2481775b6a0SMatthew Dillon  *
2491775b6a0SMatthew Dillon  *	INTERNAL_NODE
2501775b6a0SMatthew Dillon  *	/ / | | | \ \
2511775b6a0SMatthew Dillon  *     C C  C C C  C C	children (first level) (internal or leaf nodes)
2521775b6a0SMatthew Dillon  *			children's elements (second level)
2531775b6a0SMatthew Dillon  *
2541775b6a0SMatthew Dillon  *	<<<----------   pack children's elements, possibly remove excess
2551775b6a0SMatthew Dillon  *			children after packing.
2561775b6a0SMatthew Dillon  *
2571775b6a0SMatthew Dillon  * NOTE: The mirror_tids, parent pointers, and child pointers must be updated.
2581775b6a0SMatthew Dillon  *       Any live tracked B-Tree nodes must be updated (we worm out of that
2591775b6a0SMatthew Dillon  *       by not allowing any).  And boundary elements must be preserved.
2601775b6a0SMatthew Dillon  *
2611775b6a0SMatthew Dillon  * NOTE: If the children are leaf nodes we may have a degenerate case
2621775b6a0SMatthew Dillon  *       case where there are no elements in the leafs.
2631775b6a0SMatthew Dillon  *
2641775b6a0SMatthew Dillon  * XXX live-tracked
2651775b6a0SMatthew Dillon  */
/*
 * Rebalance one internal node (cursor->node), called on the trailing
 * upward visit after all of its children have been individually
 * rebalanced.  Packs the grandchild elements leftward across the child
 * nodes so each child holds roughly rebal->saturation elements, then
 * frees any child nodes emptied by the packing.
 *
 * All modifications are staged in the hammer_node_lock "copy" images
 * and committed at the end via hammer_btree_sync_copy().
 *
 * Returns 0 on success or an errno from cursor upgrade / child locking
 * (e.g. EDEADLK, which the caller handles by retrying).
 */
static int
rebalance_node(struct hammer_ioc_rebalance *rebal, hammer_cursor_t cursor,
	       hammer_node_lock_t lcache)
{
	struct hammer_node_lock lockroot;
	hammer_node_lock_t base_item;	/* child currently being filled */
	hammer_node_lock_t chld_item;	/* grandchild paired with elm i */
	hammer_node_lock_t item;	/* child currently being drained */
	hammer_btree_elm_t elm;
	hammer_node_t node;
	hammer_tid_t tid;
	uint8_t type1 __debugvar;
	int base_count;			/* elements packed into base_item */
	int root_count;			/* children surviving in the parent */
	int avg_elms;			/* target elements per child */
	int count;			/* total grandchild elements */
	int error;
	int i;
	int n;

	/*
	 * Lock the parent node via the cursor, collect and lock our
	 * children and children's children.
	 *
	 * By the way, this is a LOT of locks.
	 */
	hammer_node_lock_init(&lockroot, cursor->node);
	error = hammer_cursor_upgrade(cursor);
	if (error)
		goto done;
	error = hammer_btree_lock_children(cursor, 2, &lockroot, lcache);
	if (error)
		goto done;

	/*
	 * Make a copy of all the locked on-disk data to simplify the element
	 * shifting we are going to have to do.  We will modify the copy
	 * first.
	 */
	hammer_btree_lock_copy(cursor, &lockroot);

	/*
	 * Look at the first child node.
	 */
	if (TAILQ_FIRST(&lockroot.list) == NULL)
		goto done;
	type1 = TAILQ_FIRST(&lockroot.list)->node->ondisk->type;

	/*
	 * Figure out the total number of children's children and
	 * calculate the average number of elements per child.
	 *
	 * The minimum avg_elms is 1 when count > 0.  avg_elms * root_elms
	 * is always greater or equal to count.
	 *
	 * If count == 0 we hit a degenerate case which will cause
	 * avg_elms to also calculate as 0.
	 */
	if (hammer_debug_general & 0x1000)
		hdkprintf("lockroot %p count %d\n", &lockroot, lockroot.count);
	count = 0;
	TAILQ_FOREACH(item, &lockroot.list, entry) {
		if (hammer_debug_general & 0x1000)
			hdkprintf("add count %d\n", item->count);
		count += item->count;
		/* all children must be the same type (all leaf or all internal) */
		KKASSERT(item->node->ondisk->type == type1);
	}
	avg_elms = howmany(count, lockroot.count);
	KKASSERT(avg_elms >= 0);

	/*
	 * If the average number of elements per child is too low then
	 * calculate the desired number of children (n) such that the
	 * average number of elements is reasonable.
	 *
	 * If the desired number of children is 1 then avg_elms will
	 * wind up being count, which may still be smaller then saturation
	 * but that is ok.
	 */
	if (count && avg_elms < rebal->saturation) {
		n = howmany(count, rebal->saturation);
		avg_elms = howmany(count, n);
	}

	/*
	 * Pack the elements in the children.  Elements for each item is
	 * packed into base_item until avg_elms is reached, then base_item
	 * iterates.
	 *
	 * hammer_cursor_moved_element() is called for each element moved
	 * to update tracked cursors, including the index beyond the last
	 * element (at count).
	 *
	 * Any cursors tracking the internal node itself must also be
	 * updated, potentially repointing them at a leaf and clearing
	 * ATEDISK.
	 */
	base_item = TAILQ_FIRST(&lockroot.list);
	base_count = 0;
	root_count = 0;

	TAILQ_FOREACH(item, &lockroot.list, entry) {
		node = item->node;
		KKASSERT(item->count == node->ondisk->count);
		chld_item = TAILQ_FIRST(&item->list);
		for (i = 0; i < item->count; ++i) {
			/*
			 * Closeout.  If the next element is at index 0
			 * just use the existing separator in the parent.
			 */
			if (base_count == avg_elms) {
				if (i == 0) {
					elm = &lockroot.node->ondisk->elms[
						item->index];
				} else {
					elm = &node->ondisk->elms[i];
				}
				rebalance_closeout(base_item, base_count, elm);
				base_item = TAILQ_NEXT(base_item, entry);
				KKASSERT(base_item);
				base_count = 0;
				++root_count;
			}

			/*
			 * Check degenerate no-work case.  Otherwise pack
			 * the element.
			 *
			 * All changes are made to the copy.
			 */
			if (item == base_item && i == base_count) {
				++base_count;
				if (chld_item)
					chld_item = TAILQ_NEXT(chld_item, entry);
				continue;
			}

			/*
			 * Pack element.
			 */
			elm = &base_item->copy->elms[base_count];
			*elm = node->ondisk->elms[i];
			base_item->flags |= HAMMER_NODE_LOCK_UPDATED;

			/*
			 * Adjust the mirror_tid of the target and the
			 * internal element linkage.
			 *
			 * The parent node (lockroot.node) should already
			 * have an aggregate mirror_tid so we do not have
			 * to update that.  However, it is possible for us
			 * to catch a hammer_btree_mirror_propagate() with
			 * its pants down.  Update the parent if necessary.
			 */
			tid = node->ondisk->mirror_tid;

			if (base_item->copy->mirror_tid < tid) {
				base_item->copy->mirror_tid = tid;
				if (lockroot.copy->mirror_tid < tid) {
					lockroot.copy->mirror_tid = tid;
					lockroot.flags |=
						HAMMER_NODE_LOCK_UPDATED;
				}
				if (lockroot.copy->elms[root_count].
				    internal.mirror_tid < tid) {
					lockroot.copy->elms[root_count].
						internal.mirror_tid = tid;
					lockroot.flags |=
						HAMMER_NODE_LOCK_UPDATED;
				}
				base_item->flags |= HAMMER_NODE_LOCK_UPDATED;
			}

			/*
			 * We moved elm.  The parent pointers for any
			 * children of elm must be repointed.
			 */
			if (item != base_item &&
			    node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) {
				KKASSERT(chld_item);
				rebalance_parent_ptrs(base_item, base_count,
						      item, chld_item);
			}
			hammer_cursor_moved_element(item->parent->node,
						    item->index,
						    node, i,
						    base_item->node,
						    base_count);
			++base_count;
			if (chld_item)
				chld_item = TAILQ_NEXT(chld_item, entry);
		}

		/*
		 * Always call at the end (i == number of elements) in
		 * case a cursor is sitting indexed there.
		 */
		hammer_cursor_moved_element(item->parent->node, item->index,
					    node, i,
					    base_item->node, base_count);
	}

	/*
	 * Packing complete, close-out base_item using the right-hand
	 * boundary of the original parent.
	 *
	 * If we will be deleting nodes from the root shift the old
	 * right-hand-boundary to the new ending index.
	 */
	elm = &lockroot.node->ondisk->elms[lockroot.node->ondisk->count];
	rebalance_closeout(base_item, base_count, elm);
	++root_count;
	if (lockroot.copy->count != root_count) {
		lockroot.copy->count = root_count;
		lockroot.copy->elms[root_count] = *elm;
		lockroot.flags |= HAMMER_NODE_LOCK_UPDATED;
	}

	/*
	 * Any extra items beyond base_item are now completely empty and
	 * can be destroyed.  Queue the destruction up in the copy.  Note
	 * that none of the destroyed nodes are part of our cursor.
	 *
	 * The cursor is locked so it isn't on the tracking list.  It
	 * should have been pointing at the boundary element (at root_count).
	 * When deleting elements from the root (which is cursor.node), we
	 * have to update the cursor.index manually to keep it in bounds.
	 */
	while ((base_item = TAILQ_NEXT(base_item, entry)) != NULL) {
		hammer_cursor_removed_node(base_item->node, lockroot.node,
					   base_count);
		hammer_cursor_deleted_element(lockroot.node, base_count);
		base_item->copy->type = HAMMER_BTREE_TYPE_DELETED;
		base_item->copy->count = 0;
		base_item->flags |= HAMMER_NODE_LOCK_UPDATED;
		if (cursor->index > lockroot.copy->count)
			--cursor->index;
		++rebal->stat_deletions;
	}

	/*
	 * All done, sync the locked child tree to disk.  This will also
	 * flush and delete deleted nodes.
	 */
	rebal->stat_nrebal += hammer_btree_sync_copy(cursor, &lockroot);
done:
	hammer_btree_unlock_children(cursor->trans->hmp, &lockroot, lcache);
	hammer_cursor_downgrade(cursor);
	return (error);
}
5161775b6a0SMatthew Dillon 
5171775b6a0SMatthew Dillon /*
5181775b6a0SMatthew Dillon  * Close-out the child base_item.  This node contains base_count
5191775b6a0SMatthew Dillon  * elements.
5201775b6a0SMatthew Dillon  *
5211775b6a0SMatthew Dillon  * If the node is an internal node the right-hand boundary must be
5221775b6a0SMatthew Dillon  * set to elm.
5231775b6a0SMatthew Dillon  */
5241775b6a0SMatthew Dillon static
5251775b6a0SMatthew Dillon void
rebalance_closeout(hammer_node_lock_t base_item,int base_count,hammer_btree_elm_t elm)5261775b6a0SMatthew Dillon rebalance_closeout(hammer_node_lock_t base_item, int base_count,
5271775b6a0SMatthew Dillon 		   hammer_btree_elm_t elm)
5281775b6a0SMatthew Dillon {
5291775b6a0SMatthew Dillon 	hammer_node_lock_t parent;
5301775b6a0SMatthew Dillon 	hammer_btree_elm_t base_elm;
5311775b6a0SMatthew Dillon 	hammer_btree_elm_t rbound_elm;
53246137e17STomohiro Kusumi 	uint8_t save;
5331775b6a0SMatthew Dillon 
5341775b6a0SMatthew Dillon 	/*
5351775b6a0SMatthew Dillon 	 * Update the count.  NOTE:  base_count can be 0 for the
5361775b6a0SMatthew Dillon 	 * degenerate leaf case.
5371775b6a0SMatthew Dillon 	 */
5381775b6a0SMatthew Dillon 	if (hammer_debug_general & 0x1000) {
53935a5249bSTomohiro Kusumi 		hdkprintf("%016jx:", (intmax_t)base_item->node->node_offset);
5401775b6a0SMatthew Dillon 	}
5411775b6a0SMatthew Dillon 	if (base_item->copy->count != base_count) {
5421775b6a0SMatthew Dillon 		base_item->flags |= HAMMER_NODE_LOCK_UPDATED;
5431775b6a0SMatthew Dillon 		base_item->copy->count = base_count;
5441775b6a0SMatthew Dillon 		if (hammer_debug_general & 0x1000)
5451775b6a0SMatthew Dillon 			kprintf(" (count update)");
5461775b6a0SMatthew Dillon 	}
5471775b6a0SMatthew Dillon 
5481775b6a0SMatthew Dillon 	/*
5491775b6a0SMatthew Dillon 	 * If we are closing out an internal node we must assign
5501775b6a0SMatthew Dillon 	 * a right-hand boundary.  Use the element contents as the
5511775b6a0SMatthew Dillon 	 * right-hand boundary.
5521775b6a0SMatthew Dillon 	 *
5531775b6a0SMatthew Dillon 	 * Internal nodes are required to have at least one child,
5541775b6a0SMatthew Dillon 	 * otherwise the left and right boundary would end up being
5551775b6a0SMatthew Dillon 	 * the same element.  Only leaf nodes can be empty.
5567ddc70d1SMatthew Dillon 	 *
5577ddc70d1SMatthew Dillon 	 * Rebalancing may cut-off an internal node such that the
5587ddc70d1SMatthew Dillon 	 * new right hand boundary is the next element anyway, but
5597ddc70d1SMatthew Dillon 	 * we still have to make sure that subtree_offset, btype,
5607ddc70d1SMatthew Dillon 	 * and mirror_tid are all 0.
5611775b6a0SMatthew Dillon 	 */
5621775b6a0SMatthew Dillon 	if (base_item->copy->type == HAMMER_BTREE_TYPE_INTERNAL) {
5631775b6a0SMatthew Dillon 		KKASSERT(base_count != 0);
5641775b6a0SMatthew Dillon 		base_elm = &base_item->copy->elms[base_count];
5651775b6a0SMatthew Dillon 
5667ddc70d1SMatthew Dillon 		if (bcmp(base_elm, elm, sizeof(*elm)) != 0 ||
5677ddc70d1SMatthew Dillon 		    elm->internal.subtree_offset ||
5687ddc70d1SMatthew Dillon 		    elm->internal.mirror_tid ||
5697ddc70d1SMatthew Dillon 		    elm->base.btype) {
5701775b6a0SMatthew Dillon 			*base_elm = *elm;
5711775b6a0SMatthew Dillon 			base_elm->internal.subtree_offset = 0;
5727ddc70d1SMatthew Dillon 			base_elm->internal.mirror_tid = 0;
5731775b6a0SMatthew Dillon 			base_elm->base.btype = 0;
5741775b6a0SMatthew Dillon 			base_item->flags |= HAMMER_NODE_LOCK_UPDATED;
5751775b6a0SMatthew Dillon 			if (hammer_debug_general & 0x1000)
5761775b6a0SMatthew Dillon 				kprintf(" (rhs update)");
5771775b6a0SMatthew Dillon 		} else {
5781775b6a0SMatthew Dillon 			if (hammer_debug_general & 0x1000)
5791775b6a0SMatthew Dillon 				kprintf(" (rhs same)");
5801775b6a0SMatthew Dillon 		}
5811775b6a0SMatthew Dillon 	}
5821775b6a0SMatthew Dillon 
5831775b6a0SMatthew Dillon 	/*
5841775b6a0SMatthew Dillon 	 * The parent's boundary must be updated.  Be careful to retain
5851775b6a0SMatthew Dillon 	 * the btype and non-base internal fields as that information is
5861775b6a0SMatthew Dillon 	 * unrelated.
5871775b6a0SMatthew Dillon 	 */
5881775b6a0SMatthew Dillon 	parent = base_item->parent;
5891775b6a0SMatthew Dillon 	rbound_elm = &parent->copy->elms[base_item->index + 1];
5901775b6a0SMatthew Dillon 	if (bcmp(&rbound_elm->base, &elm->base, sizeof(elm->base)) != 0) {
5911775b6a0SMatthew Dillon 		save = rbound_elm->base.btype;
5921775b6a0SMatthew Dillon 		rbound_elm->base = elm->base;
5931775b6a0SMatthew Dillon 		rbound_elm->base.btype = save;
5941775b6a0SMatthew Dillon 		parent->flags |= HAMMER_NODE_LOCK_UPDATED;
5951775b6a0SMatthew Dillon 		if (hammer_debug_general & 0x1000) {
5961775b6a0SMatthew Dillon 			kprintf(" (parent bound update %d)",
5971775b6a0SMatthew Dillon 				base_item->index + 1);
5981775b6a0SMatthew Dillon 		}
5991775b6a0SMatthew Dillon 	}
6001775b6a0SMatthew Dillon 	if (hammer_debug_general & 0x1000)
6011775b6a0SMatthew Dillon 		kprintf("\n");
6021775b6a0SMatthew Dillon }
6031775b6a0SMatthew Dillon 
6041775b6a0SMatthew Dillon /*
6051775b6a0SMatthew Dillon  * An element in item has moved to base_item.  We must update the parent
6061775b6a0SMatthew Dillon  * pointer of the node the element points to (which is chld_item).
6071775b6a0SMatthew Dillon  */
6081775b6a0SMatthew Dillon static
6091775b6a0SMatthew Dillon void
rebalance_parent_ptrs(hammer_node_lock_t base_item,int index,hammer_node_lock_t item,hammer_node_lock_t chld_item)6101775b6a0SMatthew Dillon rebalance_parent_ptrs(hammer_node_lock_t base_item, int index,
6111775b6a0SMatthew Dillon 		      hammer_node_lock_t item, hammer_node_lock_t chld_item)
6121775b6a0SMatthew Dillon {
6131775b6a0SMatthew Dillon 	KKASSERT(chld_item->node->ondisk->parent == item->node->node_offset);
6141775b6a0SMatthew Dillon 	chld_item->copy->parent = base_item->node->node_offset;
6151775b6a0SMatthew Dillon 	chld_item->flags |= HAMMER_NODE_LOCK_UPDATED;
6161775b6a0SMatthew Dillon 	hammer_cursor_parent_changed(chld_item->node,
6171775b6a0SMatthew Dillon 				     item->node, base_item->node, index);
6181775b6a0SMatthew Dillon }
619