xref: /dragonfly/sys/vfs/hammer/hammer_rebalance.c (revision 235099c3)
1 /*
2  * Copyright (c) 2009 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 
35 #include "hammer.h"
36 
37 static int rebalance_node(struct hammer_ioc_rebalance *rebal,
38 			hammer_cursor_t cursor);
39 static void rebalance_closeout(hammer_node_lock_t base_item, int base_count,
40 			hammer_btree_elm_t elm);
41 static void rebalance_parent_ptrs(hammer_node_lock_t base_item, int index,
42 			hammer_node_lock_t item, hammer_node_lock_t chld_item);
43 
44 /*
45  * Iterate through the specified range of object ids and rebalance B-Tree
46  * leaf and internal nodes we encounter.  A forwards iteration is used.
47  *
48  * All leaves are at the same depth.  We use the B-Tree scan code loosely
49  * to position ourselves and create degenerate cases to skip indices
50  * that we have rebalanced in bulk.
51  */
52 
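/*
 * Illustrative sketch, not part of the original file: this entry point is
 * reached from hammer_ioctl() when the hammer(8) utility issues the
 * HAMMERIOC_REBALANCE ioctl.  The caller fills in an object-id range and a
 * saturation target, roughly as below.  The HAMMER_MIN_OBJID/HAMMER_MAX_OBJID
 * macros and the default saturation shown are assumptions; see
 * hammer_ioctl.h and sbin/hammer for the authoritative usage.
 *
 *	struct hammer_ioc_rebalance rebal;
 *
 *	bzero(&rebal, sizeof(rebal));
 *	rebal.key_beg.obj_id = HAMMER_MIN_OBJID;
 *	rebal.key_end.obj_id = HAMMER_MAX_OBJID;
 *	rebal.saturation = HAMMER_BTREE_INT_ELMS;
 *	if (ioctl(fd, HAMMERIOC_REBALANCE, &rebal) < 0)
 *		err(1, "HAMMERIOC_REBALANCE");
 */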
53 int
54 hammer_ioc_rebalance(hammer_transaction_t trans, hammer_inode_t ip,
55 		 struct hammer_ioc_rebalance *rebal)
56 {
57 	struct hammer_cursor cursor;
58 	hammer_btree_leaf_elm_t elm;
59 	int error;
60 	int seq;
61 
62 	if ((rebal->key_beg.localization | rebal->key_end.localization) &
63 	    HAMMER_LOCALIZE_PSEUDOFS_MASK) {
64 		return(EINVAL);
65 	}
66 	if (rebal->key_beg.localization > rebal->key_end.localization)
67 		return(EINVAL);
68 	if (rebal->key_beg.localization == rebal->key_end.localization) {
69 		if (rebal->key_beg.obj_id > rebal->key_end.obj_id)
70 			return(EINVAL);
71 		/* key-space limitations - no check needed */
72 	}
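	/*
	 * Clamp the requested saturation target: a rebalanced internal
	 * node must end up at least half full and can never be more
	 * than completely full.
	 */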
73 	if (rebal->saturation < HAMMER_BTREE_INT_ELMS / 2)
74 		rebal->saturation = HAMMER_BTREE_INT_ELMS / 2;
75 	if (rebal->saturation > HAMMER_BTREE_INT_ELMS)
76 		rebal->saturation = HAMMER_BTREE_INT_ELMS;
77 
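	/*
	 * Only the localization type bits may be supplied by the caller
	 * (the pseudofs bits were rejected above); the PFS portion is
	 * taken from the inode the ioctl was issued on.
	 */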
78 	rebal->key_cur = rebal->key_beg;
79 	rebal->key_cur.localization &= HAMMER_LOCALIZE_MASK;
80 	rebal->key_cur.localization += ip->obj_localization;
81 
82 	seq = trans->hmp->flusher.act;
83 
84 	/*
85 	 * Scan forwards.  Retries typically occur if a deadlock is detected.
86 	 */
87 retry:
88 	error = hammer_init_cursor(trans, &cursor, NULL, NULL);
89 	if (error) {
90 		hammer_done_cursor(&cursor);
91 		goto failed;
92 	}
93 	cursor.key_beg = rebal->key_cur;
94 	cursor.key_end = rebal->key_end;
95 	cursor.key_end.localization &= HAMMER_LOCALIZE_MASK;
96 	cursor.key_end.localization += ip->obj_localization;
97 	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
98 	cursor.flags |= HAMMER_CURSOR_BACKEND;
99 
100 	/*
101 	 * Cause internal nodes to be returned on the way up.  Internal nodes
102 	 * are not returned on the way down so we can create a degenerate
103 	 * case to handle internal nodes as a trailing function of their
104 	 * sub-trees.
105 	 *
106 	 * Note that by not setting INSERTING or PRUNING no boundary
107 	 * corrections will be made and a sync lock is not needed for the
108 	 * B-Tree scan itself.
109 	 */
110 	cursor.flags |= HAMMER_CURSOR_REBLOCKING;
111 
112 	error = hammer_btree_first(&cursor);
113 
114 	while (error == 0) {
115 		/*
116 		 * We only care about internal nodes visited for the last
117 		 * time on the way up... that is, a trailing scan of the
118 		 * internal node after all of its children have been recursed
119 		 * through.
120 		 */
121 		if (cursor.node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) {
122 			/*
123 			 * Leave cursor.index alone; we want to recurse
124 			 * through all children of the internal node before
125 			 * visiting it.
126 			 *
127 			 * Process the internal node on the way up after
128 			 * the last child's sub-tree has been balanced.
129 			 */
130 			if (cursor.index == cursor.node->ondisk->count - 1) {
131 				hammer_sync_lock_sh(trans);
132 				error = rebalance_node(rebal, &cursor);
133 				hammer_sync_unlock(trans);
134 			}
135 		} else {
136 			/*
137 			 * We don't need to iterate through all the leaf
138 			 * elements; we only care about the parent (internal)
139 			 * node.
140 			 */
141 			cursor.index = cursor.node->ondisk->count - 1;
142 		}
143 		if (error)
144 			break;
145 
146 		/*
147 		 * Update returned scan position and do a flush if
148 		 * necessary.
149 		 *
150 		 * WARNING: See warnings in hammer_unlock_cursor()
151 		 *	    function.
152 		 */
153 		elm = &cursor.node->ondisk->elms[cursor.index].leaf;
154 		rebal->key_cur = elm->base;
155 		++rebal->stat_ncount;
156 
157 		while (hammer_flusher_meta_halflimit(trans->hmp) ||
158 		       hammer_flusher_undo_exhausted(trans, 2)) {
159 			hammer_unlock_cursor(&cursor);
160 			hammer_flusher_wait(trans->hmp, seq);
161 			hammer_lock_cursor(&cursor);
162 			seq = hammer_flusher_async_one(trans->hmp);
163 		}
164 
165 		/*
166 		 * Iterate, stop if a signal was received.
167 		 */
168 		if ((error = hammer_signal_check(trans->hmp)) != 0)
169 			break;
170 		error = hammer_btree_iterate(&cursor);
171 	}
172 	if (error == ENOENT)
173 		error = 0;
174 	hammer_done_cursor(&cursor);
175 	if (error == EDEADLK) {
176 		++rebal->stat_collisions;
177 		goto retry;
178 	}
179 	if (error == EINTR) {
180 		rebal->head.flags |= HAMMER_IOC_HEAD_INTR;
181 		error = 0;
182 	}
183 failed:
184 	rebal->key_cur.localization &= HAMMER_LOCALIZE_MASK;
185 	return(error);
186 }
187 
188 /*
189  * Rebalance an internal node, called via a trailing upward recursion.
190  * All the children have already been individually rebalanced.
191  *
192  * To rebalance we scan the elements in the children and pack them,
193  * so we actually need to lock the children and the children's children.
194  *
195  *	INTERNAL_NODE
196  *	/ / | | | \ \
197  *     C C  C C C  C C	children (first level) (internal or leaf nodes)
198  *			children's elements (second level)
199  *
200  *	<<<----------   pack children's elements, possibly remove excess
201  *			children after packing.
202  *
203  * NOTE: The mirror_tids, parent pointers, and child pointers must be updated.
204  *       Any live tracked B-Tree nodes must be updated (we worm out of that
205  *       by not allowing any).  And boundary elements must be preserved.
206  *
207  * NOTE: If the children are leaf nodes we may have a degenerate case
208  *       where there are no elements in the leaves.
209  *
210  * XXX live-tracked
211  */
212 static int
213 rebalance_node(struct hammer_ioc_rebalance *rebal, hammer_cursor_t cursor)
214 {
215 	struct hammer_node_lock lockroot;
216 	hammer_node_lock_t base_item;
217 	hammer_node_lock_t chld_item;
218 	hammer_node_lock_t item;
219 	hammer_btree_elm_t elm;
220 	hammer_node_t node;
221 	u_int8_t type1;
222 	int base_count;
223 	int root_count;
224 	int avg_elms;
225 	int count;
226 	int error;
227 	int i;
228 	int n;
229 
230 	/*
231 	 * Lock the parent node via the cursor, collect and lock our
232 	 * children and children's children.
233 	 *
234 	 * By the way, this is a LOT of locks.
235 	 */
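	/*
	 * (The depth argument of 2 passed to hammer_btree_lock_children()
	 * below is what pulls in both the children and the children's
	 * children.)
	 */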
236 	hammer_node_lock_init(&lockroot, cursor->node);
237 	error = hammer_cursor_upgrade(cursor);
238 	if (error)
239 		goto done;
240 	error = hammer_btree_lock_children(cursor, 2, &lockroot);
241 	if (error)
242 		goto done;
243 
244 	/*
245 	 * Make a copy of all the locked on-disk data to simplify the element
246 	 * shifting we are going to have to do.  We will modify the copy
247 	 * first.
248 	 */
249 	hammer_btree_lock_copy(cursor, &lockroot);
250 
251 	/*
252 	 * Look at the first child node.
253 	 */
254 	if (TAILQ_FIRST(&lockroot.list) == NULL)
255 		goto done;
256 	type1 = TAILQ_FIRST(&lockroot.list)->node->ondisk->type;
257 
258 	/*
259 	 * Figure out the total number of children's children and
260 	 * calculate the average number of elements per child.
261 	 *
262 	 * The minimum avg_elms is 1 when count > 0.  avg_elms * lockroot.count
263 	 * is always greater than or equal to count.
264 	 *
265 	 * If count == 0 we hit a degenerate case which will cause
266 	 * avg_elms to also calculate as 0.
267 	 */
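	/*
	 * (Hypothetical figures for illustration: three children holding
	 * 3, 1 and 2 elements give count == 6 and
	 * avg_elms == (6 + 2) / 3 == 2.)
	 */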
268 	if (hammer_debug_general & 0x1000)
269 		kprintf("lockroot %p count %d\n", &lockroot, lockroot.count);
270 	count = 0;
271 	TAILQ_FOREACH(item, &lockroot.list, entry) {
272 		if (hammer_debug_general & 0x1000)
273 			kprintf("add count %d\n", item->count);
274 		count += item->count;
275 		KKASSERT(item->node->ondisk->type == type1);
276 	}
277 	avg_elms = (count + (lockroot.count - 1)) / lockroot.count;
278 	KKASSERT(avg_elms >= 0);
279 
280 	/*
281 	 * If the average number of elements per child is too low then
282 	 * calculate the desired number of children (n) such that the
283 	 * average number of elements is reasonable.
284 	 *
285 	 * If the desired number of children is 1 then avg_elms will
286 	 * wind up being count, which may still be smaller than saturation
287 	 * but that is ok.
288 	 */
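	/*
	 * (Continuing the hypothetical figures above with a saturation
	 * target of 4: avg_elms == 2 is below the target, so
	 * n == (6 + 3) / 4 == 2 and avg_elms is raised to
	 * (6 + 1) / 2 == 3, i.e. the six elements will be repacked into
	 * two children instead of three.  Real saturation values are at
	 * least HAMMER_BTREE_INT_ELMS / 2; the small numbers are only
	 * for illustration.)
	 */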
289 	if (count && avg_elms < rebal->saturation) {
290 		n = (count + (rebal->saturation - 1)) / rebal->saturation;
291 		avg_elms = (count + (n - 1)) / n;
292 	}
293 
294 	/*
295 	 * Pack the elements in the children.  The elements of each item are
296 	 * packed into base_item until avg_elms is reached, then base_item
297 	 * advances to the next child.
298 	 *
299 	 * hammer_cursor_moved_element() is called for each element moved
300 	 * to update tracked cursors, including the index beyond the last
301 	 * element (at count).
302 	 */
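	/*
	 * (Continuing the hypothetical figures: with avg_elms == 3 the
	 * children {a0,a1,a2}, {b0}, {c0,c1} are repacked in place as
	 * {a0,a1,a2}, {b0,c0,c1}, {}, and the now-empty third child is
	 * deleted further below.)
	 */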
303 	base_item = TAILQ_FIRST(&lockroot.list);
304 	base_count = 0;
305 	root_count = 0;
306 
307 	TAILQ_FOREACH(item, &lockroot.list, entry) {
308 		node = item->node;
309 		KKASSERT(item->count == node->ondisk->count);
310 		chld_item = TAILQ_FIRST(&item->list);
311 		for (i = 0; i < item->count; ++i) {
312 			/*
313 			 * Closeout.  If the next element is at index 0
314 			 * just use the existing separator in the parent.
315 			 */
316 			if (base_count == avg_elms) {
317 				if (i == 0) {
318 					elm = &lockroot.node->ondisk->elms[
319 						item->index];
320 				} else {
321 					elm = &node->ondisk->elms[i];
322 				}
323 				rebalance_closeout(base_item, base_count, elm);
324 				base_item = TAILQ_NEXT(base_item, entry);
325 				KKASSERT(base_item);
326 				base_count = 0;
327 				++root_count;
328 			}
329 
330 			/*
331 			 * Check degenerate no-work case.  Otherwise pack
332 			 * the element.
333 			 *
334 			 * All changes are made to the copy.
335 			 */
336 			if (item == base_item && i == base_count) {
337 				++base_count;
338 				if (chld_item)
339 					chld_item = TAILQ_NEXT(chld_item, entry);
340 				continue;
341 			}
342 
343 			/*
344 			 * Pack element.
345 			 */
346 			elm = &base_item->copy->elms[base_count];
347 			*elm = node->ondisk->elms[i];
348 			base_item->flags |= HAMMER_NODE_LOCK_UPDATED;
349 
350 			/*
351 			 * Adjust the mirror_tid of the target.  The parent
352 			 * node (lockroot.node) should already have an
353 			 * aggregate mirror_tid so we do not have to update
354 			 * that.
355 			 *
356 			 * However, it is possible for us to catch a
357 			 * hammer_btree_mirror_propagate() with its pants
358 			 * down.  Update the parent if necessary.
359 			 */
360 			if (base_item->copy->mirror_tid <
361 			    node->ondisk->mirror_tid) {
362 				base_item->copy->mirror_tid =
363 					node->ondisk->mirror_tid;
364 				if (lockroot.copy->mirror_tid <
365 				    node->ondisk->mirror_tid) {
366 					kprintf("HAMMER: Warning: rebalance "
367 						"caught race against "
368 						"propagate\n");
369 					lockroot.copy->mirror_tid =
370 						node->ondisk->mirror_tid;
371 					lockroot.flags |=
372 						HAMMER_NODE_LOCK_UPDATED;
373 				}
374 				base_item->flags |= HAMMER_NODE_LOCK_UPDATED;
375 			}
376 
377 			/*
378 			 * We moved elm.  The parent pointers for any
379 			 * children of elm must be repointed.
380 			 */
381 			if (item != base_item &&
382 			    node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) {
383 				KKASSERT(chld_item);
384 				rebalance_parent_ptrs(base_item, base_count,
385 						      item, chld_item);
386 			}
387 			hammer_cursor_moved_element(node, base_item->node,
388 						    i, base_count);
389 			++base_count;
390 			if (chld_item)
391 				chld_item = TAILQ_NEXT(chld_item, entry);
392 		}
393 
394 		/*
395 		 * Always call at the end (i == number of elements) in
396 		 * case a cursor is sitting indexed there.
397 		 */
398 		hammer_cursor_moved_element(node, base_item->node,
399 					    i, base_count);
400 	}
401 
402 	/*
403 	 * Packing complete, close-out base_item using the right-hand
404 	 * boundary of the original parent.
405 	 *
406 	 * If we will be deleting nodes from the root, shift the old
407 	 * right-hand boundary to the new ending index.
408 	 */
409 	elm = &lockroot.node->ondisk->elms[lockroot.node->ondisk->count];
410 	rebalance_closeout(base_item, base_count, elm);
411 	++root_count;
412 	if (lockroot.copy->count != root_count) {
413 		lockroot.copy->count = root_count;
414 		lockroot.copy->elms[root_count] = *elm;
415 		lockroot.flags |= HAMMER_NODE_LOCK_UPDATED;
416 	}
417 
418 	/*
419 	 * Any extra items beyond base_item are now completely empty and
420 	 * can be destroyed.  Queue the destruction up in the copy.  Note
421 	 * that none of the destroyed nodes are part of our cursor.
422 	 *
423 	 * The cursor is locked so it isn't on the tracking list.  It
424 	 * should have been pointing at the boundary element (at root_count).
425 	 * When deleting elements from the root (which is cursor.node), we
426 	 * have to update the cursor.index manually to keep it in bounds.
427 	 */
428 	while ((base_item = TAILQ_NEXT(base_item, entry)) != NULL) {
429 		hammer_cursor_removed_node(base_item->node, lockroot.node,
430 					   base_count);
431 		hammer_cursor_deleted_element(lockroot.node, base_count);
432 		base_item->copy->type = HAMMER_BTREE_TYPE_DELETED;
433 		base_item->copy->count = 0;
434 		base_item->flags |= HAMMER_NODE_LOCK_UPDATED;
435 		if (cursor->index > lockroot.copy->count)
436 			--cursor->index;
437 		++rebal->stat_deletions;
438 	}
439 
440 	/*
441 	 * All done, sync the locked child tree to disk.  This will also
442 	 * flush and delete deleted nodes.
443 	 */
444 	rebal->stat_nrebal += hammer_btree_sync_copy(cursor, &lockroot);
445 done:
446 	hammer_btree_unlock_children(cursor, &lockroot);
447 	hammer_cursor_downgrade(cursor);
448 	return (error);
449 }
450 
451 /*
452  * Close-out the child base_item.  This node contains base_count
453  * elements.
454  *
455  * If the node is an internal node the right-hand boundary must be
456  * set to elm.
457  */
458 static
459 void
460 rebalance_closeout(hammer_node_lock_t base_item, int base_count,
461 		   hammer_btree_elm_t elm)
462 {
463 	hammer_node_lock_t parent;
464 	hammer_btree_elm_t base_elm;
465 	hammer_btree_elm_t rbound_elm;
466 	u_int8_t save;
467 
468 	/*
469 	 * Update the count.  NOTE:  base_count can be 0 for the
470 	 * degenerate leaf case.
471 	 */
472 	if (hammer_debug_general & 0x1000) {
473 		kprintf("rebalance_closeout %016llx:",
474 			(long long)base_item->node->node_offset);
475 	}
476 	if (base_item->copy->count != base_count) {
477 		base_item->flags |= HAMMER_NODE_LOCK_UPDATED;
478 		base_item->copy->count = base_count;
479 		if (hammer_debug_general & 0x1000)
480 			kprintf(" (count update)");
481 	}
482 
483 	/*
484 	 * If we are closing out an internal node we must assign
485 	 * a right-hand boundary.  Use the element contents as the
486 	 * right-hand boundary.
487 	 *
488 	 * Internal nodes are required to have at least one child,
489 	 * otherwise the left and right boundary would end up being
490 	 * the same element.  Only leaf nodes can be empty.
491 	 *
492 	 * Rebalancing may cut off an internal node such that the
493 	 * new right-hand boundary is the next element anyway, but
494 	 * we still have to make sure that subtree_offset, btype,
495 	 * and mirror_tid are all 0.
496 	 */
497 	if (base_item->copy->type == HAMMER_BTREE_TYPE_INTERNAL) {
498 		KKASSERT(base_count != 0);
499 		base_elm = &base_item->copy->elms[base_count];
500 
501 		if (bcmp(base_elm, elm, sizeof(*elm)) != 0 ||
502 		    elm->internal.subtree_offset ||
503 		    elm->internal.mirror_tid ||
504 		    elm->base.btype) {
505 			*base_elm = *elm;
506 			base_elm->internal.subtree_offset = 0;
507 			base_elm->internal.mirror_tid = 0;
508 			base_elm->base.btype = 0;
509 			base_item->flags |= HAMMER_NODE_LOCK_UPDATED;
510 			if (hammer_debug_general & 0x1000)
511 				kprintf(" (rhs update)");
512 		} else {
513 			if (hammer_debug_general & 0x1000)
514 				kprintf(" (rhs same)");
515 		}
516 	}
517 
518 	/*
519 	 * The parent's boundary must be updated.  Be careful to retain
520 	 * the btype and non-base internal fields as that information is
521 	 * unrelated.
522 	 */
523 	parent = base_item->parent;
524 	rbound_elm = &parent->copy->elms[base_item->index + 1];
525 	if (bcmp(&rbound_elm->base, &elm->base, sizeof(elm->base)) != 0) {
526 		save = rbound_elm->base.btype;
527 		rbound_elm->base = elm->base;
528 		rbound_elm->base.btype = save;
529 		parent->flags |= HAMMER_NODE_LOCK_UPDATED;
530 		if (hammer_debug_general & 0x1000) {
531 			kprintf(" (parent bound update %d)",
532 				base_item->index + 1);
533 		}
534 	}
535 	if (hammer_debug_general & 0x1000)
536 		kprintf("\n");
537 }
538 
539 /*
540  * An element in item has moved to base_item.  We must update the parent
541  * pointer of the node the element points to (which is chld_item).
542  */
543 static
544 void
545 rebalance_parent_ptrs(hammer_node_lock_t base_item, int index,
546 		      hammer_node_lock_t item, hammer_node_lock_t chld_item)
547 {
548 	KKASSERT(chld_item->node->ondisk->parent == item->node->node_offset);
549 	chld_item->copy->parent = base_item->node->node_offset;
550 	chld_item->flags |= HAMMER_NODE_LOCK_UPDATED;
551 	hammer_cursor_parent_changed(chld_item->node,
552 				     item->node, base_item->node, index);
553 }
554