xref: /dragonfly/sys/vfs/hammer/hammer_rebalance.c (revision 896f2e3a)
1 /*
2  * Copyright (c) 2009 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 
35 #include "hammer.h"
36 
37 static int rebalance_node(struct hammer_ioc_rebalance *rebal,
38 			hammer_cursor_t cursor, hammer_node_lock_t lcache);
39 static void rebalance_closeout(hammer_node_lock_t base_item, int base_count,
40 			hammer_btree_elm_t elm);
41 static void rebalance_parent_ptrs(hammer_node_lock_t base_item, int index,
42 			hammer_node_lock_t item, hammer_node_lock_t chld_item);
43 
44 /*
45  * Iterate through the specified range of object ids and rebalance B-Tree
46  * leaf and internal nodes we encounter.  A forwards iteration is used.
47  *
48  * All leafs are at the same depth.  We use the b-tree scan code loosely
49  * to position ourselves and create degenerate cases to skip indices
50  * that we have rebalanced in bulk.
51  */
52 
53 int
54 hammer_ioc_rebalance(hammer_transaction_t trans, hammer_inode_t ip,
55 		 struct hammer_ioc_rebalance *rebal)
56 {
57 	struct hammer_cursor cursor;
58 	struct hammer_node_lock lcache;
59 	hammer_btree_leaf_elm_t elm;
60 	int error;
61 	int seq;
62 	u_int32_t key_end_localization;
63 
64 	if ((rebal->key_beg.localization | rebal->key_end.localization) &
65 	    HAMMER_LOCALIZE_PSEUDOFS_MASK) {
66 		return(EINVAL);
67 	}
68 	if (rebal->key_beg.localization > rebal->key_end.localization)
69 		return(EINVAL);
70 	if (rebal->key_beg.localization == rebal->key_end.localization) {
71 		if (rebal->key_beg.obj_id > rebal->key_end.obj_id)
72 			return(EINVAL);
73 		/* key-space limitations - no check needed */
74 	}
75 	if (rebal->saturation < HAMMER_BTREE_INT_ELMS / 2)
76 		rebal->saturation = HAMMER_BTREE_INT_ELMS / 2;
77 	if (rebal->saturation > HAMMER_BTREE_INT_ELMS)
78 		rebal->saturation = HAMMER_BTREE_INT_ELMS;
79 
80 	/*
81 	 * Ioctl caller has only set localization type to rebalance.
82 	 * Initialize cursor key localization with ip localization.
83 	 */
84 	rebal->key_cur = rebal->key_beg;
85 	rebal->key_cur.localization &= HAMMER_LOCALIZE_MASK;
86 	if (rebal->allpfs == 0)
87 		rebal->key_cur.localization += ip->obj_localization;
88 
89 	key_end_localization = rebal->key_end.localization;
90 	key_end_localization &= HAMMER_LOCALIZE_MASK;
91 	if (rebal->allpfs == 0)
92 		key_end_localization += ip->obj_localization;
93 	else
94 		key_end_localization += ((HAMMER_MAX_PFS - 1) << 16);
95 
96 	hammer_btree_lcache_init(trans->hmp, &lcache, 2);
97 
98 	seq = trans->hmp->flusher.done;
99 
100 	/*
101 	 * Scan forwards.  Retries typically occur if a deadlock is detected.
102 	 */
103 retry:
104 	error = hammer_init_cursor(trans, &cursor, NULL, NULL);
105 	if (error) {
106 		hammer_done_cursor(&cursor);
107 		goto failed;
108 	}
109 	cursor.key_beg = rebal->key_cur;
110 	cursor.key_end = rebal->key_end;
111 	cursor.key_end.localization = key_end_localization;
112 	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
113 	cursor.flags |= HAMMER_CURSOR_BACKEND;
114 
115 	/*
116 	 * Cause internal nodes to be returned on the way up.  Internal nodes
117 	 * are not returned on the way down so we can create a degenerate
118 	 * case to handle internal nodes as a trailing function of their
119 	 * sub-trees.
120 	 *
121 	 * Note that by not setting INSERTING or PRUNING no boundary
122 	 * corrections will be made and a sync lock is not needed for the
123 	 * B-Tree scan itself.
124 	 */
125 	cursor.flags |= HAMMER_CURSOR_REBLOCKING;
126 
127 	error = hammer_btree_first(&cursor);
128 
129 	while (error == 0) {
130 		/*
131 		 * Rebalancing can be hard on the memory allocator, make
132 		 * sure there is enough free memory before doing it.
133 		 */
134 		if (vm_test_nominal()) {
135 			hammer_unlock_cursor(&cursor);
136 			vm_wait_nominal();
137 			hammer_lock_cursor(&cursor);
138 		}
139 
140 		/*
141 		 * We only care about internal nodes visited for the last
142 		 * time on the way up... that is, a trailing scan of the
143 		 * internal node after all of its children have been recursed
144 		 * through.
145 		 */
146 		if (cursor.node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) {
147 			/*
148 			 * Leave cursor.index alone, we want to recurse
149 			 * through all children of the internal node before
150 			 * visiting it.
151 			 *
152 			 * Process the internal node on the way up after
153 			 * the last child's sub-tree has been balanced.
154 			 */
155 			if (cursor.index == cursor.node->ondisk->count - 1) {
156 				hammer_sync_lock_sh(trans);
157 				error = rebalance_node(rebal, &cursor, &lcache);
158 				hammer_sync_unlock(trans);
159 			}
160 		} else {
161 			/*
162 			 * We don't need to iterate through all the leaf
163 			 * elements, we only care about the parent (internal)
164 			 * node.
165 			 */
166 			cursor.index = cursor.node->ondisk->count - 1;
167 		}
168 		if (error)
169 			break;
170 
171 		/*
172 		 * Update returned scan position and do a flush if
173 		 * necessary.
174 		 *
175 		 * WARNING: We extract the base using the leaf element
176 		 *	    type but this could be an internal node.  The
177 		 *	    base is the same either way.
178 		 *
179 		 *	    However, due to the rebalancing operation the
180 		 *	    cursor position may have exceeded the right-hand
181 		 *	    boundary.
182 		 *
183 		 * WARNING: See warnings in hammer_unlock_cursor()
184 		 *	    function.
185 		 */
186 		elm = &cursor.node->ondisk->elms[cursor.index].leaf;
187 		rebal->key_cur = elm->base;
188 		++rebal->stat_ncount;
189 
190 		while (hammer_flusher_meta_halflimit(trans->hmp) ||
191 		       hammer_flusher_undo_exhausted(trans, 2)) {
192 			hammer_unlock_cursor(&cursor);
193 			hammer_flusher_wait(trans->hmp, seq);
194 			hammer_lock_cursor(&cursor);
195 			seq = hammer_flusher_async_one(trans->hmp);
196 		}
197 
198 		/*
199 		 * Before iterating check if the rebalance operation caused
200 		 * the cursor to index past the right-hand boundary and make
201 		 * sure to stop if it does.  Otherwise the iteration may
202 		 * panic e.g. due to the key maxing out its fields and no
203 		 * longer being within the strict bounds of the root node.
204 		 */
205 		if (hammer_btree_cmp(&rebal->key_cur, &cursor.key_end) > 0) {
206 			rebal->key_cur = cursor.key_end;
207 			break;
208 		}
209 
210 		/*
211 		 * Iterate, stop if a signal was received.
212 		 */
213 		if ((error = hammer_signal_check(trans->hmp)) != 0)
214 			break;
215 		error = hammer_btree_iterate(&cursor);
216 	}
217 	if (error == ENOENT)
218 		error = 0;
219 	hammer_done_cursor(&cursor);
220 	if (error == EDEADLK) {
221 		++rebal->stat_collisions;
222 		goto retry;
223 	}
224 	if (error == EINTR) {
225 		rebal->head.flags |= HAMMER_IOC_HEAD_INTR;
226 		error = 0;
227 	}
228 failed:
229 	rebal->key_cur.localization &= HAMMER_LOCALIZE_MASK;
230 	hammer_btree_lcache_free(trans->hmp, &lcache);
231 	return(error);
232 }
233 
234 /*
235  * Rebalance an internal node, called via a trailing upward recursion.
236  * All the children have already been individually rebalanced.
237  *
238  * To rebalance we scan the elements in the children and pack them,
239  * so we actually need to lock the children and the children's children.
240  *
241  *	INTERNAL_NODE
242  *	/ / | | | \ \
243  *     C C  C C C  C C	children (first level) (internal or leaf nodes)
244  *			children's elements (second level)
245  *
246  *	<<<----------   pack children's elements, possibly remove excess
247  *			children after packing.
248  *
249  * NOTE: The mirror_tids, parent pointers, and child pointers must be updated.
250  *       Any live tracked B-Tree nodes must be updated (we worm out of that
251  *       by not allowing any).  And boundary elements must be preserved.
252  *
253  * NOTE: If the children are leaf nodes we may have a degenerate case
254  *       case where there are no elements in the leafs.
255  *
256  * XXX live-tracked
257  */
258 static int
259 rebalance_node(struct hammer_ioc_rebalance *rebal, hammer_cursor_t cursor,
260 	       struct hammer_node_lock *lcache)
261 {
262 	struct hammer_node_lock lockroot;
263 	hammer_node_lock_t base_item;
264 	hammer_node_lock_t chld_item;
265 	hammer_node_lock_t item;
266 	hammer_btree_elm_t elm;
267 	hammer_node_t node;
268 	hammer_tid_t tid;
269 	u_int8_t type1 __debugvar;
270 	int base_count;
271 	int root_count;
272 	int avg_elms;
273 	int count;
274 	int error;
275 	int i;
276 	int n;
277 
278 	/*
279 	 * Lock the parent node via the cursor, collect and lock our
280 	 * children and children's children.
281 	 *
282 	 * By the way, this is a LOT of locks.
283 	 */
284 	hammer_node_lock_init(&lockroot, cursor->node);
285 	error = hammer_cursor_upgrade(cursor);
286 	if (error)
287 		goto done;
288 	error = hammer_btree_lock_children(cursor, 2, &lockroot, lcache);
289 	if (error)
290 		goto done;
291 
292 	/*
293 	 * Make a copy of all the locked on-disk data to simplify the element
294 	 * shifting we are going to have to do.  We will modify the copy
295 	 * first.
296 	 */
297 	hammer_btree_lock_copy(cursor, &lockroot);
298 
299 	/*
300 	 * Look at the first child node.
301 	 */
302 	if (TAILQ_FIRST(&lockroot.list) == NULL)
303 		goto done;
304 	type1 = TAILQ_FIRST(&lockroot.list)->node->ondisk->type;
305 
306 	/*
307 	 * Figure out the total number of children's children and
308 	 * calculate the average number of elements per child.
309 	 *
310 	 * The minimum avg_elms is 1 when count > 0.  avg_elms * root_elms
311 	 * is always greater or equal to count.
312 	 *
313 	 * If count == 0 we hit a degenerate case which will cause
314 	 * avg_elms to also calculate as 0.
315 	 */
316 	if (hammer_debug_general & 0x1000)
317 		kprintf("lockroot %p count %d\n", &lockroot, lockroot.count);
318 	count = 0;
319 	TAILQ_FOREACH(item, &lockroot.list, entry) {
320 		if (hammer_debug_general & 0x1000)
321 			kprintf("add count %d\n", item->count);
322 		count += item->count;
323 		KKASSERT(item->node->ondisk->type == type1);
324 	}
325 	avg_elms = (count + (lockroot.count - 1)) / lockroot.count;
326 	KKASSERT(avg_elms >= 0);
327 
328 	/*
329 	 * If the average number of elements per child is too low then
330 	 * calculate the desired number of children (n) such that the
331 	 * average number of elements is reasonable.
332 	 *
333 	 * If the desired number of children is 1 then avg_elms will
334 	 * wind up being count, which may still be smaller then saturation
335 	 * but that is ok.
336 	 */
337 	if (count && avg_elms < rebal->saturation) {
338 		n = (count + (rebal->saturation - 1)) / rebal->saturation;
339 		avg_elms = (count + (n - 1)) / n;
340 	}
341 
342 	/*
343 	 * Pack the elements in the children.  Elements for each item is
344 	 * packed into base_item until avg_elms is reached, then base_item
345 	 * iterates.
346 	 *
347 	 * hammer_cursor_moved_element() is called for each element moved
348 	 * to update tracked cursors, including the index beyond the last
349 	 * element (at count).
350 	 *
351 	 * Any cursors tracking the internal node itself must also be
352 	 * updated, potentially repointing them at a leaf and clearing
353 	 * ATEDISK.
354 	 */
355 	base_item = TAILQ_FIRST(&lockroot.list);
356 	base_count = 0;
357 	root_count = 0;
358 
359 	TAILQ_FOREACH(item, &lockroot.list, entry) {
360 		node = item->node;
361 		KKASSERT(item->count == node->ondisk->count);
362 		chld_item = TAILQ_FIRST(&item->list);
363 		for (i = 0; i < item->count; ++i) {
364 			/*
365 			 * Closeout.  If the next element is at index 0
366 			 * just use the existing separator in the parent.
367 			 */
368 			if (base_count == avg_elms) {
369 				if (i == 0) {
370 					elm = &lockroot.node->ondisk->elms[
371 						item->index];
372 				} else {
373 					elm = &node->ondisk->elms[i];
374 				}
375 				rebalance_closeout(base_item, base_count, elm);
376 				base_item = TAILQ_NEXT(base_item, entry);
377 				KKASSERT(base_item);
378 				base_count = 0;
379 				++root_count;
380 			}
381 
382 			/*
383 			 * Check degenerate no-work case.  Otherwise pack
384 			 * the element.
385 			 *
386 			 * All changes are made to the copy.
387 			 */
388 			if (item == base_item && i == base_count) {
389 				++base_count;
390 				if (chld_item)
391 					chld_item = TAILQ_NEXT(chld_item, entry);
392 				continue;
393 			}
394 
395 			/*
396 			 * Pack element.
397 			 */
398 			elm = &base_item->copy->elms[base_count];
399 			*elm = node->ondisk->elms[i];
400 			base_item->flags |= HAMMER_NODE_LOCK_UPDATED;
401 
402 			/*
403 			 * Adjust the mirror_tid of the target and the
404 			 * internal element linkage.
405 			 *
406 			 * The parent node (lockroot.node) should already
407 			 * have an aggregate mirror_tid so we do not have
408 			 * to update that.  However, it is possible for us
409 			 * to catch a hammer_btree_mirror_propagate() with
410 			 * its pants down.  Update the parent if necessary.
411 			 */
412 			tid = node->ondisk->mirror_tid;
413 
414 			if (base_item->copy->mirror_tid < tid) {
415 				base_item->copy->mirror_tid = tid;
416 				if (lockroot.copy->mirror_tid < tid) {
417 					lockroot.copy->mirror_tid = tid;
418 					lockroot.flags |=
419 						HAMMER_NODE_LOCK_UPDATED;
420 				}
421 				if (lockroot.copy->elms[root_count].
422 				    internal.mirror_tid < tid) {
423 					lockroot.copy->elms[root_count].
424 						internal.mirror_tid = tid;
425 					lockroot.flags |=
426 						HAMMER_NODE_LOCK_UPDATED;
427 				}
428 				base_item->flags |= HAMMER_NODE_LOCK_UPDATED;
429 			}
430 
431 			/*
432 			 * We moved elm.  The parent pointers for any
433 			 * children of elm must be repointed.
434 			 */
435 			if (item != base_item &&
436 			    node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) {
437 				KKASSERT(chld_item);
438 				rebalance_parent_ptrs(base_item, base_count,
439 						      item, chld_item);
440 			}
441 			hammer_cursor_moved_element(item->parent->node,
442 						    item->index,
443 						    node, i,
444 						    base_item->node,
445 						    base_count);
446 			++base_count;
447 			if (chld_item)
448 				chld_item = TAILQ_NEXT(chld_item, entry);
449 		}
450 
451 		/*
452 		 * Always call at the end (i == number of elements) in
453 		 * case a cursor is sitting indexed there.
454 		 */
455 		hammer_cursor_moved_element(item->parent->node, item->index,
456 					    node, i,
457 					    base_item->node, base_count);
458 	}
459 
460 	/*
461 	 * Packing complete, close-out base_item using the right-hand
462 	 * boundary of the original parent.
463 	 *
464 	 * If we will be deleting nodes from the root shift the old
465 	 * right-hand-boundary to the new ending index.
466 	 */
467 	elm = &lockroot.node->ondisk->elms[lockroot.node->ondisk->count];
468 	rebalance_closeout(base_item, base_count, elm);
469 	++root_count;
470 	if (lockroot.copy->count != root_count) {
471 		lockroot.copy->count = root_count;
472 		lockroot.copy->elms[root_count] = *elm;
473 		lockroot.flags |= HAMMER_NODE_LOCK_UPDATED;
474 	}
475 
476 	/*
477 	 * Any extra items beyond base_item are now completely empty and
478 	 * can be destroyed.  Queue the destruction up in the copy.  Note
479 	 * that none of the destroyed nodes are part of our cursor.
480 	 *
481 	 * The cursor is locked so it isn't on the tracking list.  It
482 	 * should have been pointing at the boundary element (at root_count).
483 	 * When deleting elements from the root (which is cursor.node), we
484 	 * have to update the cursor.index manually to keep it in bounds.
485 	 */
486 	while ((base_item = TAILQ_NEXT(base_item, entry)) != NULL) {
487 		hammer_cursor_removed_node(base_item->node, lockroot.node,
488 					   base_count);
489 		hammer_cursor_deleted_element(lockroot.node, base_count);
490 		base_item->copy->type = HAMMER_BTREE_TYPE_DELETED;
491 		base_item->copy->count = 0;
492 		base_item->flags |= HAMMER_NODE_LOCK_UPDATED;
493 		if (cursor->index > lockroot.copy->count)
494 			--cursor->index;
495 		++rebal->stat_deletions;
496 	}
497 
498 	/*
499 	 * All done, sync the locked child tree to disk.  This will also
500 	 * flush and delete deleted nodes.
501 	 */
502 	rebal->stat_nrebal += hammer_btree_sync_copy(cursor, &lockroot);
503 done:
504 	hammer_btree_unlock_children(cursor->trans->hmp, &lockroot, lcache);
505 	hammer_cursor_downgrade(cursor);
506 	return (error);
507 }
508 
509 /*
510  * Close-out the child base_item.  This node contains base_count
511  * elements.
512  *
513  * If the node is an internal node the right-hand boundary must be
514  * set to elm.
515  */
516 static
517 void
518 rebalance_closeout(hammer_node_lock_t base_item, int base_count,
519 		   hammer_btree_elm_t elm)
520 {
521 	hammer_node_lock_t parent;
522 	hammer_btree_elm_t base_elm;
523 	hammer_btree_elm_t rbound_elm;
524 	u_int8_t save;
525 
526 	/*
527 	 * Update the count.  NOTE:  base_count can be 0 for the
528 	 * degenerate leaf case.
529 	 */
530 	if (hammer_debug_general & 0x1000) {
531 		kprintf("rebalance_closeout %016llx:",
532 			(long long)base_item->node->node_offset);
533 	}
534 	if (base_item->copy->count != base_count) {
535 		base_item->flags |= HAMMER_NODE_LOCK_UPDATED;
536 		base_item->copy->count = base_count;
537 		if (hammer_debug_general & 0x1000)
538 			kprintf(" (count update)");
539 	}
540 
541 	/*
542 	 * If we are closing out an internal node we must assign
543 	 * a right-hand boundary.  Use the element contents as the
544 	 * right-hand boundary.
545 	 *
546 	 * Internal nodes are required to have at least one child,
547 	 * otherwise the left and right boundary would end up being
548 	 * the same element.  Only leaf nodes can be empty.
549 	 *
550 	 * Rebalancing may cut-off an internal node such that the
551 	 * new right hand boundary is the next element anyway, but
552 	 * we still have to make sure that subtree_offset, btype,
553 	 * and mirror_tid are all 0.
554 	 */
555 	if (base_item->copy->type == HAMMER_BTREE_TYPE_INTERNAL) {
556 		KKASSERT(base_count != 0);
557 		base_elm = &base_item->copy->elms[base_count];
558 
559 		if (bcmp(base_elm, elm, sizeof(*elm)) != 0 ||
560 		    elm->internal.subtree_offset ||
561 		    elm->internal.mirror_tid ||
562 		    elm->base.btype) {
563 			*base_elm = *elm;
564 			base_elm->internal.subtree_offset = 0;
565 			base_elm->internal.mirror_tid = 0;
566 			base_elm->base.btype = 0;
567 			base_item->flags |= HAMMER_NODE_LOCK_UPDATED;
568 			if (hammer_debug_general & 0x1000)
569 				kprintf(" (rhs update)");
570 		} else {
571 			if (hammer_debug_general & 0x1000)
572 				kprintf(" (rhs same)");
573 		}
574 	}
575 
576 	/*
577 	 * The parent's boundary must be updated.  Be careful to retain
578 	 * the btype and non-base internal fields as that information is
579 	 * unrelated.
580 	 */
581 	parent = base_item->parent;
582 	rbound_elm = &parent->copy->elms[base_item->index + 1];
583 	if (bcmp(&rbound_elm->base, &elm->base, sizeof(elm->base)) != 0) {
584 		save = rbound_elm->base.btype;
585 		rbound_elm->base = elm->base;
586 		rbound_elm->base.btype = save;
587 		parent->flags |= HAMMER_NODE_LOCK_UPDATED;
588 		if (hammer_debug_general & 0x1000) {
589 			kprintf(" (parent bound update %d)",
590 				base_item->index + 1);
591 		}
592 	}
593 	if (hammer_debug_general & 0x1000)
594 		kprintf("\n");
595 }
596 
597 /*
598  * An element in item has moved to base_item.  We must update the parent
599  * pointer of the node the element points to (which is chld_item).
600  */
601 static
602 void
603 rebalance_parent_ptrs(hammer_node_lock_t base_item, int index,
604 		      hammer_node_lock_t item, hammer_node_lock_t chld_item)
605 {
606 	KKASSERT(chld_item->node->ondisk->parent == item->node->node_offset);
607 	chld_item->copy->parent = base_item->node->node_offset;
608 	chld_item->flags |= HAMMER_NODE_LOCK_UPDATED;
609 	hammer_cursor_parent_changed(chld_item->node,
610 				     item->node, base_item->node, index);
611 }
612