xref: /dragonfly/sys/vfs/hammer/hammer_btree.c (revision 1465342b)
1 /*
2  * Copyright (c) 2007 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  * $DragonFly: src/sys/vfs/hammer/hammer_btree.c,v 1.10 2007/12/14 08:05:39 dillon Exp $
35  */
36 
37 /*
38  * HAMMER B-Tree index
39  *
40  * HAMMER implements a modified B+Tree.  In documentation this will
41  * simply be referred to as the HAMMER B-Tree.  Basically a HAMMER B-Tree
42  * looks like a B+Tree (a B-Tree which stores its records only at the leaves
43  * of the tree), but adds two additional boundary elements which describe
44  * the left-most and right-most element a node is able to represent.  In
45  * other words, we have boundary elements at the two ends of a B-Tree node
46  * instead of sub-tree pointers.
47  *
48  * A B-Tree internal node looks like this:
49  *
50  *	B N N N N N N B   <-- boundary and internal elements
51  *       S S S S S S S    <-- subtree pointers
52  *
53  * A B-Tree leaf node basically looks like this:
54  *
55  *	L L L L L L L L   <-- leaf elements
56  *
57  * The radix for an internal node is one less than that of a leaf, but we
58  * get a number of significant benefits for our troubles.
59  *
60  * The big benefit to using a B-Tree containing boundary information
61  * is that it is possible to cache pointers into the middle of the tree
62  * and not have to start searches, insertions, OR deletions at the root
63  * node.   In particular, searches are able to progress in a definitive
64  * direction from any point in the tree without revisiting nodes.  This
65  * greatly improves the efficiency of many operations, most especially
66  * record appends.
67  *
68  * B-Trees also make the stacking of trees fairly straightforward.
69  *
70  * INTER-CLUSTER ELEMENTS:  An element of an internal node may reference
71  * the root of another cluster rather than a node in the current cluster.
72  * This is known as an inter-cluster reference.  Only B-Tree searches
73  * will cross cluster boundaries.  The rebalancing and collapse code does
74  * not attempt to move children between clusters.  A major effect of this
75  * is that we have to relax minimum element count requirements and allow
76  * trees to become somewhat unbalanced.
77  *
78  * INSERTIONS AND DELETIONS:  When inserting we split full nodes on our
79  * way down as an optimization.  I originally experimented with rebalancing
80  * nodes on the way down for deletions but it created a huge mess due to
81  * the way inter-cluster linkages work.  Instead, now I simply allow
82  * the tree to become unbalanced and allow leaf nodes to become empty.
83  * The delete code will try to clean things up from the bottom-up but
84  * will stop if related elements are not in-core or if it cannot get a node
85  * lock.
86  */
87 #include "hammer.h"
88 #include <sys/buf.h>
89 #include <sys/buf2.h>
90 
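/*
 * Illustrative sketch (disabled, not part of the build): a minimal model of
 * the boundary-element idea described in the header comment above.  The
 * types and field names here are hypothetical stand-ins, not the real
 * hammer_node_ondisk/hammer_btree_elm layouts.  An internal node with N
 * sub-trees stores N+1 boundary keys; sub-tree i covers keys in
 * [bound[i], bound[i+1]).
 */
#if 0
struct demo_internal_node {
	int	count;			/* number of sub-tree pointers */
	int64_t	bound[8 + 1];		/* count+1 boundary keys */
	void	*subtree[8];		/* sub-tree pointers */
};

static int
demo_pick_subtree(struct demo_internal_node *node, int64_t key)
{
	int i;

	/* keys outside [bound[0], bound[count]) are not covered by this node */
	if (key < node->bound[0] || key >= node->bound[node->count])
		return(-1);
	for (i = 0; i < node->count; ++i) {
		if (key < node->bound[i + 1])
			return(i);
	}
	return(-1);	/* not reached */
}
#endif
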
91 static int btree_search(hammer_cursor_t cursor, int flags);
92 static int btree_split_internal(hammer_cursor_t cursor);
93 static int btree_split_leaf(hammer_cursor_t cursor);
94 static int btree_remove(hammer_cursor_t cursor);
95 static int btree_set_parent(hammer_node_t node, hammer_btree_elm_t elm);
96 #if 0
97 static int btree_rebalance(hammer_cursor_t cursor);
98 static int btree_collapse(hammer_cursor_t cursor);
99 #endif
100 static int btree_node_is_full(hammer_node_ondisk_t node);
101 static void hammer_make_separator(hammer_base_elm_t key1,
102 			hammer_base_elm_t key2, hammer_base_elm_t dest);
103 
104 /*
105  * Iterate records after a search.  The cursor is iterated forwards past
106  * the current record until a record matching the key-range requirements
107  * is found.  ENOENT is returned if the iteration goes past the ending
108  * key.
109  *
110  * key_beg/key_end is an INCLUSIVE range, i.e. if you are scanning to load
111  * a 4096 byte buffer key_beg might specify an offset of 0 and key_end an
112  * offset of 4095.
113  *
114  * cursor->key_beg may or may not be modified by this function during
115  * the iteration.
116  */
117 int
118 hammer_btree_iterate(hammer_cursor_t cursor)
119 {
120 	hammer_node_ondisk_t node;
121 	hammer_btree_elm_t elm;
122 	int error;
123 	int r;
124 #if 0
125 	int s;
126 	int64_t save_key;
127 #endif
128 
129 	/*
130 	 * Skip past the current record
131 	 */
132 	node = cursor->node->ondisk;
133 	if (node == NULL)
134 		return(ENOENT);
135 	if (cursor->index < node->count &&
136 	    (cursor->flags & HAMMER_CURSOR_ATEDISK)) {
137 		++cursor->index;
138 	}
139 
140 	/*
141 	 * Loop until an element is found or we are done.
142 	 */
143 	for (;;) {
144 		/*
145 		 * We iterate up the tree and then index over one element
146 		 * while we are at the last element in the current node.
147 		 *
148 		 * NOTE: This can pop us up to another cluster.
149 		 *
150 		 * If we are at the root of the root cluster, cursor_up
151 		 * returns ENOENT.
152 		 *
153 		 * NOTE: hammer_cursor_up() will adjust cursor->key_beg
154 		 * when told to re-search for the cluster tag.
155 		 *
156 		 * XXX this could be optimized by storing the information in
157 		 * the parent reference.
158 		 *
159 		 * XXX we can lose the node lock temporarily, this could mess
160 		 * up our scan.
161 		 */
162 		if (cursor->index == node->count) {
163 			error = hammer_cursor_up(cursor, 0);
164 			if (error)
165 				break;
166 			node = cursor->node->ondisk;
167 			KKASSERT(cursor->index != node->count);
168 			++cursor->index;
169 			continue;
170 		}
171 
172 		/*
173 		 * Iterate down the tree while we are at an internal node.
174 		 * Nodes cannot be empty; assert this because an empty node
175 		 * would put us into an infinite loop.
176 		 *
177 		 * We can avoid iterating through large swaths of transaction
178 		 * id space if the left and right separators are the same
179 		 * except for their transaction spaces.  We can then skip
180 		 * the node if the left and right transaction spaces are the
181 		 * same sign.  This directly optimizes accesses to files with
182 		 * HUGE transactional histories, such as database files,
183 		 * allowing us to avoid having to iterate through the entire
184 		 * history.
185 		 */
186 		if (node->type == HAMMER_BTREE_TYPE_INTERNAL) {
187 			KKASSERT(node->count != 0);
188 			elm = &node->elms[cursor->index];
189 #if 0
190 			/*
191 			 * temporarily disable this optimization, it needs
192 			 * more of a theoretical review.
193 			 */
194 			if (elm[0].base.obj_id == elm[1].base.obj_id &&
195 			    elm[0].base.rec_type == elm[1].base.rec_type &&
196 			    elm[0].base.key == elm[1].base.key) {
197 				/*
198 				 * Left side transaction space
199 				 */
200 				save_key = cursor->key_beg.key;
201 				cursor->key_beg.key = elm[0].base.key;
202 				r = hammer_btree_cmp(&cursor->key_beg,
203 						     &elm[0].base);
204 				cursor->key_beg.key = save_key;
205 
206 				/*
207 				 * Right side transaction space
208 				 */
209 				save_key = cursor->key_end.key;
210 				cursor->key_end.key = elm[1].base.key;
211 				s = hammer_btree_cmp(&cursor->key_end,
212 						     &elm[1].base);
213 				cursor->key_end.key = save_key;
214 
215 				/*
216 				 * If our range is entirely on one side or
217 				 * the other side we can skip the sub-tree.
218 				 */
219 				if ((r < 0 && s < 0) || (r > 0 && s > 0)) {
220 					++cursor->index;
221 					continue;
222 				}
223 			}
224 #endif
225 			error = hammer_cursor_down(cursor);
226 			if (error)
227 				break;
228 			KKASSERT(cursor->index == 0);
229 			node = cursor->node->ondisk;
230 			continue;
231 		}
232 
233 		/*
234 		 * We are at a leaf.
235 		 *
236 		 * Determine if the record at the cursor has gone beyond the
237 		 * end of our range.  Remember that our key range is inclusive.
238 		 *
239 		 * When iterating we may have to 'pick out' records matching
240 		 * our transaction requirements.  A comparison return of
241 		 * +1 or -1 indicates a transactional record that is too
242 		 * old or too new but does not terminate the search.
243 		 */
244 		elm = &node->elms[cursor->index];
245 		r = hammer_btree_range_cmp(cursor, &elm->base);
246 		if (r == -1 || r == 1) {
247 			++cursor->index;
248 			continue;
249 		}
250 
251 		/*
252 		 * We either found a match or are now out of bounds.
253 		 */
254 		error = r ? ENOENT : 0;
255 		break;
256 	}
257 	return(error);
258 }
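
/*
 * Illustrative sketch (disabled, not part of the build): the kind of scan
 * loop a caller might build on top of hammer_btree_lookup() and
 * hammer_btree_iterate().  This is not lifted from an actual caller;
 * cursor setup is abbreviated and the inclusive key_beg/key_end range is
 * assumed to have been filled in already.
 */
#if 0
static int
demo_scan_range(hammer_cursor_t cursor)
{
	int error;

	error = hammer_btree_lookup(cursor);
	if (error == ENOENT)
		error = hammer_btree_iterate(cursor);
	while (error == 0) {
		/* consume the element at (cursor->node, cursor->index) ... */
		cursor->flags |= HAMMER_CURSOR_ATEDISK;
		error = hammer_btree_iterate(cursor);
	}
	if (error == ENOENT)
		error = 0;	/* clean end of the key range */
	return(error);
}
#endif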
259 
260 /*
261  * Lookup cursor->key_beg.  0 is returned on success, ENOENT if the entry
262  * could not be found, and a fatal error otherwise.
263  *
264  * The cursor is suitably positioned for a deletion on success, and suitably
265  * positioned for an insertion on ENOENT.
266  *
267  * The cursor may begin anywhere, the search will traverse clusters in
268  * either direction to locate the requested element.
269  */
270 int
271 hammer_btree_lookup(hammer_cursor_t cursor)
272 {
273 	int error;
274 
275 	error = btree_search(cursor, 0);
276 	if (error == 0 && cursor->flags)
277 		error = hammer_btree_extract(cursor, cursor->flags);
278 	return(error);
279 }
280 
281 /*
282  * Extract the record and/or data associated with the cursor's current
283  * position.  Any prior record or data stored in the cursor is replaced.
284  * The cursor must be positioned at a leaf node.
285  *
286  * NOTE: Only records can be extracted from internal B-Tree nodes, and
287  *       only for inter-cluster references.  At the moment we only support
288  *	 extractions from leaf nodes.
289  */
290 int
291 hammer_btree_extract(hammer_cursor_t cursor, int flags)
292 {
293 	hammer_node_ondisk_t node;
294 	hammer_btree_elm_t elm;
295 	hammer_cluster_t cluster;
296 	u_int64_t buf_type;
297 	int32_t cloff;
298 	int error;
299 
300 	/*
301 	 * A cluster record type has no data reference; the information
302 	 * is stored directly in the record and B-Tree element.
303 	 *
304 	 * The case where the data reference resolves to the same buffer
305 	 * as the record reference must be handled.
306 	 */
307 	node = cursor->node->ondisk;
308 	KKASSERT(node->type == HAMMER_BTREE_TYPE_LEAF);
309 	elm = &node->elms[cursor->index];
310 	cluster = cursor->node->cluster;
311 	error = 0;
312 
313 	if ((flags & HAMMER_CURSOR_GET_RECORD) && error == 0) {
314 		cloff = elm->leaf.rec_offset;
315 		cursor->record = hammer_bread(cluster, cloff,
316 					      HAMMER_FSBUF_RECORDS, &error,
317 					      &cursor->record_buffer);
318 	} else {
319 		cloff = 0;
320 	}
321 	if ((flags & HAMMER_CURSOR_GET_DATA) && error == 0) {
322 		if ((cloff ^ elm->leaf.data_offset) & ~HAMMER_BUFMASK) {
323 			/*
324 			 * The data is not in the same buffer as the last
325 			 * record we cached, but it could still be embedded
326 			 * in a record.  Note that we may not have loaded the
327 			 * record's buffer above, depending on flags.
328 			 */
329 			if ((elm->leaf.rec_offset ^ elm->leaf.data_offset) &
330 			    ~HAMMER_BUFMASK) {
331 				if (elm->leaf.data_len & HAMMER_BUFMASK)
332 					buf_type = HAMMER_FSBUF_DATA;
333 				else
334 					buf_type = 0;	/* pure data buffer */
335 			} else {
336 				buf_type = HAMMER_FSBUF_RECORDS;
337 			}
338 			cursor->data = hammer_bread(cluster,
339 						  elm->leaf.data_offset,
340 						  buf_type, &error,
341 						  &cursor->data_buffer);
342 		} else {
343 			/*
344 			 * Data in same buffer as record.  Note that we
345 			 * leave any existing data_buffer intact, even
346 			 * though we don't use it in this case, in case
347 			 * other records extracted during an iteration
348 			 * go back to it.
349 			 *
350 			 * Just assume the buffer type is correct.
351 			 */
352 			cursor->data = (void *)
353 				((char *)cursor->record_buffer->ondisk +
354 				 (elm->leaf.data_offset & HAMMER_BUFMASK));
355 		}
356 	}
357 	return(error);
358 }
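
/*
 * Illustrative sketch (disabled, not part of the build): the XOR/mask test
 * used above.  For any power-of-two buffer size, (a ^ b) & ~mask is zero
 * exactly when offsets a and b fall within the same buffer.  The constant
 * below is a made-up example value, not the real HAMMER_BUFMASK.
 */
#if 0
static int
demo_same_buffer(int32_t a, int32_t b)
{
	const int32_t demo_bufmask = 16384 - 1;	/* hypothetical buffer size */

	/* the high bits (the buffer number) are equal iff the XOR clears them */
	return (((a ^ b) & ~demo_bufmask) == 0);
}
#endif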
359 
360 
361 /*
362  * Insert a leaf element into the B-Tree at the current cursor position.
363  * The cursor is positioned such that the element at and beyond the cursor
364  * are shifted to make room for the new record.
365  *
366  * The caller must call hammer_btree_lookup() with the HAMMER_CURSOR_INSERT
367  * flag set and that call must return ENOENT before this function can be
368  * called.
369  *
370  * ENOSPC is returned if there is no room to insert a new record.
371  */
372 int
373 hammer_btree_insert(hammer_cursor_t cursor, hammer_btree_elm_t elm)
374 {
375 	hammer_node_ondisk_t parent;
376 	hammer_node_ondisk_t node;
377 	int i;
378 
379 #if 0
380 	/* HANDLED BY CALLER */
381 	/*
382 	 * Issue a search to get our cursor at the right place.  The search
383 	 * will get us to a leaf node.
384 	 *
385 	 * The search also does some setup for our insert, so there is always
386 	 * room in the leaf.
387 	 */
388 	error = btree_search(cursor, HAMMER_CURSOR_INSERT);
389 	if (error != ENOENT) {
390 		if (error == 0)
391 			error = EEXIST;
392 		return (error);
393 	}
394 #endif
395 
396 	/*
397 	 * Insert the element at the leaf node and update the count in the
398 	 * parent.  It is possible for parent to be NULL, indicating that
399 	 * the root of the B-Tree in the cluster is a leaf.  It is also
400 	 * possible for the leaf to be empty.
401 	 *
402 	 * Remember that the right-hand boundary is not included in the
403 	 * count.
404 	 */
405 	node = cursor->node->ondisk;
406 	i = cursor->index;
407 	KKASSERT(node->type == HAMMER_BTREE_TYPE_LEAF);
408 	KKASSERT(node->count < HAMMER_BTREE_LEAF_ELMS);
409 	if (i != node->count) {
410 		bcopy(&node->elms[i], &node->elms[i+1],
411 		      (node->count - i) * sizeof(*elm));
412 	}
413 	node->elms[i] = *elm;
414 	++node->count;
415 	hammer_modify_node(cursor->node);
416 
417 	/*
418 	 * Adjust the sub-tree count in the parent.  Note that the parent
419 	 * may be in a different cluster.
420 	 */
421 	if (cursor->parent) {
422 		parent = cursor->parent->ondisk;
423 		i = cursor->parent_index;
424 		++parent->elms[i].internal.subtree_count;
425 		KKASSERT(parent->elms[i].internal.subtree_count <= node->count);
426 		hammer_modify_node(cursor->parent);
427 	}
428 	return(0);
429 }
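
/*
 * Illustrative sketch (disabled, not part of the build): the element shift
 * performed above, shown on a plain integer array.  bcopy() copes with the
 * overlapping regions, so moving elements [i..count) up one slot opens a
 * hole at index i for the new element.
 */
#if 0
static void
demo_array_insert(int *elms, int *countp, int i, int value)
{
	if (i != *countp)
		bcopy(&elms[i], &elms[i + 1], (*countp - i) * sizeof(*elms));
	elms[i] = value;
	++*countp;
}
#endif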
430 
431 /*
432  * Delete a record from the B-Tree at the current cursor position.
433  * The cursor is positioned such that the current element is the one
434  * to be deleted.
435  *
436  * On return the cursor will be positioned after the deleted element and
437  * MAY point to an internal node.  It will be suitable for the continuation
438  * of an iteration but not for an insertion or deletion.
439  *
440  * Deletions will attempt to partially rebalance the B-Tree in an upward
441  * direction.  It is possible to end up with empty leaves.  An empty internal
442  * node is impossible (worst case: it has one element pointing to an empty
443  * leaf).
444  */
445 int
446 hammer_btree_delete(hammer_cursor_t cursor)
447 {
448 	hammer_node_ondisk_t ondisk;
449 	hammer_node_t node;
450 	hammer_node_t parent;
451 	hammer_btree_elm_t elm;
452 	int error;
453 	int i;
454 
455 #if 0
456 	/* HANDLED BY CALLER */
457 	/*
458 	 * Locate the leaf element to delete.  The search is also responsible
459 	 * for doing some of the rebalancing work on its way down.
460 	 */
461 	error = btree_search(cursor, HAMMER_CURSOR_DELETE);
462 	if (error)
463 		return (error);
464 #endif
465 
466 	/*
467 	 * Delete the element from the leaf node.
468 	 *
469 	 * Remember that leaf nodes do not have boundaries.
470 	 */
471 	node = cursor->node;
472 	ondisk = node->ondisk;
473 	i = cursor->index;
474 
475 	KKASSERT(ondisk->type == HAMMER_BTREE_TYPE_LEAF);
476 	if (i + 1 != ondisk->count) {
477 		bcopy(&ondisk->elms[i+1], &ondisk->elms[i],
478 		      (ondisk->count - i - 1) * sizeof(ondisk->elms[0]));
479 	}
480 	--ondisk->count;
481 	hammer_modify_node(node);
482 	if (cursor->parent != NULL) {
483 		/*
484 		 * Adjust parent's notion of the leaf's count.  subtree_count
485 		 * is only approximate; it is allowed to be too small but
486 		 * never allowed to be too large.  Make sure we don't drop
487 		 * the count below 0.
488 		 */
489 		parent = cursor->parent;
490 		elm = &parent->ondisk->elms[cursor->parent_index];
491 		if (elm->internal.subtree_count)
492 			--elm->internal.subtree_count;
493 		KKASSERT(elm->internal.subtree_count <= ondisk->count);
494 		hammer_modify_node(parent);
495 	}
496 
497 	/*
498 	 * It is possible, but not desirable, to stop here.  If the element
499 	 * count drops to 0 (which is allowed for a leaf), try to recursively
500 	 * remove the B-Tree node.
501 	 *
502 	 * XXX rebalancing calls would go here too.
503 	 *
504 	 * This may reposition the cursor at one of the parents of the
505 	 * current node.
506 	 */
507 	if (ondisk->count == 0) {
508 		error = btree_remove(cursor);
509 		if (error == EAGAIN)
510 			error = 0;
511 	} else {
512 		error = 0;
513 	}
514 	return(error);
515 }
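
/*
 * Illustrative sketch (disabled, not part of the build): the element
 * removal shift performed above, shown on a plain integer array.  Elements
 * [i+1..count) slide down one slot over the deleted element.
 */
#if 0
static void
demo_array_delete(int *elms, int *countp, int i)
{
	if (i + 1 != *countp)
		bcopy(&elms[i + 1], &elms[i], (*countp - i - 1) * sizeof(*elms));
	--*countp;
}
#endif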
516 
517 /*
518  * PRIMARY B-TREE SEARCH SUPPORT PROCEDURE
519  *
520  * Search a cluster's B-Tree for cursor->key_beg, return the matching node.
521  *
522  * The search begins at the current node and will instantiate a NULL
523  * parent if necessary and if not at the root of the cluster.  On return
524  * parent will be non-NULL unless the cursor is sitting at a root-leaf.
525  *
526  * The search code may be forced to iterate up the tree if the conditions
527  * required for an insertion or deletion are not met.  This does not occur
528  * very often.
529  *
530  * INSERTIONS: The search will split full nodes and leaves on its way down
531  * and guarantee that the leaf it ends up on is not full.
532  *
533  * DELETIONS: The search will rebalance the tree on its way down.
534  *
535  * The search is only guaranteed to end up on a leaf if an error code of 0
536  * is returned, or if inserting and an error code of ENOENT is returned.
537  */
538 static
539 int
540 btree_search(hammer_cursor_t cursor, int flags)
541 {
542 	hammer_node_ondisk_t node;
543 	hammer_cluster_t cluster;
544 	int error;
545 	int i;
546 	int r;
547 
548 	flags |= cursor->flags;
549 
550 	/*
551 	 * Move our cursor up the tree until we find a node whose range covers
552 	 * the key we are trying to locate.  This may move us between
553 	 * clusters.
554 	 *
555 	 * The left bound is inclusive, the right bound is non-inclusive.
556 	 * It is ok to cursor up too far when cursoring across a cluster
557 	 * boundary.
558 	 *
559 	 * First see if we can skip the whole cluster.  hammer_cursor_up()
560 	 * handles both cases but this way we don't check the cluster
561 	 * bounds when going up the tree within a cluster.
562 	 */
563 	cluster = cursor->node->cluster;
564 	while (
565 	    hammer_btree_cmp(&cursor->key_beg, &cluster->clu_btree_beg) < 0 ||
566 	    hammer_btree_cmp(&cursor->key_beg, &cluster->clu_btree_end) >= 0) {
567 		error = hammer_cursor_toroot(cursor);
568 		if (error)
569 			goto done;
570 		error = hammer_cursor_up(cursor, 0);
571 		if (error)
572 			goto done;
573 		cluster = cursor->node->cluster;
574 	}
575 
576 	/*
577 	 * Deal with normal cursoring within a cluster.  The right bound
578 	 * is non-inclusive.  That is, the bounds form a separator.
579 	 */
580 	while (hammer_btree_cmp(&cursor->key_beg, cursor->left_bound) < 0 ||
581 	       hammer_btree_cmp(&cursor->key_beg, cursor->right_bound) >= 0) {
582 		error = hammer_cursor_up(cursor, 0);
583 		if (error)
584 			goto done;
585 	}
586 
587 	/*
588 	 * We better have ended up with a node somewhere, and our second
589 	 * while loop had better not have traversed up a cluster.
590 	 */
591 	KKASSERT(cursor->node != NULL && cursor->node->cluster == cluster);
592 
593 	/*
594 	 * If we are inserting we can't start at a full node if the parent
595 	 * is also full (because there is no way to split the node),
596 	 * continue running up the tree until we hit the root of the
597 	 * root cluster or until the requirement is satisfied.
598 	 *
599 	 * NOTE: These cursor-up's CAN continue to cross cluster boundaries.
600 	 *
601 	 * XXX as an optimization it should be possible to unbalance the tree
602 	 * and stop at the root of the current cluster.
603 	 */
604 	while (flags & HAMMER_CURSOR_INSERT) {
605 		if (btree_node_is_full(cursor->node->ondisk) == 0)
606 			break;
607 		if (cursor->parent == NULL)
608 			break;
609 		if (cursor->parent->ondisk->count != HAMMER_BTREE_INT_ELMS)
610 			break;
611 		error = hammer_cursor_up(cursor, 0);
612 		/* cluster and node may now become stale */
613 		if (error)
614 			goto done;
615 	}
616 	/* cluster = cursor->node->cluster; not needed until next cluster = */
617 
618 #if 0
619 	/*
620 	 * If we are deleting we can't start at an internal node with only
621 	 * one element unless it is the root, because all of our code assumes
622 	 * that internal nodes will never be empty.  Just do this generally
623 	 * for both leaf and internal nodes to get better balance.
624 	 *
625 	 * This handles the case where the cursor is sitting at a leaf and
626 	 * either the leaf or parent contain an insufficient number of
627 	 * elements.
628 	 *
629 	 * NOTE: These cursor-up's CAN continue to cross cluster boundaries.
630 	 *
631 	 * XXX NOTE: Iterations may not set this flag anyway.
632 	 */
633 	while (flags & HAMMER_CURSOR_DELETE) {
634 		if (cursor->node->ondisk->count > 1)
635 			break;
636 		if (cursor->parent == NULL)
637 			break;
638 		KKASSERT(cursor->node->ondisk->count != 0);
639 		error = hammer_cursor_up(cursor, 0);
640 		/* cluster and node may now become stale */
641 		if (error)
642 			goto done;
643 	}
644 #endif
645 
646 /*new_cluster:*/
647 	/*
648 	 * Push down through internal nodes to locate the requested key.
649 	 */
650 	cluster = cursor->node->cluster;
651 	node = cursor->node->ondisk;
652 	while (node->type == HAMMER_BTREE_TYPE_INTERNAL) {
653 #if 0
654 		/*
655 		 * If we are at the root node and deleting, try to collapse
656 		 * all of the root's children into the root.  This is the
657 		 * only point where tree depth is reduced.
658 		 *
659 		 * XXX NOTE: Iterations may not set this flag anyway.
660 		 */
661 		if ((flags & HAMMER_CURSOR_DELETE) && cursor->parent == NULL) {
662 			error = btree_collapse(cursor);
663 			/* node becomes stale after call */
664 			if (error)
665 				goto done;
666 		}
667 		node = cursor->node->ondisk;
668 #endif
669 		/*
670 		 * Scan the node to find the subtree index to push down into.
671 		 * We go one-past, then back-up.
672 		 */
673 		for (i = 0; i < node->count; ++i) {
674 			r = hammer_btree_cmp(&cursor->key_beg,
675 					     &node->elms[i].base);
676 			if (r < 0)
677 				break;
678 		}
679 
680 		/*
681 		 * It is possible for the search to terminate at i == 0,
682 		 * which is to the LEFT of the LEFT boundary but the RIGHT
683 		 * of the parent's boundary on the left of the sub-tree
684 		 * element.  This case can occur due to deletions (see
685 		 * btree_remove()).
686 		 *
687 		 * When this case occurs an ENOENT return is guaranteed but
688 		 * if inserting we must still terminate at a leaf.  The
689 		 * solution is to make the node's left boundary inherit the
690 		 * boundary stored in the parent.
691 		 *
692 		 * When doing this inheritance some fields in 'base' are
693 		 * actually related to the internal element's child
694 		 * specification and not to the key.  These have to be
695 		 * retained.
696 		 */
697 		if (i == 0) {
698 			u_int8_t save;
699 			if ((flags & HAMMER_CURSOR_INSERT) == 0) {
700 				cursor->index = 0;
701 				return(ENOENT);
702 			}
703 			save = node->elms[0].subtree_type;
704 			node->elms[0].base = *cursor->left_bound;
705 			node->elms[0].subtree_type = save;
706 			hammer_modify_node(cursor->node);
707 		} else {
708 			/*
709 			 * The push-down index is now i - 1.
710 			 */
711 			--i;
712 		}
713 		cursor->index = i;
714 
715 		/*
716 		 * Handle insertion and deletion requirements.
717 		 *
718 		 * If inserting split full nodes.  The split code will
719 		 * adjust cursor->node and cursor->index if the current
720 		 * index winds up in the new node.
721 		 */
722 		if (flags & HAMMER_CURSOR_INSERT) {
723 			if (node->count == HAMMER_BTREE_INT_ELMS) {
724 				error = btree_split_internal(cursor);
725 				if (error)
726 					goto done;
727 				/*
728 				 * reload stale pointers
729 				 */
730 				i = cursor->index;
731 				node = cursor->node->ondisk;
732 			}
733 		}
734 
735 #if 0
736 		/*
737 		 * If deleting rebalance - do not allow the child to have
738 		 * just one element or we will not be able to delete it.
739 		 *
740 		 * Neither internal nor leaf nodes (except a root-leaf) are
741 		 * allowed to drop to 0 elements.  (XXX - well, leaf nodes
742 		 * can at the moment).
743 		 *
744 		 * Our separators may have been reorganized after rebalancing,
745 		 * so we have to pop back up and rescan.
746 		 *
747 		 * XXX test for subtree_count < maxelms / 2, minus 1 or 2
748 		 * for hysteresis?
749 		 *
750 		 * XXX NOTE: Iterations may not set this flag anyway.
751 		 */
752 		if (flags & HAMMER_CURSOR_DELETE) {
753 			if (node->elms[i].internal.subtree_count <= 1) {
754 				error = btree_rebalance(cursor);
755 				if (error)
756 					goto done;
757 				/* cursor->index is invalid after call */
758 				goto new_cluster;
759 			}
760 		}
761 #endif
762 
763 		/*
764 		 * Push down (push into new node, existing node becomes
765 		 * the parent).
766 		 */
767 		error = hammer_cursor_down(cursor);
768 		/* node and cluster become stale */
769 		if (error)
770 			goto done;
771 		node = cursor->node->ondisk;
772 		cluster = cursor->node->cluster;
773 	}
774 
775 	/*
776 	 * We are at a leaf, do a linear search of the key array.
777 	 * (XXX do a binary search).  On success the index is set to the
778 	 * matching element, on failure the index is set to the insertion
779 	 * point.
780 	 *
781 	 * Boundaries are not stored in leaf nodes, so the index can wind
782 	 * up to the left of element 0 (index == 0) or past the end of
783 	 * the array (index == node->count).
784 	 */
785 	KKASSERT(node->count <= HAMMER_BTREE_LEAF_ELMS);
786 
787 	for (i = 0; i < node->count; ++i) {
788 		r = hammer_btree_cmp(&cursor->key_beg, &node->elms[i].base);
789 
790 		/*
791 		 * Stop if we've flipped past key_beg
792 		 */
793 		if (r < 0)
794 			break;
795 
796 		/*
797 		 * Return an exact match
798 		 */
799 		if (r == 0) {
800 			cursor->index = i;
801 			error = 0;
802 			goto done;
803 		}
804 	}
805 
806 	/*
807 	 * No exact match was found, i is now at the insertion point.
808 	 *
809 	 * If inserting split a full leaf before returning.  This
810 	 * may have the side effect of adjusting cursor->node and
811 	 * cursor->index.
812 	 */
813 	cursor->index = i;
814 	if ((flags & HAMMER_CURSOR_INSERT) &&
815 	    node->count == HAMMER_BTREE_LEAF_ELMS) {
816 		error = btree_split_leaf(cursor);
817 		/* NOT USED
818 		i = cursor->index;
819 		node = &cursor->node->internal;
820 		*/
821 		if (error)
822 			goto done;
823 	}
824 	error = ENOENT;
825 done:
826 	return(error);
827 }
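
/*
 * Illustrative sketch (disabled, not part of the build): the binary search
 * alluded to by the "XXX do a binary search" note above, shown on a plain
 * key array.  It returns the index of the first key >= the search key,
 * which is the same position the linear scan in btree_search() stops at.
 */
#if 0
static int
demo_lower_bound(int64_t *keys, int count, int64_t key)
{
	int lo = 0;
	int hi = count;

	while (lo < hi) {
		int mid = lo + (hi - lo) / 2;

		if (keys[mid] < key)
			lo = mid + 1;
		else
			hi = mid;
	}
	return(lo);
}
#endif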
828 
829 
830 /************************************************************************
831  *			   SPLITTING AND MERGING 			*
832  ************************************************************************
833  *
834  * These routines do all the dirty work required to split and merge nodes.
835  */
836 
837 /*
838  * Split an internal node into two nodes and move the separator at the split
839  * point to the parent.  Note that the parent's parent's element pointing
840  * to our parent will have an incorrect subtree_count (we don't update it).
841  * It will be low, which is ok.
842  *
843  * (cursor->node, cursor->index) indicates the element the caller intends
844  * to push into.  We will adjust node and index if that element winds
845  * up in the split node.
846  *
847  * If we are at the root of a cluster a new root must be created with two
848  * elements, one pointing to the original root and one pointing to the
849  * newly allocated split node.
850  *
851  * NOTE! Being at the root of a cluster is different from being at the
852  * root of the root cluster.  cursor->parent will not be NULL and
853  * cursor->node->ondisk.parent must be tested against 0.  Theoretically
854  * we could propagate the algorithm into the parent and deal with multiple
855  * 'roots' in the cluster header, but it's easier not to.
856  */
857 static
858 int
859 btree_split_internal(hammer_cursor_t cursor)
860 {
861 	hammer_node_ondisk_t ondisk;
862 	hammer_node_t node;
863 	hammer_node_t parent;
864 	hammer_node_t new_node;
865 	hammer_btree_elm_t elm;
866 	hammer_btree_elm_t parent_elm;
867 	int parent_index;
868 	int made_root;
869 	int split;
870 	int error;
871 	int i;
872 	const int esize = sizeof(*elm);
873 
874 	/*
875 	 * We are splitting but elms[split] will be promoted to the parent,
876 	 * leaving the right hand node with one less element.  If the
877 	 * insertion point will be on the left-hand side adjust the split
878 	 * point to give the right hand side one additional element.
879 	 */
880 	node = cursor->node;
881 	ondisk = node->ondisk;
882 	split = (ondisk->count + 1) / 2;
883 	if (cursor->index <= split)
884 		--split;
885 	error = 0;
886 
887 	/*
888 	 * If we are at the root of the cluster, create a new root node with
889 	 * 1 element and split normally.  Avoid making major modifications
890 	 * until we know the whole operation will work.
891 	 *
892 	 * The root of the cluster is different from the root of the root
893 	 * cluster.  Use the node's on-disk structure's parent offset to
894 	 * detect the case.
895 	 */
896 	if (ondisk->parent == 0) {
897 		parent = hammer_alloc_btree(node->cluster, &error);
898 		if (parent == NULL)
899 			return(error);
900 		hammer_lock_ex(&parent->lock);
901 		ondisk = parent->ondisk;
902 		ondisk->count = 1;
903 		ondisk->parent = 0;
904 		ondisk->type = HAMMER_BTREE_TYPE_INTERNAL;
905 		ondisk->elms[0].base = node->cluster->clu_btree_beg;
906 		ondisk->elms[0].internal.subtree_type = node->ondisk->type;
907 		ondisk->elms[0].internal.subtree_offset = node->node_offset;
908 		ondisk->elms[1].base = node->cluster->clu_btree_end;
909 		made_root = 1;
910 		parent_index = 0;	/* index of current node in parent */
911 	} else {
912 		made_root = 0;
913 		parent = cursor->parent;
914 		parent_index = cursor->parent_index;
915 		KKASSERT(parent->cluster == node->cluster);
916 	}
917 
918 	/*
919 	 * Split node into new_node at the split point.
920 	 *
921 	 *  B O O O P N N B	<-- P = node->elms[split]
922 	 *   0 1 2 3 4 5 6	<-- subtree indices
923 	 *
924 	 *       x x P x x
925 	 *        s S S s
926 	 *         /   \
927 	 *  B O O O B    B N N B	<--- inner boundary points are 'P'
928 	 *   0 1 2 3      4 5 6
929 	 *
930 	 */
931 	new_node = hammer_alloc_btree(node->cluster, &error);
932 	if (new_node == NULL) {
933 		if (made_root) {
934 			hammer_unlock(&parent->lock);
935 			hammer_free_btree(parent->cluster, parent->node_offset);
936 			hammer_rel_node(parent);
937 		}
938 		return(error);
939 	}
940 	hammer_lock_ex(&new_node->lock);
941 
942 	/*
943 	 * Create the new node.  P becomes the left-hand boundary in the
944 	 * new node.  Copy the right-hand boundary as well.
945 	 *
946 	 * elm is the new separator.
947 	 */
948 	ondisk = node->ondisk;
949 	elm = &ondisk->elms[split];
950 	bcopy(elm, &new_node->ondisk->elms[0],
951 	      (ondisk->count - split + 1) * esize);
952 	new_node->ondisk->count = ondisk->count - split;
953 	new_node->ondisk->parent = parent->node_offset;
954 	new_node->ondisk->type = HAMMER_BTREE_TYPE_INTERNAL;
955 	KKASSERT(ondisk->type == new_node->ondisk->type);
956 
957 	/*
958 	 * Cleanup the original node.  P becomes the new boundary, its
959 	 * subtree_offset was moved to the new node.  If we had created
960 	 * a new root its parent pointer may have changed.
961 	 */
962 	elm->internal.subtree_offset = 0;
963 	ondisk->count = split;
964 
965 	/*
966 	 * Insert the separator into the parent, fixup the parent's
967 	 * reference to the original node, and reference the new node.
968 	 * The separator is P.
969 	 *
970 	 * Remember that base.count does not include the right-hand boundary.
971 	 */
972 	ondisk = parent->ondisk;
973 	ondisk->elms[parent_index].internal.subtree_count = split;
974 	parent_elm = &ondisk->elms[parent_index+1];
975 	bcopy(parent_elm, parent_elm + 1,
976 	      (ondisk->count - parent_index) * esize);
977 	parent_elm->internal.base = elm->base;	/* separator P */
978 	parent_elm->internal.subtree_offset = new_node->node_offset;
979 	parent_elm->internal.subtree_count = new_node->ondisk->count;
980 	parent_elm->internal.subtree_type = new_node->ondisk->type;
981 	++ondisk->count;
982 
983 	/*
984 	 * The children of new_node need their parent pointer set to new_node.
985 	 */
986 	for (i = 0; i < new_node->ondisk->count; ++i) {
987 		elm = &new_node->ondisk->elms[i];
988 		error = btree_set_parent(new_node, elm);
989 		if (error) {
990 			panic("btree_split_internal: btree-fixup problem");
991 		}
992 	}
993 
994 	/*
995 	 * The cluster's root pointer may have to be updated.
996 	 */
997 	if (made_root) {
998 		node->cluster->ondisk->clu_btree_root = parent->node_offset;
999 		hammer_modify_cluster(node->cluster);
1000 		node->ondisk->parent = parent->node_offset;
1001 		if (cursor->parent) {
1002 			hammer_unlock(&cursor->parent->lock);
1003 			hammer_rel_node(cursor->parent);
1004 		}
1005 		cursor->parent = parent;	/* lock'd and ref'd */
1006 	}
1007 
1008 	hammer_modify_node(node);
1009 	hammer_modify_node(new_node);
1010 	hammer_modify_node(parent);
1011 
1012 	/*
1013 	 * Ok, now adjust the cursor depending on which element the original
1014 	 * index was pointing at.  If we are >= the split point the push node
1015 	 * is now in the new node.
1016 	 *
1017 	 * NOTE: If we are at the split point itself we cannot stay with the
1018 	 * original node because the push index will point at the right-hand
1019 	 * boundary, which is illegal.
1020 	 *
1021 	 * NOTE: The cursor's parent or parent_index must be adjusted for
1022 	 * the case where a new parent (new root) was created, and the case
1023 	 * where the cursor is now pointing at the split node.
1024 	 */
1025 	if (cursor->index >= split) {
1026 		cursor->parent_index = parent_index + 1;
1027 		cursor->index -= split;
1028 		hammer_unlock(&cursor->node->lock);
1029 		hammer_rel_node(cursor->node);
1030 		cursor->node = new_node;	/* locked and ref'd */
1031 	} else {
1032 		cursor->parent_index = parent_index;
1033 		hammer_unlock(&new_node->lock);
1034 		hammer_rel_node(new_node);
1035 	}
1036 
1037 	/*
1038 	 * Fixup left and right bounds
1039 	 */
1040 	parent_elm = &parent->ondisk->elms[cursor->parent_index];
1041 	cursor->left_bound = &parent_elm[0].internal.base;
1042 	cursor->right_bound = &parent_elm[1].internal.base;
1043 
1044 	return (0);
1045 }
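
/*
 * Illustrative sketch (disabled, not part of the build): the split-point
 * arithmetic shared by both split routines.  With count == 7, for example,
 * the split starts at 4 and is pulled back to 3 when the insertion index
 * lands on the left half, so the right-hand node is not left short.
 */
#if 0
static int
demo_split_point(int count, int insert_index)
{
	int split = (count + 1) / 2;

	if (insert_index <= split)
		--split;
	return(split);
}
#endif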
1046 
1047 /*
1048  * Same as the above, but splits a full leaf node.
1049  */
1050 static
1051 int
1052 btree_split_leaf(hammer_cursor_t cursor)
1053 {
1054 	hammer_node_ondisk_t ondisk;
1055 	hammer_node_t parent;
1056 	hammer_node_t leaf;
1057 	hammer_node_t new_leaf;
1058 	hammer_btree_elm_t elm;
1059 	hammer_btree_elm_t parent_elm;
1060 	int parent_index;
1061 	int made_root;
1062 	int split;
1063 	int error;
1064 	const size_t esize = sizeof(*elm);
1065 
1066 	/*
1067 	 * Calculate the split point.  If the insertion point will be on
1068 	 * the left-hand side adjust the split point to give the right
1069 	 * hand side one additional element.
1070 	 */
1071 	leaf = cursor->node;
1072 	ondisk = leaf->ondisk;
1073 	split = (ondisk->count + 1) / 2;
1074 	if (cursor->index <= split)
1075 		--split;
1076 	error = 0;
1077 
1078 	/*
1079 	 * If we are at the root of the tree, create a new root node with
1080 	 * 1 element and split normally.  Avoid making major modifications
1081 	 * until we know the whole operation will work.
1082 	 */
1083 	if (ondisk->parent == 0) {
1084 		parent = hammer_alloc_btree(leaf->cluster, &error);
1085 		if (parent == NULL)
1086 			return(error);
1087 		hammer_lock_ex(&parent->lock);
1088 		ondisk = parent->ondisk;
1089 		ondisk->count = 1;
1090 		ondisk->parent = 0;
1091 		ondisk->type = HAMMER_BTREE_TYPE_INTERNAL;
1092 		ondisk->elms[0].base = leaf->cluster->clu_btree_beg;
1093 		ondisk->elms[0].internal.subtree_type = leaf->ondisk->type;
1094 		ondisk->elms[0].internal.subtree_offset = leaf->node_offset;
1095 		ondisk->elms[1].base = leaf->cluster->clu_btree_end;
1096 		made_root = 1;
1097 		parent_index = 0;	/* insertion point in parent */
1098 	} else {
1099 		made_root = 0;
1100 		parent = cursor->parent;
1101 		parent_index = cursor->parent_index;
1102 		KKASSERT(parent->cluster == leaf->cluster);
1103 	}
1104 
1105 	/*
1106 	 * Split leaf into new_leaf at the split point.  Select a separator
1107 	 * value in between the two leaves but with a bias towards the right
1108 	 * leaf since comparisons use an 'elm >= separator' inequality.
1109 	 *
1110 	 *  L L L L L L L L
1111 	 *
1112 	 *       x x P x x
1113 	 *        s S S s
1114 	 *         /   \
1115 	 *  L L L L     L L L L
1116 	 */
1117 	new_leaf = hammer_alloc_btree(leaf->cluster, &error);
1118 	if (new_leaf == NULL) {
1119 		if (made_root) {
1120 			hammer_unlock(&parent->lock);
1121 			hammer_free_btree(parent->cluster, parent->node_offset);
1122 			hammer_rel_node(parent);
1123 		}
1124 		return(error);
1125 	}
1126 	hammer_lock_ex(&new_leaf->lock);
1127 
1128 	/*
1129 	 * Create the new node.  P becomes the left-hand boundary in the
1130 	 * new node.  Copy the right-hand boundary as well.
1131 	 */
1132 	ondisk = leaf->ondisk;
1133 	elm = &ondisk->elms[split];
1134 	bcopy(elm, &new_leaf->ondisk->elms[0], (ondisk->count - split) * esize);
1135 	new_leaf->ondisk->count = ondisk->count - split;
1136 	new_leaf->ondisk->parent = parent->node_offset;
1137 	new_leaf->ondisk->type = HAMMER_BTREE_TYPE_LEAF;
1138 	KKASSERT(ondisk->type == new_leaf->ondisk->type);
1139 
1140 	/*
1141 	 * Cleanup the original node.  Because this is a leaf node and
1142 	 * leaf nodes do not have a right-hand boundary, there
1143 	 * aren't any special edge cases to clean up.  We just fixup the
1144 	 * count.
1145 	 */
1146 	ondisk->count = split;
1147 
1148 	/*
1149 	 * Insert the separator into the parent, fixup the parent's
1150 	 * reference to the original node, and reference the new node.
1151 	 * The separator is P.
1152 	 *
1153 	 * Remember that base.count does not include the right-hand boundary.
1154 	 * We are copying parent_index+1 to parent_index+2, not +0 to +1.
1155 	 */
1156 	ondisk = parent->ondisk;
1157 	ondisk->elms[parent_index].internal.subtree_count = split;
1158 	parent_elm = &ondisk->elms[parent_index+1];
1159 	if (parent_index + 1 != ondisk->count) {
1160 		bcopy(parent_elm, parent_elm + 1,
1161 		      (ondisk->count - parent_index - 1) * esize);
1162 	}
1163 	hammer_make_separator(&elm[-1].base, &elm[0].base, &parent_elm->base);
1164 	parent_elm->internal.subtree_offset = new_leaf->node_offset;
1165 	parent_elm->internal.subtree_count = new_leaf->ondisk->count;
1166 	parent_elm->internal.subtree_type = new_leaf->ondisk->type;
1167 	++ondisk->count;
1168 
1169 	/*
1170 	 * The cluster's root pointer may have to be updated.
1171 	 */
1172 	if (made_root) {
1173 		leaf->cluster->ondisk->clu_btree_root = parent->node_offset;
1174 		hammer_modify_cluster(leaf->cluster);
1175 		leaf->ondisk->parent = parent->node_offset;
1176 		if (cursor->parent) {
1177 			hammer_unlock(&cursor->parent->lock);
1178 			hammer_rel_node(cursor->parent);
1179 		}
1180 		cursor->parent = parent;	/* lock'd and ref'd */
1181 	}
1182 
1183 	hammer_modify_node(leaf);
1184 	hammer_modify_node(new_leaf);
1185 	hammer_modify_node(parent);
1186 
1187 	/*
1188 	 * Ok, now adjust the cursor depending on which element the original
1189 	 * index was pointing at.  If we are >= the split point the push node
1190 	 * is now in the new node.
1191 	 *
1192 	 * NOTE: If we are at the split point itself we cannot stay with the
1193 	 * original node because the push index will point at the right-hand
1194 	 * boundary, which is illegal.
1195 	 */
1196 	if (cursor->index >= split) {
1197 		cursor->parent_index = parent_index + 1;
1198 		cursor->index -= split;
1199 		hammer_unlock(&cursor->node->lock);
1200 		hammer_rel_node(cursor->node);
1201 		cursor->node = new_leaf;
1202 	} else {
1203 		cursor->parent_index = parent_index;
1204 		hammer_unlock(&new_leaf->lock);
1205 		hammer_rel_node(new_leaf);
1206 	}
1207 
1208 	/*
1209 	 * Fixup left and right bounds
1210 	 */
1211 	parent_elm = &parent->ondisk->elms[cursor->parent_index];
1212 	cursor->left_bound = &parent_elm[0].internal.base;
1213 	cursor->right_bound = &parent_elm[1].internal.base;
1214 
1215 	return (0);
1216 }
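
/*
 * Illustrative sketch (disabled, not part of the build): a one-dimensional
 * analogue of the separator selection.  The real hammer_make_separator()
 * works on full base elements; here a single integer key stands in for an
 * element.  Any S with left < S <= right works, and biasing toward the
 * right-hand key matches the 'elm >= separator' comparison noted above.
 */
#if 0
static int64_t
demo_make_separator(int64_t left_key, int64_t right_key)
{
	/* midpoint rounded up stays > left_key whenever right_key > left_key */
	return(left_key + (right_key - left_key + 1) / 2);
}
#endif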
1217 
1218 /*
1219  * Attempt to remove the empty B-Tree node at (cursor->node).  Returns 0
1220  * on success, EAGAIN if we could not acquire the necessary locks, or some
1221  * other error.
1222  *
1223  * On return the cursor may end up pointing at an internal node, suitable
1224  * for further iteration but not for insertion or deletion.
1225  *
1226  * cursor->node may be an internal node or a leaf node.
1227  */
1228 int
1229 btree_remove(hammer_cursor_t cursor)
1230 {
1231 	hammer_node_ondisk_t ondisk;
1232 	hammer_btree_elm_t elm;
1233 	hammer_node_t save;
1234 	hammer_node_t node;
1235 	hammer_node_t parent;
1236 	int error;
1237 	int i;
1238 
1239 	/*
1240 	 * If we are at the root of the root cluster there is nothing to
1241 	 * remove, but an internal node at the root of a cluster is not
1242 	 * allowed to be empty so convert it to a leaf node.
1243 	 */
1244 	if (cursor->parent == NULL) {
1245 		ondisk = cursor->node->ondisk;
1246 		KKASSERT(ondisk->parent == 0);
1247 		ondisk->type = HAMMER_BTREE_TYPE_LEAF;
1248 		ondisk->count = 0;
1249 		hammer_modify_node(cursor->node);
1250 		kprintf("EMPTY ROOT OF ROOT CLUSTER -> LEAF\n");
1251 		return(0);
1252 	}
1253 
1254 	/*
1255 	 * Retain a reference to cursor->node, ex-lock again (2 locks now)
1256 	 * so we do not lose the lock when we cursor around.
1257 	 */
1258 	save = cursor->node;
1259 	hammer_ref_node(save);
1260 	hammer_lock_ex(&save->lock);
1261 
1262 	/*
1263 	 * We need to be able to lock the parent of the parent.  Do this
1264 	 * non-blocking and return EAGAIN if the lock cannot be acquired.
1265 	 * Non-blocking is required in order to avoid a deadlock.
1266 	 *
1267 	 * After we cursor up, parent is moved to node and the new parent
1268 	 * is the parent of the parent.
1269 	 */
1270 	error = hammer_cursor_up(cursor, 1);
1271 	if (error) {
1272 		kprintf("BTREE_REMOVE: Cannot lock parent\n");
1273 		hammer_unlock(&save->lock);
1274 		hammer_rel_node(save);
1275 		return(error);
1276 	}
1277 
1278 	/*
1279 	 * At this point we want to remove the element at (node, index),
1280 	 * which is now the (original) parent pointing to the saved node.
1281 	 * Removing the element allows us to then free the node it was
1282 	 * pointing to.
1283 	 *
1284 	 * However, an internal node is not allowed to have 0 elements, so
1285 	 * if the count would drop to 0 we have to recurse.  It is possible
1286 	 * for the recursion to fail.
1287 	 *
1288 	 * NOTE: The cursor is in an indeterminate position after recursing,
1289 	 * but will still be suitable for an iteration.
1290 	 */
1291 	node = cursor->node;
1292 	KKASSERT(node->ondisk->count > 0);
1293 	if (node->ondisk->count == 1) {
1294 		error = btree_remove(cursor);
1295 		if (error == 0) {
1296 			kprintf("BTREE_REMOVE: Successful!\n");
1297 			hammer_flush_node(save);
1298 			hammer_free_btree(save->cluster, save->node_offset);
1299 		} else {
1300 			kprintf("BTREE_REMOVE: Recursion failed %d\n", error);
1301 		}
1302 		hammer_unlock(&save->lock);
1303 		hammer_rel_node(save);
1304 		return(error);
1305 	}
1306 
1307 	/*
1308 	 * Remove the element at (node, index) and adjust the parent's
1309 	 * subtree_count.
1310 	 *
1311 	 * NOTE!  Removing the element will cause the left-hand boundary
1312 	 * to no longer match the child node:
1313 	 *
1314 	 * LxMxR (left, middle, right).  If the first 'x' element is
1315 	 * removed then you get LxR and L no longer matches the left-hand
1316 	 * boundary of the sub-node pointed to by what used to be the
1317 	 * second x.  This case is handled in btree_search().
1318 	 */
1319 #if 0
1320 	kprintf("BTREE_REMOVE: Removing element %d\n", cursor->index);
1321 #endif
1322 	ondisk = node->ondisk;
1323 	i = cursor->index;
1324 	bcopy(&ondisk->elms[i+1], &ondisk->elms[i],
1325 	      (ondisk->count - i) * sizeof(ondisk->elms[0]));
1326 	--ondisk->count;
1327 	hammer_modify_node(cursor->node);
1328 
1329 	/*
1330 	 * Adjust the parent-parent's (now parent) reference to the parent
1331 	 * (now node).
1332 	 */
1333 	if ((parent = cursor->parent) != NULL) {
1334 		elm = &parent->ondisk->elms[cursor->parent_index];
1335 		if (elm->internal.subtree_count != ondisk->count) {
1336 			elm->internal.subtree_count = ondisk->count;
1337 			hammer_modify_node(parent);
1338 		}
1339 		if (elm->subtree_type != HAMMER_BTREE_TYPE_CLUSTER &&
1340 		    elm->subtree_type != ondisk->type) {
1341 			elm->subtree_type = ondisk->type;
1342 			hammer_modify_node(parent);
1343 		}
1344 	}
1345 
1346 	/*
1347 	 * Free the saved node.
1348 	 */
1349 	hammer_flush_node(save);
1350 	hammer_free_btree(save->cluster, save->node_offset);
1351 	hammer_unlock(&save->lock);
1352 	hammer_rel_node(save);
1353 	return(0);
1354 }
1355 
1356 /*
1357  * The child represented by the element in the internal node (node) needs
1358  * to have its parent pointer adjusted.
1359  */
1360 static
1361 int
1362 btree_set_parent(hammer_node_t node, hammer_btree_elm_t elm)
1363 {
1364 	hammer_volume_t volume;
1365 	hammer_cluster_t cluster;
1366 	hammer_node_t child;
1367 	int error;
1368 
1369 	error = 0;
1370 
1371 	switch(elm->internal.subtree_type) {
1372 	case HAMMER_BTREE_TYPE_LEAF:
1373 	case HAMMER_BTREE_TYPE_INTERNAL:
1374 		child = hammer_get_node(node->cluster,
1375 					elm->internal.subtree_offset, &error);
1376 		if (error == 0) {
1377 			hammer_lock_ex(&child->lock);
1378 			child->ondisk->parent = node->node_offset;
1379 			hammer_modify_node(child);
1380 			hammer_unlock(&child->lock);
1381 			hammer_rel_node(child);
1382 		}
1383 		break;
1384 	case HAMMER_BTREE_TYPE_CLUSTER:
1385 		volume = hammer_get_volume(node->cluster->volume->hmp,
1386 					elm->internal.subtree_volno, &error);
1387 		if (error)
1388 			break;
1389 		cluster = hammer_get_cluster(volume,
1390 					elm->internal.subtree_cluid,
1391 					&error, 0);
1392                 hammer_rel_volume(volume, 0);
1393 		if (error)
1394 			break;
1395 		hammer_lock_ex(&cluster->io.lock);
1396 		cluster->ondisk->clu_btree_parent_offset = node->node_offset;
1397 		hammer_unlock(&cluster->io.lock);
1398 		KKASSERT(cluster->ondisk->clu_btree_parent_clu_no ==
1399 			 node->cluster->clu_no);
1400 		KKASSERT(cluster->ondisk->clu_btree_parent_vol_no ==
1401 			 node->cluster->volume->vol_no);
1402 		hammer_modify_cluster(cluster);
1403 		hammer_rel_cluster(cluster, 0);
1404 		break;
1405 	default:
1406 		hammer_print_btree_elm(elm, HAMMER_BTREE_TYPE_INTERNAL, -1);
1407 		panic("btree_set_parent: bad subtree_type");
1408 		break; /* NOT REACHED */
1409 	}
1410 	return(error);
1411 }
1412 
1413 #if 0
1414 
1415 /*
1416  * This routine is called on the internal node (node) prior to recursing down
1417  * through (node, index) when the node referenced by (node, index) MIGHT
1418  * have too few elements for the caller to perform a deletion.
1419  *
1420  * cursor->index is invalid on return because the separators may have gotten
1421  * adjusted, the caller must rescan the node's elements.  The caller may set
1422  * cursor->index to -1 if it wants us to do a general rebalancing.
1423  *
1424  * This routine rebalances the children of the (node), collapsing children
1425  * together if possible.  On return each child will have at least L/2-1
1426  * elements unless the node only has one child.
1427  *
1428  * NOTE: Because we do not update the parent's parent in the split code,
1429  * the subtree_count used by the caller may be incorrect.  We correct it
1430  * here.  Also note that we cannot change the depth of the tree's leaf
1431  * nodes here (see btree_collapse()).
1432  *
1433  * NOTE: We make no attempt to rebalance inter-cluster elements.
1434  */
1435 static
1436 int
1437 btree_rebalance(hammer_cursor_t cursor)
1438 {
1439 	hammer_node_ondisk_t ondisk;
1440 	hammer_node_t node;
1441 	hammer_node_t children[HAMMER_BTREE_INT_ELMS];
1442 	hammer_node_t child;
1443 	hammer_btree_elm_t elm;
1444 	hammer_btree_elm_t elms;
1445 	int i, j, n, nelms, goal;
1446 	int maxelms, halfelms;
1447 	int error;
1448 
1449 	/*
1450 	 * If the elm being recursed through is an inter-cluster reference,
1451 	 * don't worry about it.
1452 	 */
1453 	ondisk = cursor->node->ondisk;
1454 	elm = &ondisk->elms[cursor->index];
1455 	if (elm->internal.subtree_type == HAMMER_BTREE_TYPE_CLUSTER)
1456 		return(0);
1457 
1458 	KKASSERT(elm->internal.subtree_offset != 0);
1459 	error = 0;
1460 
1461 	/*
1462 	 * Load the children of node and do any necessary corrections
1463 	 * to subtree_count.  subtree_count may be too low due to the
1464 	 * way insertions split nodes.  Get a count of the total number
1465 	 * of actual elements held by our children.
1466 	 */
1467 	error = 0;
1468 
1469 	for (i = n = 0; i < node->base.count; ++i) {
1470 		struct hammer_btree_internal_elm *elm;
1471 
1472 		elm = &node->elms[i];
1473 		children[i] = NULL;
1474 		child_buffer[i] = NULL;	/* must be preinitialized for bread */
1475 		if (elm->subtree_offset == 0)
1476 			continue;
1477 		child = hammer_bread(cursor->cluster, elm->subtree_offset,
1478 				     HAMMER_FSBUF_BTREE, &error,
1479 				     &child_buffer[i], XXX);
1480 		children[i] = child;
1481 		if (child == NULL)
1482 			continue;
1483 		XXX
1484 		KKASSERT(node->base.subtype == child->base.type);
1485 
1486 		/*
1487 		 * Accumulate n for a good child, update the node's count
1488 		 * if it was wrong.
1489 		 */
1490 		if (node->elms[i].subtree_count != child->base.count) {
1491 			node->elms[i].subtree_count = child->base.count;
1492 		}
1493 		n += node->elms[i].subtree_count;
1494 	}
1495 	if (error)
1496 		goto failed;
1497 
1498 	/*
1499 	 * Collect all the children's elements together
1500 	 */
1501 	nelms = n;
1502 	elms = kmalloc(sizeof(*elms) * (nelms + 1), M_HAMMER, M_WAITOK|M_ZERO);
1503 	for (i = n = 0; i < node->base.count; ++i) {
1504 		child = children[i];
1505 		for (j = 0; j < child->base.count; ++j) {
1506 			elms[n].owner = child;
1507 			if (node->base.subtype == HAMMER_BTREE_TYPE_LEAF)
1508 				elms[n].u.leaf = child->leaf.elms[j];
1509 			else
1510 				elms[n].u.internal = child->internal.elms[j];
1511 			++n;
1512 		}
1513 	}
1514 	KKASSERT(n == nelms);
1515 
1516 	/*
1517 	 * Store a boundary in the elms array to ease the code below.  This
1518 	 * is only used if the children are internal nodes.
1519 	 */
1520 	elms[n].u.internal = node->elms[i];
1521 
1522 	/*
1523 	 * Calculate the number of elements each child should have (goal) by
1524 	 * reducing the number of elements until we achieve at least
1525 	 * halfelms - 1 per child, unless we are a degenerate case.
1526 	 */
1527 	maxelms = btree_max_elements(node->base.subtype);
1528 	halfelms = maxelms / 2;
1529 
1530 	goal = halfelms - 1;
1531 	while (i && n / i < goal)
1532 		--i;
1533 
1534 	/*
1535 	 * Now rebalance using the specified goal
1536 	 */
1537 	for (i = n = 0; i < node->base.count; ++i) {
1538 		struct hammer_buffer *subchild_buffer = NULL;
1539 		struct hammer_btree_internal_node *subchild;
1540 
1541 		child = children[i];
1542 		for (j = 0; j < goal && n < nelms; ++j) {
1543 			if (node->base.subtype == HAMMER_BTREE_TYPE_LEAF) {
1544 				child->leaf.elms[j] = elms[n].u.leaf;
1545 			} else {
1546 				child->internal.elms[j] = elms[n].u.internal;
1547 			}
1548 
1549 			/*
1550 			 * If the element's parent has changed we have to
1551 			 * update the parent pointer.  This is somewhat
1552 			 * expensive.
1553 			 */
1554 			if (elms[n].owner != child &&
1555 			    node->base.subtype == HAMMER_BTREE_TYPE_INTERNAL) {
1556 				subchild = hammer_bread(cursor->cluster,
1557 							elms[n].u.internal.subtree_offset,
1558 							HAMMER_FSBUF_BTREE,
1559 							&error,
1560 							&subchild_buffer, XXX);
1561 				if (subchild) {
1562 					subchild->base.parent =
1563 					    hammer_bclu_offset(child_buffer[i],
1564 								child);
1565 					hammer_modify_buffer(subchild_buffer);
1566 				}
1567 				/* XXX error */
1568 			}
1569 			++n;
1570 		}
1571 		/*
1572 		 * Set right boundary if the children are internal nodes.
1573 		 */
1574 		if (node->base.subtype == HAMMER_BTREE_TYPE_INTERNAL)
1575 			child->internal.elms[j] = elms[n].u.internal;
1576 		child->base.count = j;
1577 		hammer_modify_buffer(child_buffer[i]);
1578 		if (subchild_buffer)
1579 			hammer_put_buffer(subchild_buffer, 0);
1580 
1581 		/*
1582 		 * If we have run out of elements, break out
1583 		 */
1584 		if (n == nelms)
1585 			break;
1586 	}
1587 
1588 	/*
1589 	 * Physically destroy any left-over children.  These children's
1590 	 * elements have been packed into prior children.  The node's
1591 	 * right hand boundary and count gets shifted to index i.
1592 	 *
1593 	 * The subtree count in the node's parent MUST be updated because
1594 	 * we are removing elements.  The subtree_count field is allowed to
1595 	 * be too small, but not too large!
1596 	 */
1597 	if (i != node->base.count) {
1598 		n = i;
1599 		node->elms[n] = node->elms[node->base.count];
1600 		while (i < node->base.count) {
1601 			hammer_free_btree_ptr(child_buffer[i], children[i]);
1602 			hammer_put_buffer(child_buffer[i], 0);
1603 			++i;
1604 		}
1605 		node->base.count = n;
1606 		if (cursor->parent) {
1607 			cursor->parent->elms[cursor->parent_index].subtree_count = n;
1608 			hammer_modify_buffer(cursor->parent_buffer);
1609 		}
1610 	}
1611 
1612 	kfree(elms, M_HAMMER);
1613 failed:
1614 	hammer_modify_buffer(cursor->node_buffer);
1615 	for (i = 0; i < node->base.count; ++i) {
1616 		if (child_buffer[i])
1617 			hammer_put_buffer(child_buffer[i], 0);
1618 	}
1619 	return (error);
1620 }
1621 
1622 /*
1623  * This routine is only called if the cursor is at the root node and the
1624  * root node is an internal node.  We attempt to collapse the root node
1625  * by replacing it with all of its children, reducing tree depth by one.
1626  *
1627  * This is the only way to reduce tree depth in a HAMMER filesystem.
1628  * Note that all leaf nodes are at the same depth.
1629  *
1630  * This is a fairly expensive operation because we not only have to load
1631  * the root's children, we also have to scan each child and adjust the
1632  * parent offset for each element in each child.  Nasty all around.
1633  */
1634 static
1635 int
1636 btree_collapse(hammer_cursor_t cursor)
1637 {
1638 	hammer_btree_node_ondisk_t root, child;
1639 	hammer_btree_node_ondisk_t children[HAMMER_BTREE_INT_ELMS];
1640 	struct hammer_buffer *child_buffer[HAMMER_BTREE_INT_ELMS];
1641 	int count;
1642 	int i, j, n;
1643 	int root_modified;
1644 	int error;
1645 	int32_t root_offset;
1646 	u_int8_t subsubtype;
1647 
1648 	root = cursor->node;
1649 	count = root->base.count;
1650 	root_offset = hammer_bclu_offset(cursor->node_buffer, root);
1651 
1652 	/*
1653 	 * Sum up the number of children each element has.  This value is
1654 	 * only approximate due to the way the insertion code works.  It
1655 	 * may be too small but it will never be too large.
1656 	 *
1657 	 * Quickly terminate the collapse if the elements have too many
1658 	 * children.
1659 	 */
1660 	KKASSERT(root->base.parent == 0);	/* must be root node */
1661 	KKASSERT(root->base.type == HAMMER_BTREE_TYPE_INTERNAL);
1662 	KKASSERT(count <= HAMMER_BTREE_INT_ELMS);
1663 
1664 	for (i = n = 0; i < count; ++i) {
1665 		n += root->internal.elms[i].subtree_count;
1666 	}
1667 	if (n > btree_max_elements(root->base.subtype))
1668 		return(0);
1669 
1670 	/*
1671 	 * Iterate through the elements again and correct the subtree_count.
1672 	 * Terminate the collapse if we wind up with too many.
1673 	 */
1674 	error = 0;
1675 	root_modified = 0;
1676 
1677 	for (i = n = 0; i < count; ++i) {
1678 		struct hammer_btree_internal_elm *elm;
1679 
1680 		elm = &root->internal.elms[i];
1681 		child_buffer[i] = NULL;
1682 		children[i] = NULL;
1683 		if (elm->subtree_offset == 0)
1684 			continue;
1685 		child = hammer_bread(cursor->cluster, elm->subtree_offset,
1686 				     HAMMER_FSBUF_BTREE, &error,
1687 				     &child_buffer[i], XXX);
1688 		children[i] = child;
1689 		if (child == NULL)
1690 			continue;
1691 		KKASSERT(root->base.subtype == child->base.type);
1692 
1693 		/*
1694 		 * Accumulate n for a good child and correct the element's
1695 		 * subtree_count if it was wrong.
1696 		 */
1697 		if (root->internal.elms[i].subtree_count != child->base.count) {
1698 			root->internal.elms[i].subtree_count = child->base.count;
1699 			root_modified = 1;
1700 		}
1701 		n += root->internal.elms[i].subtree_count;
1702 	}
1703 	if (error || n > btree_max_elements(root->base.subtype))
1704 		goto done;
1705 
1706 	/*
1707 	 * Ok, we can collapse the root.  If the root's children are leaves
1708 	 * the collapse is really simple.  If they are internal nodes the
1709 	 * collapse is not so simple because we have to fix up the parent
1710 	 * pointers for the root's children's children.
1711 	 *
1712 	 * When collapsing an internal node, the left boundary of the far-left
1713 	 * child and the right boundary of the far-right child should match
1714 	 * the root's own left and right boundaries.
1715 	 */
1716 	if (root->base.subtype == HAMMER_BTREE_TYPE_LEAF) {
1717 		for (i = n = 0; i < count; ++i) {
1718 			child = children[i];
1719 			for (j = 0; j < child->base.count; ++j) {
1720 				root->leaf.elms[n] = child->leaf.elms[j];
1721 				++n;
1722 			}
1723 		}
1724 		root->base.type = root->base.subtype;
1725 		root->base.subtype = 0;
1726 		root->base.count = n;
1727 		root->leaf.link_left = 0;
1728 		root->leaf.link_right = 0;
1729 	} else {
1730 		struct hammer_btree_internal_elm *elm;
1731 		struct hammer_btree_internal_node *subchild;
1732 		struct hammer_buffer *subchild_buffer = NULL;
1733 
1734 		if (count) {
1735 			child = children[0];
1736 			subsubtype = child->base.subtype;
1737 			KKASSERT(child->base.count > 0);
1738 			KKASSERT(root->internal.elms[0].base.key ==
1739 				 child->internal.elms[0].base.key);
1740 			child = children[count-1];
1741 			KKASSERT(child->base.count > 0);
1742 			KKASSERT(root->internal.elms[count].base.key ==
1743 			     child->internal.elms[child->base.count].base.key);
1744 		} else {
1745 			subsubtype = 0;
1746 		}
1747 		for (i = n = 0; i < count; ++i) {
1748 			child = children[i];
1749 			KKASSERT(child->base.subtype == subsubtype);
1750 			for (j = 0; j < child->base.count; ++j) {
1751 				elm = &child->internal.elms[j];
1752 
1753 				root->internal.elms[n] = *elm;
1754 				subchild = hammer_bread(cursor->cluster,
1755 							elm->subtree_offset,
1756 							HAMMER_FSBUF_BTREE,
1757 							&error,
1758 							&subchild_buffer,
1759 							XXX);
1760 				if (subchild) {
1761 					subchild->base.parent = root_offset;
1762 					hammer_modify_buffer(subchild_buffer);
1763 				}
1764 				++n;
1765 			}
1766 			/* make sure the right boundary is correct */
1767 			/* (this gets overwritten when the loop continues) */
1768 			/* XXX generate a new separator? */
1769 			root->internal.elms[n] = child->internal.elms[j];
1770 		}
1771 		root->base.type = HAMMER_BTREE_TYPE_INTERNAL;
1772 		root->base.subtype = subsubtype;
1773 		if (subchild_buffer)
1774 			hammer_put_buffer(subchild_buffer, 0);
1775 	}
1776 	root_modified = 1;
1777 
1778 	/*
1779 	 * Cleanup
1780 	 */
1781 done:
1782 	if (root_modified)
1783 		hammer_modify_buffer(cursor->node_buffer);
1784 	for (i = 0; i < count; ++i) {
1785 		if (child_buffer[i])
1786 			hammer_put_buffer(child_buffer[i], 0);
1787 	}
1788 	return(error);
1789 }
1790 
1791 #endif
1792 
1793 /************************************************************************
1794  *			   MISCELLANEOUS SUPPORT 			*
1795  ************************************************************************/
1796 
1797 /*
1798  * Compare two B-Tree elements, returning a strcmp-style result.  The
1799  * magnitude of a non-zero result indicates which field differed.
1800  * See also hammer_rec_rb_compare() and hammer_rec_cmp() in hammer_object.c.
1801  *
1802  * Note that key1 and key2 are treated differently.  key1 is allowed to
1803  * wildcard some of its fields by setting them to 0, while key2 is expected
1804  * to be in an on-disk form (no wildcards).
1805  */
1806 int
1807 hammer_btree_cmp(hammer_base_elm_t key1, hammer_base_elm_t key2)
1808 {
1809 #if 0
1810 	kprintf("compare obj_id %016llx %016llx\n",
1811 		key1->obj_id, key2->obj_id);
1812 	kprintf("compare rec_type %04x %04x\n",
1813 		key1->rec_type, key2->rec_type);
1814 	kprintf("compare key %016llx %016llx\n",
1815 		key1->key, key2->key);
1816 #endif
1817 
1818 	/*
1819 	 * A key1->obj_id of 0 matches any object id
1820 	 */
1821 	if (key1->obj_id) {
1822 		if (key1->obj_id < key2->obj_id)
1823 			return(-4);
1824 		if (key1->obj_id > key2->obj_id)
1825 			return(4);
1826 	}
1827 
1828 	/*
1829 	 * A key1->rec_type of 0 matches any record type.
1830 	 */
1831 	if (key1->rec_type) {
1832 		if (key1->rec_type < key2->rec_type)
1833 			return(-3);
1834 		if (key1->rec_type > key2->rec_type)
1835 			return(3);
1836 	}
1837 
1838 	/*
1839 	 * There is no special case for key.  0 means 0.
1840 	 */
1841 	if (key1->key < key2->key)
1842 		return(-2);
1843 	if (key1->key > key2->key)
1844 		return(2);
1845 
1846 	/*
1847 	 * This test has a number of special cases.  create_tid in key1 is
1848 	 * the as-of transaction id, and delete_tid in key1 is NOT USED.
1849 	 *
1850 	 * A key1->create_tid of 0 matches any record regardless of when
1851 	 * it was created or destroyed.  0xFFFFFFFFFFFFFFFFULL should be
1852 	 * used to search for the most current state of the object.
1853 	 *
1854 	 * key2->create_tid is the creation transaction id of an on-disk
1855 	 * HAMMER record and will never be 0.  key2->delete_tid is the
1856 	 * deletion transaction id, or 0 if the record has not yet been deleted.
1857 	 */
1858 	if (key1->create_tid) {
1859 		if (key1->create_tid < key2->create_tid)
1860 			return(-1);
1861 		if (key2->delete_tid && key1->create_tid >= key2->delete_tid)
1862 			return(1);
1863 	}
1864 
1865 	return(0);
1866 }
1867 
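/*
 * Example (hypothetical, for illustration only): how a caller might build
 * a wildcarded key1 for hammer_btree_cmp().  The example function and its
 * values are not part of HAMMER.
 */
#if 0
static void
example_btree_cmp(hammer_base_elm_t ondisk)
{
	struct hammer_base_elm key;
	int r;

	bzero(&key, sizeof(key));
	key.obj_id = 0x0000000000001234ULL;	/* this object only */
	key.rec_type = 0;			/* wildcard: any record type */
	key.key = 0;				/* no wildcard: 0 means 0 */
	key.create_tid = 0xFFFFFFFFFFFFFFFFULL;	/* as-of most current state */

	r = hammer_btree_cmp(&key, ondisk);
	/* r < 0: key sorts before ondisk, r > 0: after, r == 0: match */
}
#endif
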
1868 /*
1869  * Compare the element against the cursor's beginning and ending keys
1870  */
1871 int
1872 hammer_btree_range_cmp(hammer_cursor_t cursor, hammer_base_elm_t key2)
1873 {
1874 	/*
1875 	 * A cursor->key_beg.obj_id of 0 matches any object id
1876 	 */
1877 	if (cursor->key_beg.obj_id) {
1878 		if (cursor->key_end.obj_id < key2->obj_id)
1879 			return(-4);
1880 		if (cursor->key_beg.obj_id > key2->obj_id)
1881 			return(4);
1882 	}
1883 
1884 	/*
1885 	 * A cursor->key_beg.rec_type of 0 matches any record type.
1886 	 */
1887 	if (cursor->key_beg.rec_type) {
1888 		if (cursor->key_end.rec_type < key2->rec_type)
1889 			return(-3);
1890 		if (cursor->key_beg.rec_type > key2->rec_type)
1891 			return(3);
1892 	}
1893 
1894 	/*
1895 	 * There is no special case for key.  0 means 0.
1896 	 */
1897 	if (cursor->key_end.key < key2->key)
1898 		return(-2);
1899 	if (cursor->key_beg.key > key2->key)
1900 		return(2);
1901 
1902 	/*
1903 	 * This test has a number of special cases.  key_beg.create_tid is
1904 	 * the as-of transaction id, and the cursor's delete_tid is NOT USED.
1905 	 *
1906 	 * A key_beg.create_tid of 0 matches any record regardless of when
1907 	 * it was created or destroyed.  0xFFFFFFFFFFFFFFFFULL should be
1908 	 * used to search for the most current state of the object.
1909 	 *
1910 	 * key2->create_tid is the creation transaction id of an on-disk
1911 	 * HAMMER record and will never be 0.  key2->delete_tid is the
1912 	 * deletion transaction id, or 0 if the record has not yet been deleted.
1913 	 *
1914 	 * NOTE: only key_beg.create_tid is used for create_tid; we can only
1915 	 * do as-of scans at the moment.
1916 	 */
1917 	if (cursor->key_beg.create_tid) {
1918 		if (cursor->key_beg.create_tid < key2->create_tid)
1919 			return(-1);
1920 		if (key2->delete_tid && cursor->key_beg.create_tid >= key2->delete_tid)
1921 			return(1);
1922 	}
1923 
1924 	return(0);
1925 }
1926 
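/*
 * Sketch (hypothetical caller code, not part of HAMMER): interpreting the
 * result of hammer_btree_range_cmp() for a given on-disk element, based on
 * the sign conventions used above.
 */
#if 0
	r = hammer_btree_range_cmp(cursor, &elm->base);
	if (r == 0) {
		/* element falls within the cursor's key range / as-of view */
	} else if (r < 0) {
		/* range compares less than the element (e.g. key_end < elm) */
	} else {
		/* range compares greater than the element (e.g. key_beg > elm) */
	}
#endif
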
1927 /*
1928  * Create a separator halfway between key1 and key2.  For fields just
1929  * one unit apart, the separator will match key2.
1930  *
1931  * The handling of delete_tid is a little confusing.  It is only possible
1932  * to have one record in the B-Tree where all fields match except delete_tid.
1933  * This means, worst case, two adjacent elements may have create_tids that
1934  * are one apart, causing the separator to choose the right-hand element's
1935  * create_tid.  e.g.  (create,delete):  (1,x)(2,x) -> separator is (2,x).
1936  *
1937  * So all we have to do is copy the right-hand element's delete_tid into
1938  * the separator to guarantee that it is properly between the two elements.
1939  */
1940 #define MAKE_SEPARATOR(key1, key2, dest, field)	\
1941 	dest->field = key1->field + ((key2->field - key1->field + 1) >> 1);
1942 
1943 static void
1944 hammer_make_separator(hammer_base_elm_t key1, hammer_base_elm_t key2,
1945 		      hammer_base_elm_t dest)
1946 {
1947 	bzero(dest, sizeof(*dest));
1948 	MAKE_SEPARATOR(key1, key2, dest, obj_id);
1949 	MAKE_SEPARATOR(key1, key2, dest, rec_type);
1950 	MAKE_SEPARATOR(key1, key2, dest, key);
1951 	MAKE_SEPARATOR(key1, key2, dest, create_tid);
1952 	dest->delete_tid = key2->delete_tid;
1953 }
1954 
1955 #undef MAKE_SEPARATOR
1956 
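/*
 * Worked example of the separator math above (values are illustrative):
 *
 *	key1->key = 0x10, key2->key = 0x20:
 *	    dest->key = 0x10 + ((0x20 - 0x10 + 1) >> 1) = 0x18
 *
 *	key1->key = 0x10, key2->key = 0x11 (one apart):
 *	    dest->key = 0x10 + ((0x11 - 0x10 + 1) >> 1) = 0x11 = key2->key
 */
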
1957 /*
1958  * Return whether a generic internal or leaf node is full
1959  */
1960 static int
1961 btree_node_is_full(hammer_node_ondisk_t node)
1962 {
1963 	switch(node->type) {
1964 	case HAMMER_BTREE_TYPE_INTERNAL:
1965 		if (node->count == HAMMER_BTREE_INT_ELMS)
1966 			return(1);
1967 		break;
1968 	case HAMMER_BTREE_TYPE_LEAF:
1969 		if (node->count == HAMMER_BTREE_LEAF_ELMS)
1970 			return(1);
1971 		break;
1972 	default:
1973 		panic("illegal btree subtype");
1974 	}
1975 	return(0);
1976 }
1977 
1978 #if 0
1979 static int
1980 btree_max_elements(u_int8_t type)
1981 {
1982 	if (type == HAMMER_BTREE_TYPE_LEAF)
1983 		return(HAMMER_BTREE_LEAF_ELMS);
1984 	if (type == HAMMER_BTREE_TYPE_INTERNAL)
1985 		return(HAMMER_BTREE_INT_ELMS);
1986 	panic("btree_max_elements: bad type %d\n", type);
1987 }
1988 #endif
1989 
1990 void
1991 hammer_print_btree_node(hammer_node_ondisk_t ondisk)
1992 {
1993 	hammer_btree_elm_t elm;
1994 	int i;
1995 
1996 	kprintf("node %p count=%d parent=%d type=%c\n",
1997 		ondisk, ondisk->count, ondisk->parent, ondisk->type);
1998 
1999 	/*
2000 	 * Dump both boundary elements if an internal node
2001 	 */
2002 	if (ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) {
2003 		for (i = 0; i <= ondisk->count; ++i) {
2004 			elm = &ondisk->elms[i];
2005 			hammer_print_btree_elm(elm, ondisk->type, i);
2006 		}
2007 	} else {
2008 		for (i = 0; i < ondisk->count; ++i) {
2009 			elm = &ondisk->elms[i];
2010 			hammer_print_btree_elm(elm, ondisk->type, i);
2011 		}
2012 	}
2013 }
2014 
2015 void
2016 hammer_print_btree_elm(hammer_btree_elm_t elm, u_int8_t type, int i)
2017 {
2018 	kprintf("  %2d", i);
2019 	kprintf("\tobjid        = %016llx\n", elm->base.obj_id);
2020 	kprintf("\tkey          = %016llx\n", elm->base.key);
2021 	kprintf("\tcreate_tid   = %016llx\n", elm->base.create_tid);
2022 	kprintf("\tdelete_tid   = %016llx\n", elm->base.delete_tid);
2023 	kprintf("\trec_type     = %04x\n", elm->base.rec_type);
2024 	kprintf("\tobj_type     = %02x\n", elm->base.obj_type);
2025 	kprintf("\tsubtree_type = %02x\n", elm->subtree_type);
2026 
2027 	if (type == HAMMER_BTREE_TYPE_INTERNAL) {
2028 		if (elm->internal.rec_offset) {
2029 			kprintf("\tcluster_rec  = %08x\n",
2030 				elm->internal.rec_offset);
2031 			kprintf("\tcluster_id   = %08x\n",
2032 				elm->internal.subtree_cluid);
2033 			kprintf("\tvolno        = %08x\n",
2034 				elm->internal.subtree_volno);
2035 		} else {
2036 			kprintf("\tsubtree_off  = %08x\n",
2037 				elm->internal.subtree_offset);
2038 		}
2039 		kprintf("\tsubtree_count= %d\n", elm->internal.subtree_count);
2040 	} else {
2041 		kprintf("\trec_offset   = %08x\n", elm->leaf.rec_offset);
2042 		kprintf("\tdata_offset  = %08x\n", elm->leaf.data_offset);
2043 		kprintf("\tdata_len     = %08x\n", elm->leaf.data_len);
2044 		kprintf("\tdata_crc     = %08x\n", elm->leaf.data_crc);
2045 	}
2046 }
2047