xref: /original-bsd/lib/libc/db/btree/bt_split.c (revision 1f6db082)
1 /*-
2  * Copyright (c) 1990 The Regents of the University of California.
3  * All rights reserved.
4  *
5  * This code is derived from software contributed to Berkeley by
6  * Mike Olson.
7  *
8  * %sccs.include.redist.c%
9  */
10 
11 #if defined(LIBC_SCCS) && !defined(lint)
12 static char sccsid[] = "@(#)bt_split.c	5.8 (Berkeley) 11/13/92";
13 #endif /* LIBC_SCCS and not lint */
14 
15 #include <sys/types.h>
16 
17 #define	__DBINTERFACE_PRIVATE
18 #include <db.h>
19 #include <limits.h>
20 #include <stdio.h>
21 #include <stdlib.h>
22 #include <string.h>
23 
24 #include "btree.h"
25 
26 static int	 bt_preserve __P((BTREE *, pgno_t));
27 static PAGE	*bt_psplit __P((BTREE *, PAGE *, PAGE *, PAGE *, int *));
28 static PAGE	*bt_page __P((BTREE *, PAGE *, PAGE **, PAGE **, int *));
29 static PAGE	*bt_root __P((BTREE *, PAGE *, PAGE **, PAGE **, int *));
30 static int	 bt_rroot __P((BTREE *, PAGE *, PAGE *, PAGE *));
31 static int	 bt_broot __P((BTREE *, PAGE *, PAGE *, PAGE *));
32 static recno_t	 rec_total __P((PAGE *));
33 
34 #ifdef STATISTICS
35 u_long	bt_rootsplit, bt_split, bt_sortsplit, bt_pfxsaved;
36 #endif
37 
38 /*
39  * __BT_SPLIT -- Split the tree.
40  *
41  * Parameters:
42  *	t:	tree
43  *	h:	page to split
44  *	key:	key to insert
45  *	data:	data to insert
46  *	flags:	BIGKEY/BIGDATA flags
47  *	nbytes:	length of insertion
48  *	skip:	index to leave open
49  *
50  * Returns:
51  *	RET_ERROR, RET_SUCCESS
52  */
53 int
54 __bt_split(t, h, key, data, flags, nbytes, skip)
55 	BTREE *t;
56 	PAGE *h;
57 	const DBT *key, *data;
58 	u_long flags;
59 	size_t nbytes;
60 	int skip;
61 {
62 	BINTERNAL *bi;
63 	BLEAF *bl;
64 	DBT a, b;
65 	EPGNO *parent;
66 	PAGE *l, *r, *lchild, *rchild;
67 	index_t nxtindex;
68 	size_t nksize;
69 	int nosplit;
70 	char *dest;
71 
72 	/*
73 	 * Split the page into two pages, l and r.  The split routines return
74 	 * a pointer to the page into which the key should be inserted and skip
75 	 * set to the offset which should be used.  Additionally, l and r are
76 	 * pinned.
77 	 */
78 	h = h->pgno == P_ROOT ?
79 	    bt_root(t, h, &l, &r, &skip) : bt_page(t, h, &l, &r, &skip);
80 	if (h == NULL)
81 		return (RET_ERROR);
82 
83 	/*
84 	 * Grab the space and insert the [rb]leaf structure.  Always a [rb]leaf
85 	 * structure since key inserts always cause a leaf page to split first.
86 	 */
87 	h->linp[skip] = h->upper -= nbytes;
88 	dest = (char *)h + h->upper;
89 	if (ISSET(t, BTF_RECNO))
90 		WR_RLEAF(dest, data, flags)
91 	else
92 		WR_BLEAF(dest, key, data, flags)
93 
94 	/*
95 	 * Now we walk the parent page stack -- a LIFO stack of the pages that
96 	 * were traversed when we searched for the page that split.  Each stack
97 	 * entry is a page number and a page index offset.  The offset is for
98 	 * the page traversed on the search.  We've just split a page, so we
99 	 * have to insert a new key into the parent page.
100 	 *
101 	 * If the insert into the parent page causes it to split, may have to
102 	 * continue splitting all the way up the tree.  We stop if the root
103 	 * splits or the page inserted into didn't have to split to hold the
104 	 * new key.  Some algorithms replace the key for the old page as well
105 	 * as the new page.  We don't, as there's no reason to believe that the
106 	 * first key on the old page is any better than the key we have, and,
107 	 * in the case of a key being placed at index 0 causing the split, the
108 	 * key is unavailable.
109 	 *
110 	 * There are a maximum of 5 pages pinned at any time.  We keep the left
111 	 * and right pages pinned while working on the parent.   The 5 are the
112 	 * two children, left parent and right parent (when the parent splits)
113 	 * and the root page or the overflow key page when calling bt_preserve.
114 	 * This code must make sure that all pins are released other than the
115 	 * root page or overflow page which is unlocked elsewhere.
116 	 */
117 	for (nosplit = 0; (parent = BT_POP(t)) != NULL;) {
118 		lchild = l;
119 		rchild = r;
120 
121 		/* Get the parent page. */
122 		if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL)
123 			goto err2;
124 
125 	 	/* The new key goes ONE AFTER the index. */
126 		skip = parent->index + 1;
127 
128 		/*
129 		 * Calculate the space needed on the parent page.
130 		 *
131 		 * Space hack when inserting into BINTERNAL pages.  Only need to
132 		 * retain the number of bytes that will distinguish between the
133 		 * new entry and the LAST entry on the page to its left.  If the
134 		 * keys compare equal, retain the entire key.  Note, we don't
135 		 * touch overflow keys and the entire key must be retained for
136 		 * the next-to-leftmost key on the leftmost page of each level,
137 		 * or the search will fail.
138 		 */
139 		switch (rchild->flags & P_TYPE) {
140 		case P_BINTERNAL:
141 			bi = GETBINTERNAL(rchild, 0);
142 			nbytes = NBINTERNAL(bi->ksize);
143 			if (t->bt_pfx && (h->prevpg != P_INVALID || skip > 1) &&
144 			    !(bi->flags & P_BIGKEY)) {
145 				BINTERNAL *tbi;
146 				tbi =
147 				    GETBINTERNAL(lchild, NEXTINDEX(lchild) - 1);
148 				a.size = tbi->ksize;
149 				a.data = tbi->bytes;
150 				b.size = bi->ksize;
151 				b.data = bi->bytes;
152 				goto prefix;
153 			} else
154 				nksize = 0;
155 			break;
156 		case P_BLEAF:
157 			bl = GETBLEAF(rchild, 0);
158 			nbytes = NBINTERNAL(bl->ksize);
159 			if (t->bt_pfx && (h->prevpg != P_INVALID || skip > 1) &&
160 			    !(bl->flags & P_BIGKEY)) {
161 				BLEAF *tbl;
162 				size_t n;
163 
164 				tbl = GETBLEAF(lchild, NEXTINDEX(lchild) - 1);
165 				a.size = tbl->ksize;
166 				a.data = tbl->bytes;
167 				b.size = bl->ksize;
168 				b.data = bl->bytes;
169 prefix:				nksize = t->bt_pfx(&a, &b);
170 				n = NBINTERNAL(nksize);
171 				if (n < nbytes) {
172 #ifdef STATISTICS
173 					bt_pfxsaved += nbytes - n;
174 #endif
175 					nbytes = n;
176 				} else
177 					nksize = 0;
178 			} else
179 				nksize = 0;
180 			break;
181 		case P_RINTERNAL:
182 		case P_RLEAF:
183 			nbytes = NRINTERNAL;
184 			break;
185 		default:
186 			abort();
187 		}
188 
189 		/* Split the parent page if necessary or shift the indices. */
190 		if (h->upper - h->lower < nbytes + sizeof(index_t)) {
191 			h = h->pgno == P_ROOT ?
192 			    bt_root(t, h, &l, &r, &skip) :
193 			    bt_page(t, h, &l, &r, &skip);
194 			if (h == NULL)
195 				goto err1;
196 		} else {
197 			if (skip < (nxtindex = NEXTINDEX(h)))
198 				bcopy(h->linp + skip, h->linp + skip + 1,
199 				    (nxtindex - skip) * sizeof(index_t));
200 			h->lower += sizeof(index_t);
201 			nosplit = 1;
202 		}
203 
204 		/* Insert the key into the parent page. */
205 		switch(rchild->flags & P_TYPE) {
206 		case P_BINTERNAL:
207 			h->linp[skip] = h->upper -= nbytes;
208 			dest = (char *)h + h->linp[skip];
209 			bcopy(bi, dest, nbytes);
210 			if (nksize)
211 				((BINTERNAL *)dest)->ksize = nksize;
212 			((BINTERNAL *)dest)->pgno = rchild->pgno;
213 			break;
214 		case P_BLEAF:
215 			h->linp[skip] = h->upper -= nbytes;
216 			dest = (char *)h + h->linp[skip];
217 			WR_BINTERNAL(dest, nksize ? nksize : bl->ksize,
218 			    rchild->pgno, bl->flags & P_BIGKEY);
219 			bcopy(bl->bytes, dest, nksize ? nksize : bl->ksize);
220 			if (bl->flags & P_BIGKEY &&
221 			    bt_preserve(t, *(pgno_t *)bl->bytes) == RET_ERROR)
222 				goto err1;
223 			break;
224 		case P_RINTERNAL:
225 			/* Update both left and right page counts. */
226 			h->linp[skip] = h->upper -= nbytes;
227 			dest = (char *)h + h->linp[skip];
228 			((RINTERNAL *)dest)->nrecs = rec_total(rchild);
229 			((RINTERNAL *)dest)->pgno = rchild->pgno;
230 			dest = (char *)h + h->linp[skip - 1];
231 			((RINTERNAL *)dest)->nrecs = rec_total(lchild);
232 			((RINTERNAL *)dest)->pgno = lchild->pgno;
233 			break;
234 		case P_RLEAF:
235 			/* Update both left and right page counts. */
236 			h->linp[skip] = h->upper -= nbytes;
237 			dest = (char *)h + h->linp[skip];
238 			((RINTERNAL *)dest)->nrecs = NEXTINDEX(rchild);
239 			((RINTERNAL *)dest)->pgno = rchild->pgno;
240 			dest = (char *)h + h->linp[skip - 1];
241 			((RINTERNAL *)dest)->nrecs = NEXTINDEX(lchild);
242 			((RINTERNAL *)dest)->pgno = lchild->pgno;
243 			break;
244 		default:
245 			abort();
246 		}
247 
248 		/* Unpin the held pages. */
249 		if (nosplit) {
250 			mpool_put(t->bt_mp, h, MPOOL_DIRTY);
251 			break;
252 		}
253 		mpool_put(t->bt_mp, lchild, MPOOL_DIRTY);
254 		mpool_put(t->bt_mp, rchild, MPOOL_DIRTY);
255 	}
256 
257 	/* Unpin the held pages. */
258 	mpool_put(t->bt_mp, l, MPOOL_DIRTY);
259 	mpool_put(t->bt_mp, r, MPOOL_DIRTY);
260 
261 	/* Clear any pages left on the stack. */
262 	BT_CLR(t);
263 	return (RET_SUCCESS);
264 
265 	/*
266 	 * If something fails in the above loop we were already walking back
267 	 * up the tree and the tree is now inconsistent.  Nothing much we can
268 	 * do about it but release any memory we're holding.
269 	 */
270 err1:	mpool_put(t->bt_mp, lchild, MPOOL_DIRTY);
271 	mpool_put(t->bt_mp, rchild, MPOOL_DIRTY);
272 
273 err2:	mpool_put(t->bt_mp, l, 0);
274 	mpool_put(t->bt_mp, r, 0);
275 	__dbpanic(t->bt_dbp);
276 	return (RET_ERROR);
277 }
278 
279 /*
280  * BT_PAGE -- Split a non-root page of a btree.
281  *
282  * Parameters:
283  *	t:	tree
284  *	h:	root page
285  *	lp:	pointer to left page pointer
286  *	rp:	pointer to right page pointer
287  *	skip:	pointer to index to leave open
288  *
289  * Returns:
290  *	Pointer to page in which to insert or NULL on error.
291  */
292 static PAGE *
293 bt_page(t, h, lp, rp, skip)
294 	BTREE *t;
295 	PAGE *h, **lp, **rp;
296 	int *skip;
297 {
298 	PAGE *l, *r, *tp;
299 	pgno_t npg;
300 
301 #ifdef STATISTICS
302 	++bt_split;
303 #endif
304 	/* Put the new right page for the split into place. */
305 	if ((r = __bt_new(t, &npg)) == NULL)
306 		return (NULL);
307 	r->pgno = npg;
308 	r->lower = BTDATAOFF;
309 	r->upper = t->bt_psize;
310 	r->nextpg = h->nextpg;
311 	r->prevpg = h->pgno;
312 	r->flags = h->flags & P_TYPE;
313 
314 	/*
315 	 * If we're splitting the last page on a level because we're appending
316 	 * a key to it (skip is NEXTINDEX()), it's likely that the data is
317 	 * sorted.  Adding an empty page on the side of the level is less work
318 	 * and can push the fill factor much higher than normal.  If we're
319 	 * wrong it's no big deal, we'll just do the split the right way next
320 	 * time.  It may look like it's equally easy to do a similar hack for
321 	 * reverse sorted data, that is, split the tree left, but it's not.
322 	 * Don't even try.
323 	 */
324 	if (h->nextpg == P_INVALID && *skip == NEXTINDEX(h)) {
325 #ifdef STATISTICS
326 		++bt_sortsplit;
327 #endif
328 		h->nextpg = r->pgno;
329 		r->lower = BTDATAOFF + sizeof(index_t);
330 		*skip = 0;
331 		*lp = h;
332 		*rp = r;
333 		return (r);
334 	}
335 
336 	/* Put the new left page for the split into place. */
337 	if ((l = malloc(t->bt_psize)) == NULL) {
338 		mpool_put(t->bt_mp, r, 0);
339 		return (NULL);
340 	}
341 	l->pgno = h->pgno;
342 	l->nextpg = r->pgno;
343 	l->prevpg = h->prevpg;
344 	l->lower = BTDATAOFF;
345 	l->upper = t->bt_psize;
346 	l->flags = h->flags & P_TYPE;
347 
348 	/* Fix up the previous pointer of the page after the split page. */
349 	if (h->nextpg != P_INVALID) {
350 		if ((tp = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL) {
351 			free(l);
352 			/* XXX mpool_free(t->bt_mp, r->pgno); */
353 			return (NULL);
354 		}
355 		tp->prevpg = r->pgno;
356 		mpool_put(t->bt_mp, tp, 0);
357 	}
358 
359 	/*
360 	 * Split right.  The key/data pairs aren't sorted in the btree page so
361 	 * it's simpler to copy the data from the split page onto two new pages
362 	 * instead of copying half the data to the right page and compacting
363 	 * the left page in place.  Since the left page can't change, we have
364 	 * to swap the original and the allocated left page after the split.
365 	 */
366 	tp = bt_psplit(t, h, l, r, skip);
367 
368 	/* Move the new left page onto the old left page. */
369 	bcopy(l, h, t->bt_psize);
370 	if (tp == l)
371 		tp = h;
372 	free(l);
373 
374 	*lp = h;
375 	*rp = r;
376 	return (tp);
377 }
378 
379 /*
380  * BT_ROOT -- Split the root page of a btree.
381  *
382  * Parameters:
383  *	t:	tree
384  *	h:	root page
385  *	lp:	pointer to left page pointer
386  *	rp:	pointer to right page pointer
387  *	skip:	pointer to index to leave open
388  *
389  * Returns:
390  *	Pointer to page in which to insert or NULL on error.
391  */
392 static PAGE *
393 bt_root(t, h, lp, rp, skip)
394 	BTREE *t;
395 	PAGE *h, **lp, **rp;
396 	int *skip;
397 {
398 	PAGE *l, *r, *tp;
399 	pgno_t lnpg, rnpg;
400 
401 #ifdef STATISTICS
402 	++bt_split;
403 	++bt_rootsplit;
404 #endif
405 	/* Put the new left and right pages for the split into place. */
406 	if ((l = __bt_new(t, &lnpg)) == NULL ||
407 	    (r = __bt_new(t, &rnpg)) == NULL)
408 		return (NULL);
409 	l->pgno = lnpg;
410 	r->pgno = rnpg;
411 	l->nextpg = r->pgno;
412 	r->prevpg = l->pgno;
413 	l->prevpg = r->nextpg = P_INVALID;
414 	l->lower = r->lower = BTDATAOFF;
415 	l->upper = r->upper = t->bt_psize;
416 	l->flags = r->flags = h->flags & P_TYPE;
417 
418 	/* Split the root page. */
419 	tp = bt_psplit(t, h, l, r, skip);
420 
421 	/* Make the root page look right. */
422 	if ((ISSET(t, BTF_RECNO) ?
423 	    bt_rroot(t, h, l, r) : bt_broot(t, h, l, r)) == RET_ERROR)
424 		return (NULL);
425 
426 	*lp = l;
427 	*rp = r;
428 	return (tp);
429 }
430 
431 /*
432  * BT_RROOT -- Fix up the recno root page after the split.
433  *
434  * Parameters:
435  *	t:	tree
436  *	h:	root page
437  *
438  * Returns:
439  *	RET_ERROR, RET_SUCCESS
440  */
441 static int
442 bt_rroot(t, h, l, r)
443 	BTREE *t;
444 	PAGE *h, *l, *r;
445 {
446 	char *dest;
447 
448 	/* Insert the left and right keys, set the header information. */
449 	h->linp[0] = h->upper = t->bt_psize - NRINTERNAL;
450 	dest = (char *)h + h->upper;
451 	WR_RINTERNAL(dest,
452 	    l->flags & P_RLEAF ? NEXTINDEX(l) : rec_total(l), l->pgno);
453 
454 	h->linp[1] = h->upper -= NRINTERNAL;
455 	dest = (char *)h + h->upper;
456 	WR_RINTERNAL(dest,
457 	    r->flags & P_RLEAF ? NEXTINDEX(r) : rec_total(r), r->pgno);
458 
459 	h->lower = BTDATAOFF + 2 * sizeof(index_t);
460 
461 	/* Unpin the root page, set to recno internal page. */
462 	h->flags &= ~P_TYPE;
463 	h->flags |= P_RINTERNAL;
464 	mpool_put(t->bt_mp, h, MPOOL_DIRTY);
465 
466 	return (RET_SUCCESS);
467 }
468 
469 /*
470  * BT_BROOT -- Fix up the btree root page after the split.
471  *
472  * Parameters:
473  *	t:	tree
474  *	h:	root page
475  *
476  * Returns:
477  *	RET_ERROR, RET_SUCCESS
478  */
479 static int
480 bt_broot(t, h, l, r)
481 	BTREE *t;
482 	PAGE *h, *l, *r;
483 {
484 	BINTERNAL *bi;
485 	BLEAF *bl;
486 	size_t nbytes;
487 	char *dest;
488 
489 	/*
490 	 * If the root page was a leaf page, change it into an internal page.
491 	 * We copy the key we split on (but not the key's data, in the case of
492 	 * a leaf page) to the new root page.
493 	 *
494 	 * The btree comparison code guarantees that the left-most key on any
495 	 * level of the tree is never used, so it doesn't need to be filled
496 	 * in.  (This is not just convenience -- if the insert index is 0, we
497 	 * don't *have* a key to fill in.)  The right key is available because
498 	 * the split code guarantees not to split on the skipped index.
499 	 */
500 	nbytes = NBINTERNAL(0);
501 	h->linp[0] = h->upper = t->bt_psize - nbytes;
502 	dest = (char *)h + h->upper;
503 	WR_BINTERNAL(dest, 0, l->pgno, 0);
504 
505 	switch(h->flags & P_TYPE) {
506 	case P_BLEAF:
507 		bl = GETBLEAF(r, 0);
508 		nbytes = NBINTERNAL(bl->ksize);
509 		h->linp[1] = h->upper -= nbytes;
510 		dest = (char *)h + h->upper;
511 		WR_BINTERNAL(dest, bl->ksize, r->pgno, 0);
512 		bcopy(bl->bytes, dest, bl->ksize);
513 
514 		/*
515 		 * If the key is on an overflow page, mark the overflow chain
516 		 * so it isn't deleted when the leaf copy of the key is deleted.
517 		 */
518 		if (bl->flags & P_BIGKEY &&
519 		    bt_preserve(t, *(pgno_t *)bl->bytes) == RET_ERROR)
520 			return (RET_ERROR);
521 		break;
522 	case P_BINTERNAL:
523 		bi = GETBINTERNAL(r, 0);
524 		nbytes = NBINTERNAL(bi->ksize);
525 		h->linp[1] = h->upper -= nbytes;
526 		dest = (char *)h + h->upper;
527 		bcopy(bi, dest, nbytes);
528 		((BINTERNAL *)dest)->pgno = r->pgno;
529 		break;
530 	default:
531 		abort();
532 	}
533 	h->lower = BTDATAOFF + 2 * sizeof(index_t);
534 
535 	/* Unpin the root page, set to btree internal page. */
536 	h->flags &= ~P_TYPE;
537 	h->flags |= P_BINTERNAL;
538 	mpool_put(t->bt_mp, h, MPOOL_DIRTY);
539 
540 	return (RET_SUCCESS);
541 }
542 
543 /*
544  * BT_PSPLIT -- Do the real work of splitting the page.
545  *
546  * Parameters:
547  *	t:	tree
548  *	h:	page to be split
549  *	l:	page to put lower half of data
550  *	r:	page to put upper half of data
551  *	skip:	pointer to index to leave open
552  *
553  * Returns:
554  *	Pointer to page in which to insert.
555  */
556 static PAGE *
557 bt_psplit(t, h, l, r, pskip)
558 	BTREE *t;
559 	PAGE *h, *l, *r;
560 	int *pskip;
561 {
562 	BINTERNAL *bi;
563 	BLEAF *bl;
564 	RLEAF *rl;
565 	EPGNO *c;
566 	PAGE *rval;
567 	index_t half, skip;
568 	size_t nbytes;
569 	void *src;
570 	int bigkeycnt, isbigkey, nxt, off, top;
571 
572 	/*
573 	 * Split the data to the left and right pages. Leave the skip index
574 	 * open and guarantee that the split doesn't happen on that index (the
575 	 * right key must be available for the parent page).  Additionally,
576 	 * make some effort not to split on an overflow key.  This makes it
577 	 * faster to process internal pages and can save space since overflow
578 	 * keys used by internal pages are never deleted.
579 	 */
580 	bigkeycnt = 0;
581 	skip = *pskip;
582 	half = (t->bt_psize - BTDATAOFF) / 2;
583 	for (nxt = off = 0, top = NEXTINDEX(h); nxt < top; ++off) {
584 		if (skip == off)
585 			continue;
586 		switch (h->flags & P_TYPE) {
587 		case P_BINTERNAL:
588 			src = bi = GETBINTERNAL(h, nxt);
589 			nbytes = NBINTERNAL(bi->ksize);
590 			isbigkey = bi->flags & P_BIGKEY;
591 			break;
592 		case P_BLEAF:
593 			src = bl = GETBLEAF(h, nxt);
594 			nbytes = NBLEAF(bl);
595 			isbigkey = bl->flags & P_BIGKEY;
596 			break;
597 		case P_RINTERNAL:
598 			src = GETRINTERNAL(h, nxt);
599 			nbytes = NRINTERNAL;
600 			isbigkey = 0;
601 			break;
602 		case P_RLEAF:
603 			src = rl = GETRLEAF(h, nxt);
604 			nbytes = NRLEAF(rl);
605 			isbigkey = 0;
606 			break;
607 		default:
608 			abort();
609 		}
610 		++nxt;
611 		l->linp[off] = l->upper -= nbytes;
612 		bcopy(src, (char *)l + l->upper, nbytes);
613 
614 		/* There's no empirical justification for the '3'. */
615 		if (half < nbytes) {
616 			if (skip != off + 1)
617 				if (!isbigkey || bigkeycnt == 3)
618 					break;
619 				else
620 					++bigkeycnt;
621 		} else
622 			half -= nbytes;
623 	}
624 	l->lower += (off + 1) * sizeof(index_t);
625 
626 	/*
627 	 * If splitting the page that the cursor was on, the cursor has to be
628 	 * adjusted to point to the same record as before the split.  If the
629 	 * skipped slot and the cursor are both on the left page and the cursor
630 	 * is on or past the skipped slot, the cursor is incremented by one.
631 	 * If the skipped slot and the cursor are both on the right page and
632 	 * the cursor is on or past the skipped slot, the cursor is incremented
633 	 * by one.  If the skipped slot and the cursor aren't on the same page,
634 	 * the cursor isn't changed.  Regardless of the relationship of the
635 	 * skipped slot and the cursor, if the cursor is on the right page it
636 	 * is decremented by the number of records split to the left page.
637 	 *
638 	 * Don't bother checking for the BTF_SEQINIT flag, the page number will
639 	 * be P_INVALID.
640 	 */
641 	c = &t->bt_bcursor;
642 	if (c->pgno == h->pgno)
643 		if (c->index < off) {			/* left page */
644 			c->pgno = l->pgno;
645 			if (c->index >= skip)
646 				++c->index;
647 		} else {				/* right page */
648 			c->pgno = r->pgno;
649 			if (c->index >= skip && skip > off)
650 				++c->index;
651 			c->index -= off;
652 		}
653 
654 	/*
655 	 * Decide which page to return, and adjust the skip index if the
656 	 * to-be-inserted-upon page has changed.
657 	 */
658 	if (skip > off) {
659 		rval = r;
660 		*pskip -= off + 1;
661 	} else
662 		rval = l;
663 
664 	for (off = 0; nxt < top; ++off) {
665 		if (skip == nxt) {
666 			skip = 0;
667 			continue;
668 		}
669 		switch (h->flags & P_TYPE) {
670 		case P_BINTERNAL:
671 			src = bi = GETBINTERNAL(h, nxt);
672 			nbytes = NBINTERNAL(bi->ksize);
673 			break;
674 		case P_BLEAF:
675 			src = bl = GETBLEAF(h, nxt);
676 			nbytes = NBLEAF(bl);
677 			break;
678 		case P_RINTERNAL:
679 			src = GETRINTERNAL(h, nxt);
680 			nbytes = NRINTERNAL;
681 			break;
682 		case P_RLEAF:
683 			src = rl = GETRLEAF(h, nxt);
684 			nbytes = NRLEAF(rl);
685 			break;
686 		default:
687 			abort();
688 		}
689 		++nxt;
690 		r->linp[off] = r->upper -= nbytes;
691 		bcopy(src, (char *)r + r->upper, nbytes);
692 	}
693 	r->lower += off * sizeof(index_t);
694 
695 	/* If the key is being appended to the page, adjust the index. */
696 	if (skip == top)
697 		r->lower += sizeof(index_t);
698 
699 	return (rval);
700 }
701 
702 /*
703  * BT_PRESERVE -- Mark a chain of pages as used by an internal node.
704  *
705  * Chains of indirect blocks pointed to by leaf nodes get reclaimed when the
706  * record that references them gets deleted.  Chains pointed to by internal
707  * pages never get deleted.  This routine marks a chain as pointed to by an
708  * internal page.
709  *
710  * Parameters:
711  *	t:	tree
712  *	pg:	page number of first page in the chain.
713  *
714  * Returns:
715  *	RET_SUCCESS, RET_ERROR.
716  */
717 static int
718 bt_preserve(t, pg)
719 	BTREE *t;
720 	pgno_t pg;
721 {
722 	PAGE *h;
723 
724 	if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
725 		return (RET_ERROR);
726 	h->flags |= P_PRESERVE;
727 	mpool_put(t->bt_mp, h, MPOOL_DIRTY);
728 	return (RET_SUCCESS);
729 }
730 
731 /*
732  * REC_TOTAL -- Return the number of recno entries below a page.
733  *
734  * Parameters:
735  *	h:	page
736  *
737  * Returns:
738  *	The number of recno entries below a page.
739  *
740  * XXX
741  * These values could be set by the bt_psplit routine.  The problem is that the
742  * entry has to be popped off of the stack etc. or the values have to be passed
743  * all the way back to bt_split/bt_rroot and it's not very clean.
744  */
745 static recno_t
746 rec_total(h)
747 	PAGE *h;
748 {
749 	recno_t recs;
750 	index_t nxt, top;
751 
752 	for (recs = 0, nxt = 0, top = NEXTINDEX(h); nxt < top; ++nxt)
753 		recs += GETRINTERNAL(h, nxt)->nrecs;
754 	return (recs);
755 }
756