xref: /original-bsd/lib/libc/db/btree/bt_split.c (revision 7eb91141)
1 /*-
2  * Copyright (c) 1990 The Regents of the University of California.
3  * All rights reserved.
4  *
5  * This code is derived from software contributed to Berkeley by
6  * Mike Olson.
7  *
8  * %sccs.include.redist.c%
9  */
10 
11 #if defined(LIBC_SCCS) && !defined(lint)
12 static char sccsid[] = "@(#)bt_split.c	5.4 (Berkeley) 09/12/91";
13 #endif /* LIBC_SCCS and not lint */
14 
15 #include <sys/types.h>
16 #define	__DBINTERFACE_PRIVATE
17 #include <db.h>
18 #include <limits.h>
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include <string.h>
22 #include "btree.h"
23 
24 static int	 bt_preserve __P((BTREE *, pgno_t));
25 static PAGE	*bt_psplit __P((BTREE *, PAGE *, PAGE *, PAGE *, int *));
26 static PAGE	*bt_page __P((BTREE *, PAGE *, PAGE **, PAGE **, int *));
27 static PAGE	*bt_root __P((BTREE *, PAGE *, PAGE **, PAGE **, int *));
28 static int	 bt_rroot __P((BTREE *, PAGE *, PAGE *, PAGE *));
29 static int	 bt_broot __P((BTREE *, PAGE *, PAGE *, PAGE *));
30 static recno_t	 rec_total __P((PAGE *));
31 
32 #ifdef STATISTICS
33 u_long	bt_rootsplit, bt_split, bt_sortsplit, bt_pfxsaved;
34 #endif
35 
36 /*
37  * __BT_SPLIT -- Split the tree.
38  *
39  * Parameters:
40  *	t:	tree
41  *	h:	page to split
42  *	key:	key to insert
43  *	data:	data to insert
44  *	flags:	BIGKEY/BIGDATA flags
45  *	nbytes:	length of insertion
46  *	skip:	index to leave open
47  *
48  * Returns:
49  *	RET_ERROR, RET_SUCCESS
50  */
51 int
52 __bt_split(t, h, key, data, flags, nbytes, skip)
53 	BTREE *t;
54 	PAGE *h;
55 	const DBT *key, *data;
56 	u_long flags;
57 	size_t nbytes;
58 	int skip;
59 {
60 	BINTERNAL *bi;
61 	BLEAF *bl;
62 	DBT a, b;
63 	EPGNO *parent;
64 	PAGE *l, *r, *lchild, *rchild;
65 	index_t nxtindex;
66 	size_t nksize;
67 	int nosplit;
68 	char *dest;
69 
70 	/*
71 	 * Split the page into two pages, l and r.  The split routines return
72 	 * a pointer to the page into which the key should be inserted and skip
73 	 * set to the offset which should be used.  Additionally, l and r are
74 	 * pinned.
75 	 */
76 	h = h->pgno == P_ROOT ?
77 	    bt_root(t, h, &l, &r, &skip) : bt_page(t, h, &l, &r, &skip);
78 	if (h == NULL)
79 		return (RET_ERROR);
80 
81 	/*
82 	 * Grab the space and insert the [rb]leaf structure.  Always a [rb]leaf
83 	 * structure since key inserts always cause a leaf page to split first.
84 	 */
85 	h->linp[skip] = h->upper -= nbytes;
86 	dest = (char *)h + h->upper;
87 	if (ISSET(t, BTF_RECNO))
88 		WR_RLEAF(dest, data, flags)
89 	else
90 		WR_BLEAF(dest, key, data, flags)
91 
92 	/*
93 	 * Now we walk the parent page stack -- a LIFO stack of the pages that
94 	 * were traversed when we searched for the page that split.  Each stack
95 	 * entry is a page number and a page index offset.  The offset is for
96 	 * the page traversed on the search.  We've just split a page, so we
97 	 * have to insert a new key into the parent page.
98 	 *
99 	 * If the insert into the parent page causes it to split, may have to
100 	 * continue splitting all the way up the tree.  We stop if the root
101 	 * splits or the page inserted into didn't have to split to hold the
102 	 * new key.  Some algorithms replace the key for the old page as well
103 	 * as the new page.  We don't, as there's no reason to believe that the
104 	 * first key on the old page is any better than the key we have, and,
105 	 * in the case of a key being placed at index 0 causing the split, the
106 	 * key is unavailable.
107 	 *
108 	 * There are a maximum of 5 pages pinned at any time.  We keep the left
109 	 * and right pages pinned while working on the parent.   The 5 are the
110 	 * two children, left parent and right parent (when the parent splits)
111 	 * and the root page or the overflow key page when calling bt_preserve.
112 	 * This code must make sure that all pins are released other than the
113 	 * root page or overflow page which is unlocked elsewhere.
114 	 */
115 	for (nosplit = 0; (parent = BT_POP(t)) != NULL;) {
116 		lchild = l;
117 		rchild = r;
118 
119 		/* Get the parent page. */
120 		if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL)
121 			goto err2;
122 
123 	 	/* The new key goes ONE AFTER the index. */
124 		skip = parent->index + 1;
125 
126 		/*
127 		 * Calculate the space needed on the parent page.
128 		 *
129 		 * Space hack when inserting into BINTERNAL pages.  Only need to
130 		 * retain the number of bytes that will distinguish between the
131 		 * new entry and the LAST entry on the page to its left.  If the
132 		 * keys compare equal, retain the entire key.  Note, we don't
133 		 * touch overflow keys and the entire key must be retained for
134 		 * the next-to-leftmost key on the leftmost page of each level,
135 		 * or the search will fail.
136 		 */
137 		switch (rchild->flags & P_TYPE) {
138 		case P_BINTERNAL:
139 			bi = GETBINTERNAL(rchild, 0);
140 			nbytes = NBINTERNAL(bi->ksize);
141 			if (t->bt_pfx && (h->prevpg != P_INVALID || skip > 1) &&
142 			    !(bi->flags & P_BIGKEY)) {
143 				BINTERNAL *tbi;
144 				tbi =
145 				    GETBINTERNAL(lchild, NEXTINDEX(lchild) - 1);
146 				a.size = tbi->ksize;
147 				a.data = tbi->bytes;
148 				b.size = bi->ksize;
149 				b.data = bi->bytes;
150 				goto prefix;
151 			}
152 			break;
153 		case P_BLEAF:
154 			bl = GETBLEAF(rchild, 0);
155 			nbytes = NBLEAF(bl);
156 			if (t->bt_pfx && (h->prevpg != P_INVALID || skip > 1) &&
157 			    !(bl->flags & P_BIGKEY)) {
158 				BLEAF *tbl;
159 				size_t n;
160 
161 				tbl = GETBLEAF(lchild, NEXTINDEX(lchild) - 1);
162 				a.size = tbl->ksize;
163 				a.data = tbl->bytes;
164 				b.size = bl->ksize;
165 				b.data = bl->bytes;
166 prefix:				nksize = t->bt_pfx(&a, &b);
167 				n = NBINTERNAL(nksize);
168 				if (n < nbytes) {
169 #ifdef STATISTICS
170 					bt_pfxsaved += nbytes - n;
171 #endif
172 					nbytes = n;
173 				} else
174 					nksize = 0;
175 			} else
176 				nksize = 0;
177 			break;
178 		case P_RINTERNAL:
179 		case P_RLEAF:
180 			nbytes = NRINTERNAL;
181 			break;
182 		}
183 
184 		/* Split the parent page if necessary or shift the indices. */
185 		if (h->upper - h->lower < nbytes + sizeof(index_t)) {
186 			h = h->pgno == P_ROOT ?
187 			    bt_root(t, h, &l, &r, &skip) :
188 			    bt_page(t, h, &l, &r, &skip);
189 			if (h == NULL)
190 				goto err1;
191 		} else {
192 			if (skip < (nxtindex = NEXTINDEX(h)))
193 				bcopy(h->linp + skip, h->linp + skip + 1,
194 				    (nxtindex - skip) * sizeof(index_t));
195 			h->lower += sizeof(index_t);
196 			nosplit = 1;
197 		}
198 
199 		/* Insert the key into the parent page. */
200 		switch(rchild->flags & P_TYPE) {
201 		case P_BINTERNAL:
202 			h->linp[skip] = h->upper -= nbytes;
203 			dest = (char *)h + h->linp[skip];
204 			bcopy(bi, dest, nbytes);
205 			if (nksize)
206 				((BINTERNAL *)dest)->ksize = nksize;
207 			((BINTERNAL *)dest)->pgno = rchild->pgno;
208 			break;
209 		case P_BLEAF:
210 			h->linp[skip] = h->upper -= nbytes;
211 			dest = (char *)h + h->linp[skip];
212 			WR_BINTERNAL(dest, nksize ? nksize : bl->ksize,
213 			    rchild->pgno, bl->flags & P_BIGKEY);
214 			bcopy(bl->bytes, dest, nksize ? nksize : bl->ksize);
215 			if (bl->flags & P_BIGKEY &&
216 			    bt_preserve(t, *(pgno_t *)bl->bytes) == RET_ERROR)
217 				goto err1;
218 			break;
219 		case P_RINTERNAL:
220 			/* Update both left and right page counts. */
221 			h->linp[skip] = h->upper -= nbytes;
222 			dest = (char *)h + h->linp[skip];
223 			((RINTERNAL *)dest)->nrecs = rec_total(rchild);
224 			((RINTERNAL *)dest)->pgno = rchild->pgno;
225 			dest = (char *)h + h->linp[skip - 1];
226 			((RINTERNAL *)dest)->nrecs = rec_total(lchild);
227 			((RINTERNAL *)dest)->pgno = lchild->pgno;
228 			break;
229 		case P_RLEAF:
230 			/* Update both left and right page counts. */
231 			h->linp[skip] = h->upper -= nbytes;
232 			dest = (char *)h + h->linp[skip];
233 			((RINTERNAL *)dest)->nrecs = NEXTINDEX(rchild);
234 			((RINTERNAL *)dest)->pgno = rchild->pgno;
235 			dest = (char *)h + h->linp[skip - 1];
236 			((RINTERNAL *)dest)->nrecs = NEXTINDEX(lchild);
237 			((RINTERNAL *)dest)->pgno = lchild->pgno;
238 			break;
239 		}
240 
241 		/* Unpin the held pages. */
242 		if (nosplit) {
243 			mpool_put(t->bt_mp, h, MPOOL_DIRTY);
244 			break;
245 		}
246 		mpool_put(t->bt_mp, lchild, MPOOL_DIRTY);
247 		mpool_put(t->bt_mp, rchild, MPOOL_DIRTY);
248 	}
249 
250 	/* Unpin the held pages. */
251 	mpool_put(t->bt_mp, l, MPOOL_DIRTY);
252 	mpool_put(t->bt_mp, r, MPOOL_DIRTY);
253 
254 	/* Clear any pages left on the stack. */
255 	BT_CLR(t);
256 	return (RET_SUCCESS);
257 
258 	/*
259 	 * If something fails in the above loop we were already walking back
260 	 * up the tree and the tree is now inconsistent.  Nothing much we can
261 	 * do about it but release any memory we're holding.
262 	 */
263 err1:	mpool_put(t->bt_mp, lchild, MPOOL_DIRTY);
264 	mpool_put(t->bt_mp, rchild, MPOOL_DIRTY);
265 
266 err2:	mpool_put(t->bt_mp, l, 0);
267 	mpool_put(t->bt_mp, r, 0);
268 	__dbpanic(t->bt_dbp);
269 	return (RET_ERROR);
270 }
271 
272 /*
273  * BT_PAGE -- Split a non-root page of a btree.
274  *
275  * Parameters:
276  *	t:	tree
277  *	h:	root page
278  *	lp:	pointer to left page pointer
279  *	rp:	pointer to right page pointer
280  *	skip:	pointer to index to leave open
281  *
282  * Returns:
283  *	Pointer to page in which to insert or NULL on error.
284  */
285 static PAGE *
286 bt_page(t, h, lp, rp, skip)
287 	BTREE *t;
288 	PAGE *h, **lp, **rp;
289 	int *skip;
290 {
291 	PAGE *l, *r, *tp;
292 	pgno_t npg;
293 
294 #ifdef STATISTICS
295 	++bt_split;
296 #endif
297 	/* Put the new right page for the split into place. */
298 	if ((r = mpool_new(t->bt_mp, &npg)) == NULL)
299 		return (NULL);
300 	r->pgno = npg;
301 	r->lower = BTDATAOFF;
302 	r->upper = t->bt_psize;
303 	r->nextpg = h->nextpg;
304 	r->prevpg = h->pgno;
305 	r->flags = h->flags & P_TYPE;
306 
307 	/*
308 	 * If we're splitting the last page on a level because we're appending
309 	 * a key to it (skip is NEXTINDEX()), it's likely that the data is
310 	 * sorted.  Adding an empty page on the side of the level is less work
311 	 * and can push the fill factor much higher than normal.  If we're
312 	 * wrong it's no big deal, we'll just do the split the right way next
313 	 * time.  It may look like it's equally easy to do a similar hack for
314 	 * reverse sorted data, that is, split the tree left, but it's not.
315 	 * Don't even try.
316 	 */
317 	if (h->nextpg == P_INVALID && *skip == NEXTINDEX(h)) {
318 #ifdef STATISTICS
319 		++bt_sortsplit;
320 #endif
321 		h->nextpg = r->pgno;
322 		r->lower = BTDATAOFF + sizeof(index_t);
323 		*skip = 0;
324 		*lp = h;
325 		*rp = r;
326 		return (r);
327 	}
328 
329 	/* Put the new left page for the split into place. */
330 	if ((l = malloc(t->bt_psize)) == NULL) {
331 		mpool_put(t->bt_mp, r, 0);
332 		return (NULL);
333 	}
334 	l->pgno = h->pgno;
335 	l->nextpg = r->pgno;
336 	l->prevpg = h->prevpg;
337 	l->lower = BTDATAOFF;
338 	l->upper = t->bt_psize;
339 	l->flags = h->flags & P_TYPE;
340 
341 	/* Fix up the previous pointer of the page after the split page. */
342 	if (h->nextpg != P_INVALID) {
343 		if ((tp = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL) {
344 			free(l);
345 			/* XXX mpool_free(t->bt_mp, r->pgno); */
346 			return (NULL);
347 		}
348 		tp->prevpg = r->pgno;
349 		mpool_put(t->bt_mp, tp, 0);
350 	}
351 
352 	/*
353 	 * Split right.  The key/data pairs aren't sorted in the btree page so
354 	 * it's simpler to copy the data from the split page onto two new pages
355 	 * instead of copying half the data to the right page and compacting
356 	 * the left page in place.  Since the left page can't change, we have
357 	 * to swap the original and the allocated left page after the split.
358 	 */
359 	tp = bt_psplit(t, h, l, r, skip);
360 
361 	/* Move the new left page onto the old left page. */
362 	bcopy(l, h, t->bt_psize);
363 	if (tp == l)
364 		tp = h;
365 	free(l);
366 
367 	*lp = h;
368 	*rp = r;
369 	return (tp);
370 }
371 
372 /*
373  * BT_ROOT -- Split the root page of a btree.
374  *
375  * Parameters:
376  *	t:	tree
377  *	h:	root page
378  *	lp:	pointer to left page pointer
379  *	rp:	pointer to right page pointer
380  *	skip:	pointer to index to leave open
381  *
382  * Returns:
383  *	Pointer to page in which to insert or NULL on error.
384  */
385 static PAGE *
386 bt_root(t, h, lp, rp, skip)
387 	BTREE *t;
388 	PAGE *h, **lp, **rp;
389 	int *skip;
390 {
391 	PAGE *l, *r, *tp;
392 	pgno_t lnpg, rnpg;
393 
394 #ifdef STATISTICS
395 	++bt_split;
396 	++bt_rootsplit;
397 #endif
398 	/* Put the new left and right pages for the split into place. */
399 	if ((l = mpool_new(t->bt_mp, &lnpg)) == NULL ||
400 	    (r = mpool_new(t->bt_mp, &rnpg)) == NULL)
401 		return (NULL);
402 	l->pgno = lnpg;
403 	r->pgno = rnpg;
404 	l->nextpg = r->pgno;
405 	r->prevpg = l->pgno;
406 	l->prevpg = r->nextpg = P_INVALID;
407 	l->lower = r->lower = BTDATAOFF;
408 	l->upper = r->upper = t->bt_psize;
409 	l->flags = r->flags = h->flags & P_TYPE;
410 
411 	/* Split the root page. */
412 	tp = bt_psplit(t, h, l, r, skip);
413 
414 	/* Make the root page look right. */
415 	if ((ISSET(t, BTF_RECNO) ?
416 	    bt_rroot(t, h, l, r) : bt_broot(t, h, l, r)) == RET_ERROR)
417 		return (NULL);
418 
419 	*lp = l;
420 	*rp = r;
421 	return (tp);
422 }
423 
424 /*
425  * BT_RROOT -- Fix up the recno root page after the split.
426  *
427  * Parameters:
428  *	t:	tree
429  *	h:	root page
430  *
431  * Returns:
432  *	RET_ERROR, RET_SUCCESS
433  */
434 static int
435 bt_rroot(t, h, l, r)
436 	BTREE *t;
437 	PAGE *h, *l, *r;
438 {
439 	char *dest;
440 
441 	/* Insert the left and right keys, set the header information. */
442 	h->linp[0] = h->upper = t->bt_psize - NRINTERNAL;
443 	dest = (char *)h + h->upper;
444 	WR_RINTERNAL(dest,
445 	    l->flags & P_RLEAF ? NEXTINDEX(l) : rec_total(l), l->pgno);
446 
447 	h->linp[1] = h->upper -= NRINTERNAL;
448 	dest = (char *)h + h->upper;
449 	WR_RINTERNAL(dest,
450 	    r->flags & P_RLEAF ? NEXTINDEX(r) : rec_total(r), r->pgno);
451 
452 	h->lower = BTDATAOFF + 2 * sizeof(index_t);
453 
454 	/* Unpin the root page, set to recno internal page. */
455 	h->flags &= ~P_TYPE;
456 	h->flags |= P_RINTERNAL;
457 	mpool_put(t->bt_mp, h, MPOOL_DIRTY);
458 
459 	return (RET_SUCCESS);
460 }
461 
462 /*
463  * BT_BROOT -- Fix up the btree root page after the split.
464  *
465  * Parameters:
466  *	t:	tree
467  *	h:	root page
468  *
469  * Returns:
470  *	RET_ERROR, RET_SUCCESS
471  */
472 static int
473 bt_broot(t, h, l, r)
474 	BTREE *t;
475 	PAGE *h, *l, *r;
476 {
477 	BINTERNAL *bi;
478 	BLEAF *bl;
479 	size_t nbytes;
480 	char *dest;
481 
482 	/*
483 	 * If the root page was a leaf page, change it into an internal page.
484 	 * We copy the key we split on (but not the key's data, in the case of
485 	 * a leaf page) to the new root page.  If the key is on an overflow
486 	 * page, mark the overflow chain so it isn't deleted when the leaf copy
487 	 * of the key is deleted.
488 	 *
489 	 * The btree comparison code guarantees that the left-most key on any
490 	 * level of the tree is never used, so it doesn't need to be filled
491 	 * in.  (This is not just convenience -- if the insert index is 0, we
492 	 * don't *have* a key to fill in.)  The right key is available because
493 	 * the split code guarantees not to split on the skipped index.
494 	 */
495 	nbytes = LALIGN(sizeof(size_t) + sizeof(pgno_t) + sizeof(u_char));
496 	h->linp[0] = h->upper = t->bt_psize - nbytes;
497 	dest = (char *)h + h->upper;
498 	WR_BINTERNAL(dest, 0, l->pgno, 0);
499 
500 	switch(h->flags & P_TYPE) {
501 	case P_BLEAF:
502 		bl = GETBLEAF(r, 0);
503 		nbytes = NBINTERNAL(bl->ksize);
504 		h->linp[1] = h->upper -= nbytes;
505 		dest = (char *)h + h->upper;
506 		WR_BINTERNAL(dest, bl->ksize, r->pgno, 0);
507 		bcopy(bl->bytes, dest, bl->ksize);
508 
509 		if (bl->flags & P_BIGKEY &&
510 		    bt_preserve(t, *(pgno_t *)bl->bytes) == RET_ERROR)
511 			return (RET_ERROR);
512 		break;
513 	case P_BINTERNAL:
514 		bi = GETBINTERNAL(r, 0);
515 		nbytes = NBINTERNAL(bi->ksize);
516 		h->linp[1] = h->upper -= nbytes;
517 		dest = (char *)h + h->upper;
518 		bcopy(bi, dest, nbytes);
519 		((BINTERNAL *)dest)->pgno = r->pgno;
520 		break;
521 	}
522 	h->lower = BTDATAOFF + 2 * sizeof(index_t);
523 
524 	/* Unpin the root page, set to btree internal page. */
525 	h->flags &= ~P_TYPE;
526 	h->flags |= P_BINTERNAL;
527 	mpool_put(t->bt_mp, h, MPOOL_DIRTY);
528 
529 	return (RET_SUCCESS);
530 }
531 
532 /*
533  * BT_PSPLIT -- Do the real work of splitting the page.
534  *
535  * Parameters:
536  *	t:	tree
537  *	h:	page to be split
538  *	l:	page to put lower half of data
539  *	r:	page to put upper half of data
540  *	skip:	pointer to index to leave open
541  *
542  * Returns:
543  *	Pointer to page in which to insert.
544  */
545 static PAGE *
546 bt_psplit(t, h, l, r, pskip)
547 	BTREE *t;
548 	PAGE *h, *l, *r;
549 	int *pskip;
550 {
551 	BINTERNAL *bi;
552 	BLEAF *bl;
553 	RLEAF *rl;
554 	EPGNO *c;
555 	PAGE *rval;
556 	index_t half, skip;
557 	size_t nbytes;
558 	void *src;
559 	int bigkeycnt, isbigkey, nxt, off, top;
560 
561 	/*
562 	 * Split the data to the left and right pages. Leave the skip index
563 	 * open and guarantee that the split doesn't happen on that index (the
564 	 * right key must be available for the parent page).  Additionally,
565 	 * make some effort not to split on an overflow key.  This makes it
566 	 * faster to process internal pages and can save space since overflow
567 	 * keys used by internal pages are never deleted.
568 	 */
569 	bigkeycnt = 0;
570 	skip = *pskip;
571 	half = (t->bt_psize - BTDATAOFF) / 2;
572 	for (nxt = off = 0, top = NEXTINDEX(h); nxt < top; ++off) {
573 		if (skip == off)
574 			continue;
575 		switch (h->flags & P_TYPE) {
576 		case P_BINTERNAL:
577 			src = bi = GETBINTERNAL(h, nxt);
578 			nbytes = NBINTERNAL(bi->ksize);
579 			isbigkey = bi->flags & P_BIGKEY;
580 			break;
581 		case P_BLEAF:
582 			src = bl = GETBLEAF(h, nxt);
583 			nbytes = NBLEAF(bl);
584 			isbigkey = bl->flags & P_BIGKEY;
585 			break;
586 		case P_RINTERNAL:
587 			src = GETRINTERNAL(h, nxt);
588 			nbytes = NRINTERNAL;
589 			isbigkey = 0;
590 			break;
591 		case P_RLEAF:
592 			src = rl = GETRLEAF(h, nxt);
593 			nbytes = NRLEAF(rl);
594 			isbigkey = 0;
595 			break;
596 		}
597 		++nxt;
598 		l->linp[off] = l->upper -= nbytes;
599 		bcopy(src, (char *)l + l->upper, nbytes);
600 
601 		/* There's no empirical justification for the '3'. */
602 		if (half < nbytes)
603 			if (!isbigkey || bigkeycnt == 3)
604 				break;
605 			else
606 				++bigkeycnt;
607 		else
608 			half -= nbytes;
609 	}
610 	l->lower += (off + 1) * sizeof(index_t);
611 
612 	/*
613 	 * If splitting the page that the cursor was on, the cursor has to be
614 	 * adjusted to point to the same record as before the split.  If the
615 	 * skipped slot and the cursor are both on the left page and the cursor
616 	 * is on or past the skipped slot, the cursor is incremented by one.
617 	 * If the skipped slot and the cursor are both on the right page and
618 	 * the cursor is on or past the skipped slot, the cursor is incremented
619 	 * by one.  If the skipped slot and the cursor aren't on the same page,
620 	 * the cursor isn't changed.  Regardless of the relationship of the
621 	 * skipped slot and the cursor, if the cursor is on the right page it
622 	 * is decremented by the number of records split to the left page.
623 	 *
624 	 * Don't bother checking for the BTF_SEQINIT flag, the page number will
625 	 * be P_INVALID.
626 	 */
627 	c = &t->bt_bcursor;
628 	if (c->pgno == h->pgno)
629 		if (c->index < off) {			/* left page */
630 			c->pgno = l->pgno;
631 			if (c->index >= skip)
632 				++c->index;
633 		} else {				/* right page */
634 			c->pgno = r->pgno;
635 			if (c->index >= skip && skip > off)
636 				++c->index;
637 			c->index -= off;
638 		}
639 
640 	/*
641 	 * Decide which page to return, and adjust the skip index if the
642 	 * to-be-inserted-upon page has changed.
643 	 */
644 	if (skip > off) {
645 		rval = r;
646 		*pskip -= off + 1;
647 	} else
648 		rval = l;
649 
650 	for (off = 0; nxt < top; ++off) {
651 		if (skip == nxt) {
652 			skip = 0;
653 			continue;
654 		}
655 		switch (h->flags & P_TYPE) {
656 		case P_BINTERNAL:
657 			src = bi = GETBINTERNAL(h, nxt);
658 			nbytes = NBINTERNAL(bi->ksize);
659 			break;
660 		case P_BLEAF:
661 			src = bl = GETBLEAF(h, nxt);
662 			nbytes = NBLEAF(bl);
663 			break;
664 		case P_RINTERNAL:
665 			src = GETRINTERNAL(h, nxt);
666 			nbytes = NRINTERNAL;
667 			break;
668 		case P_RLEAF:
669 			src = rl = GETRLEAF(h, nxt);
670 			nbytes = NRLEAF(rl);
671 			break;
672 		}
673 		++nxt;
674 		r->linp[off] = r->upper -= nbytes;
675 		bcopy(src, (char *)r + r->upper, nbytes);
676 	}
677 	r->lower += off * sizeof(index_t);
678 
679 	/* If the key is being appended to the page, adjust the index. */
680 	if (skip == top)
681 		r->lower += sizeof(index_t);
682 
683 	return (rval);
684 }
685 
686 /*
687  * BT_PRESERVE -- Mark a chain of pages as used by an internal node.
688  *
689  * Chains of indirect blocks pointed to by leaf nodes get reclaimed when the
690  * record that references them gets deleted.  Chains pointed to by internal
691  * pages never get deleted.  This routine marks a chain as pointed to by an
692  * internal page.
693  *
694  * Parameters:
695  *	t:	tree
696  *	pg:	page number of first page in the chain.
697  *
698  * Returns:
699  *	RET_SUCCESS, RET_ERROR.
700  */
701 static int
702 bt_preserve(t, pg)
703 	BTREE *t;
704 	pgno_t pg;
705 {
706 	PAGE *h;
707 
708 	if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
709 		return (RET_ERROR);
710 	h->flags |= P_PRESERVE;
711 	mpool_put(t->bt_mp, h, MPOOL_DIRTY);
712 	return (RET_SUCCESS);
713 }
714 
715 /*
716  * REC_TOTAL -- Return the number of recno entries below a page.
717  *
718  * Parameters:
719  *	h:	page
720  *
721  * Returns:
722  *	The number of recno entries below a page.
723  *
724  * XXX
725  * These values could be set by the bt_psplit routine.  The problem is that the
726  * entry has to be popped off of the stack etc. or the values have to be passed
727  * all the way back to bt_split/bt_rroot and it's not very clean.
728  */
729 static recno_t
730 rec_total(h)
731 	PAGE *h;
732 {
733 	recno_t recs;
734 	index_t nxt, top;
735 
736 	for (recs = 0, nxt = 0, top = NEXTINDEX(h); nxt < top; ++nxt)
737 		recs += GETRINTERNAL(h, nxt)->nrecs;
738 	return (recs);
739 }
740