1 /*-
2 * Copyright (c) 1990, 1993, 1994
3 * The Regents of the University of California. All rights reserved.
4 *
5 * This code is derived from software contributed to Berkeley by
6 * Mike Olson.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 3. Neither the name of the University nor the names of its contributors
17 * may be used to endorse or promote products derived from this software
18 * without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 *
32 * @(#)bt_split.c 8.10 (Berkeley) 1/9/95
33 * $FreeBSD: head/lib/libc/db/btree/bt_split.c 223262 2011-06-18 13:56:33Z benl $
34 */
35
36 #include <sys/types.h>
37 #include <sys/param.h>
38
39 #include <limits.h>
40 #include <stdio.h>
41 #include <stdlib.h>
42 #include <string.h>
43
44 #include <db.h>
45 #include "btree.h"
46
47 static int bt_broot(BTREE *, PAGE *, PAGE *, PAGE *);
48 static PAGE *bt_page(BTREE *, PAGE *, PAGE **, PAGE **, indx_t *, size_t);
49 static int bt_preserve(BTREE *, pgno_t);
50 static PAGE *bt_psplit(BTREE *, PAGE *, PAGE *, PAGE *, indx_t *, size_t);
51 static PAGE *bt_root(BTREE *, PAGE *, PAGE **, PAGE **, indx_t *, size_t);
52 static int bt_rroot(BTREE *, PAGE *, PAGE *, PAGE *);
53 static recno_t rec_total(PAGE *);
54
55 #ifdef STATISTICS
56 unsigned long bt_rootsplit, bt_split, bt_sortsplit, bt_pfxsaved;
57 #endif
58
59 /*
60 * __BT_SPLIT -- Split the tree.
61 *
62 * Parameters:
63 * t: tree
64 * sp: page to split
65 * key: key to insert
66 * data: data to insert
67 * flags: BIGKEY/BIGDATA flags
68 * ilen: insert length
69 * skip: index to leave open
70 *
71 * Returns:
72 * RET_ERROR, RET_SUCCESS
73 */
74 int
__bt_split(BTREE * t,PAGE * sp,const DBT * key,const DBT * data,int flags,size_t ilen,uint32_t argskip)75 __bt_split(BTREE *t, PAGE *sp, const DBT *key, const DBT *data, int flags,
76 size_t ilen, uint32_t argskip)
77 {
78 BINTERNAL *bi;
79 BLEAF *bl, *tbl;
80 DBT a, b;
81 EPGNO *parent;
82 PAGE *h, *l, *r, *lchild, *rchild;
83 indx_t nxtindex;
84 uint16_t skip;
85 uint32_t n, nbytes, nksize;
86 int parentsplit;
87 char *dest;
88
89 /*
90 * Split the page into two pages, l and r. The split routines return
91 * a pointer to the page into which the key should be inserted and with
92 * skip set to the offset which should be used. Additionally, l and r
93 * are pinned.
94 */
95 skip = argskip;
96 h = sp->pgno == P_ROOT ?
97 bt_root(t, sp, &l, &r, &skip, ilen) :
98 bt_page(t, sp, &l, &r, &skip, ilen);
99 if (h == NULL)
100 return (RET_ERROR);
101
102 /*
103 * Insert the new key/data pair into the leaf page. (Key inserts
104 * always cause a leaf page to split first.)
105 */
106 h->linp[skip] = h->upper -= ilen;
107 dest = (char *)h + h->upper;
108 if (F_ISSET(t, R_RECNO))
109 WR_RLEAF(dest, data, flags)
110 else
111 WR_BLEAF(dest, key, data, flags)
112
113 /* If the root page was split, make it look right. */
114 if (sp->pgno == P_ROOT &&
115 (F_ISSET(t, R_RECNO) ?
116 bt_rroot(t, sp, l, r) : bt_broot(t, sp, l, r)) == RET_ERROR)
117 goto err2;
118
119 /*
120 * Now we walk the parent page stack -- a LIFO stack of the pages that
121 * were traversed when we searched for the page that split. Each stack
122 * entry is a page number and a page index offset. The offset is for
123 * the page traversed on the search. We've just split a page, so we
124 * have to insert a new key into the parent page.
125 *
126 * If the insert into the parent page causes it to split, may have to
127 * continue splitting all the way up the tree. We stop if the root
128 * splits or the page inserted into didn't have to split to hold the
129 * new key. Some algorithms replace the key for the old page as well
130 * as the new page. We don't, as there's no reason to believe that the
131 * first key on the old page is any better than the key we have, and,
132 * in the case of a key being placed at index 0 causing the split, the
133 * key is unavailable.
134 *
135 * There are a maximum of 5 pages pinned at any time. We keep the left
136 * and right pages pinned while working on the parent. The 5 are the
137 * two children, left parent and right parent (when the parent splits)
138 * and the root page or the overflow key page when calling bt_preserve.
139 * This code must make sure that all pins are released other than the
140 * root page or overflow page which is unlocked elsewhere.
141 */
142 while ((parent = BT_POP(t)) != NULL) {
143 lchild = l;
144 rchild = r;
145
146 /* Get the parent page. */
147 if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL)
148 goto err2;
149
150 /*
151 * The new key goes ONE AFTER the index, because the split
152 * was to the right.
153 */
154 skip = parent->index + 1;
155
156 /*
157 * Calculate the space needed on the parent page.
158 *
159 * Prefix trees: space hack when inserting into BINTERNAL
160 * pages. Retain only what's needed to distinguish between
161 * the new entry and the LAST entry on the page to its left.
162 * If the keys compare equal, retain the entire key. Note,
163 * we don't touch overflow keys, and the entire key must be
164 * retained for the next-to-left most key on the leftmost
165 * page of each level, or the search will fail. Applicable
166 * ONLY to internal pages that have leaf pages as children.
167 * Further reduction of the key between pairs of internal
168 * pages loses too much information.
169 */
170 switch (rchild->flags & P_TYPE) {
171 case P_BINTERNAL:
172 bi = GETBINTERNAL(rchild, 0);
173 nbytes = NBINTERNAL(bi->ksize);
174 break;
175 case P_BLEAF:
176 bl = GETBLEAF(rchild, 0);
177 nbytes = NBINTERNAL(bl->ksize);
178 if (t->bt_pfx && !(bl->flags & P_BIGKEY) &&
179 (h->prevpg != P_INVALID || skip > 1)) {
180 tbl = GETBLEAF(lchild, NEXTINDEX(lchild) - 1);
181 a.size = tbl->ksize;
182 a.data = tbl->bytes;
183 b.size = bl->ksize;
184 b.data = bl->bytes;
185 nksize = t->bt_pfx(&a, &b);
186 n = NBINTERNAL(nksize);
187 if (n < nbytes) {
188 #ifdef STATISTICS
189 bt_pfxsaved += nbytes - n;
190 #endif
191 nbytes = n;
192 } else
193 nksize = 0;
194 } else
195 nksize = 0;
196 break;
197 case P_RINTERNAL:
198 case P_RLEAF:
199 nbytes = NRINTERNAL;
200 break;
201 default:
202 abort();
203 }
204
205 /* Split the parent page if necessary or shift the indices. */
206 if ((uint32_t)(h->upper - h->lower) < nbytes + sizeof(indx_t)) {
207 sp = h;
208 h = h->pgno == P_ROOT ?
209 bt_root(t, h, &l, &r, &skip, nbytes) :
210 bt_page(t, h, &l, &r, &skip, nbytes);
211 if (h == NULL)
212 goto err1;
213 parentsplit = 1;
214 } else {
215 if (skip < (nxtindex = NEXTINDEX(h)))
216 memmove(h->linp + skip + 1, h->linp + skip,
217 (nxtindex - skip) * sizeof(indx_t));
218 h->lower += sizeof(indx_t);
219 parentsplit = 0;
220 }
221
222 /* Insert the key into the parent page. */
223 switch (rchild->flags & P_TYPE) {
224 case P_BINTERNAL:
225 h->linp[skip] = h->upper -= nbytes;
226 dest = (char *)h + h->linp[skip];
227 memmove(dest, bi, nbytes);
228 ((BINTERNAL *)dest)->pgno = rchild->pgno;
229 break;
230 case P_BLEAF:
231 h->linp[skip] = h->upper -= nbytes;
232 dest = (char *)h + h->linp[skip];
233 WR_BINTERNAL(dest, nksize ? nksize : bl->ksize,
234 rchild->pgno, bl->flags & P_BIGKEY);
235 memmove(dest, bl->bytes, nksize ? nksize : bl->ksize);
236 if (bl->flags & P_BIGKEY &&
237 bt_preserve(t, *(pgno_t *)bl->bytes) == RET_ERROR)
238 goto err1;
239 break;
240 case P_RINTERNAL:
241 /*
242 * Update the left page count. If split
243 * added at index 0, fix the correct page.
244 */
245 if (skip > 0)
246 dest = (char *)h + h->linp[skip - 1];
247 else
248 dest = (char *)l + l->linp[NEXTINDEX(l) - 1];
249 ((RINTERNAL *)dest)->nrecs = rec_total(lchild);
250 ((RINTERNAL *)dest)->pgno = lchild->pgno;
251
252 /* Update the right page count. */
253 h->linp[skip] = h->upper -= nbytes;
254 dest = (char *)h + h->linp[skip];
255 ((RINTERNAL *)dest)->nrecs = rec_total(rchild);
256 ((RINTERNAL *)dest)->pgno = rchild->pgno;
257 break;
258 case P_RLEAF:
259 /*
260 * Update the left page count. If split
261 * added at index 0, fix the correct page.
262 */
263 if (skip > 0)
264 dest = (char *)h + h->linp[skip - 1];
265 else
266 dest = (char *)l + l->linp[NEXTINDEX(l) - 1];
267 ((RINTERNAL *)dest)->nrecs = NEXTINDEX(lchild);
268 ((RINTERNAL *)dest)->pgno = lchild->pgno;
269
270 /* Update the right page count. */
271 h->linp[skip] = h->upper -= nbytes;
272 dest = (char *)h + h->linp[skip];
273 ((RINTERNAL *)dest)->nrecs = NEXTINDEX(rchild);
274 ((RINTERNAL *)dest)->pgno = rchild->pgno;
275 break;
276 default:
277 abort();
278 }
279
280 /* Unpin the held pages. */
281 if (!parentsplit) {
282 mpool_put(t->bt_mp, h, MPOOL_DIRTY);
283 break;
284 }
285
286 /* If the root page was split, make it look right. */
287 if (sp->pgno == P_ROOT &&
288 (F_ISSET(t, R_RECNO) ?
289 bt_rroot(t, sp, l, r) : bt_broot(t, sp, l, r)) == RET_ERROR)
290 goto err1;
291
292 mpool_put(t->bt_mp, lchild, MPOOL_DIRTY);
293 mpool_put(t->bt_mp, rchild, MPOOL_DIRTY);
294 }
295
296 /* Unpin the held pages. */
297 mpool_put(t->bt_mp, l, MPOOL_DIRTY);
298 mpool_put(t->bt_mp, r, MPOOL_DIRTY);
299
300 /* Clear any pages left on the stack. */
301 return (RET_SUCCESS);
302
303 /*
304 * If something fails in the above loop we were already walking back
305 * up the tree and the tree is now inconsistent. Nothing much we can
306 * do about it but release any memory we're holding.
307 */
308 err1: mpool_put(t->bt_mp, lchild, MPOOL_DIRTY);
309 mpool_put(t->bt_mp, rchild, MPOOL_DIRTY);
310
311 err2: mpool_put(t->bt_mp, l, 0);
312 mpool_put(t->bt_mp, r, 0);
313 __dbpanic(t->bt_dbp);
314 return (RET_ERROR);
315 }
316
317 /*
318 * BT_PAGE -- Split a non-root page of a btree.
319 *
320 * Parameters:
321 * t: tree
322 * h: root page
323 * lp: pointer to left page pointer
324 * rp: pointer to right page pointer
325 * skip: pointer to index to leave open
326 * ilen: insert length
327 *
328 * Returns:
329 * Pointer to page in which to insert or NULL on error.
330 */
331 static PAGE *
bt_page(BTREE * t,PAGE * h,PAGE ** lp,PAGE ** rp,indx_t * skip,size_t ilen)332 bt_page(BTREE *t, PAGE *h, PAGE **lp, PAGE **rp, indx_t *skip, size_t ilen)
333 {
334 PAGE *l, *r, *tp;
335 pgno_t npg;
336
337 #ifdef STATISTICS
338 ++bt_split;
339 #endif
340 /* Put the new right page for the split into place. */
341 if ((r = __bt_new(t, &npg)) == NULL)
342 return (NULL);
343 r->pgno = npg;
344 r->lower = BTDATAOFF;
345 r->upper = t->bt_psize;
346 r->nextpg = h->nextpg;
347 r->prevpg = h->pgno;
348 r->flags = h->flags & P_TYPE;
349
350 /*
351 * If we're splitting the last page on a level because we're appending
352 * a key to it (skip is NEXTINDEX()), it's likely that the data is
353 * sorted. Adding an empty page on the side of the level is less work
354 * and can push the fill factor much higher than normal. If we're
355 * wrong it's no big deal, we'll just do the split the right way next
356 * time. It may look like it's equally easy to do a similar hack for
357 * reverse sorted data, that is, split the tree left, but it's not.
358 * Don't even try.
359 */
360 if (h->nextpg == P_INVALID && *skip == NEXTINDEX(h)) {
361 #ifdef STATISTICS
362 ++bt_sortsplit;
363 #endif
364 h->nextpg = r->pgno;
365 r->lower = BTDATAOFF + sizeof(indx_t);
366 *skip = 0;
367 *lp = h;
368 *rp = r;
369 return (r);
370 }
371
372 /* Put the new left page for the split into place. */
373 if ((l = (PAGE *)calloc(1, t->bt_psize)) == NULL) {
374 mpool_put(t->bt_mp, r, 0);
375 return (NULL);
376 }
377 l->pgno = h->pgno;
378 l->nextpg = r->pgno;
379 l->prevpg = h->prevpg;
380 l->lower = BTDATAOFF;
381 l->upper = t->bt_psize;
382 l->flags = h->flags & P_TYPE;
383
384 /* Fix up the previous pointer of the page after the split page. */
385 if (h->nextpg != P_INVALID) {
386 if ((tp = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL) {
387 free(l);
388 /* XXX mpool_free(t->bt_mp, r->pgno); */
389 return (NULL);
390 }
391 tp->prevpg = r->pgno;
392 mpool_put(t->bt_mp, tp, MPOOL_DIRTY);
393 }
394
395 /*
396 * Split right. The key/data pairs aren't sorted in the btree page so
397 * it's simpler to copy the data from the split page onto two new pages
398 * instead of copying half the data to the right page and compacting
399 * the left page in place. Since the left page can't change, we have
400 * to swap the original and the allocated left page after the split.
401 */
402 tp = bt_psplit(t, h, l, r, skip, ilen);
403
404 /* Move the new left page onto the old left page. */
405 memmove(h, l, t->bt_psize);
406 if (tp == l)
407 tp = h;
408 free(l);
409
410 *lp = h;
411 *rp = r;
412 return (tp);
413 }
414
415 /*
416 * BT_ROOT -- Split the root page of a btree.
417 *
418 * Parameters:
419 * t: tree
420 * h: root page
421 * lp: pointer to left page pointer
422 * rp: pointer to right page pointer
423 * skip: pointer to index to leave open
424 * ilen: insert length
425 *
426 * Returns:
427 * Pointer to page in which to insert or NULL on error.
428 */
429 static PAGE *
bt_root(BTREE * t,PAGE * h,PAGE ** lp,PAGE ** rp,indx_t * skip,size_t ilen)430 bt_root(BTREE *t, PAGE *h, PAGE **lp, PAGE **rp, indx_t *skip, size_t ilen)
431 {
432 PAGE *l, *r, *tp;
433 pgno_t lnpg, rnpg;
434
435 #ifdef STATISTICS
436 ++bt_split;
437 ++bt_rootsplit;
438 #endif
439 /* Put the new left and right pages for the split into place. */
440 if ((l = __bt_new(t, &lnpg)) == NULL ||
441 (r = __bt_new(t, &rnpg)) == NULL)
442 return (NULL);
443 l->pgno = lnpg;
444 r->pgno = rnpg;
445 l->nextpg = r->pgno;
446 r->prevpg = l->pgno;
447 l->prevpg = r->nextpg = P_INVALID;
448 l->lower = r->lower = BTDATAOFF;
449 l->upper = r->upper = t->bt_psize;
450 l->flags = r->flags = h->flags & P_TYPE;
451
452 /* Split the root page. */
453 tp = bt_psplit(t, h, l, r, skip, ilen);
454
455 *lp = l;
456 *rp = r;
457 return (tp);
458 }
459
460 /*
461 * BT_RROOT -- Fix up the recno root page after it has been split.
462 *
463 * Parameters:
464 * t: tree
465 * h: root page
466 * l: left page
467 * r: right page
468 *
469 * Returns:
470 * RET_ERROR, RET_SUCCESS
471 */
472 static int
bt_rroot(BTREE * t,PAGE * h,PAGE * l,PAGE * r)473 bt_rroot(BTREE *t, PAGE *h, PAGE *l, PAGE *r)
474 {
475 char *dest;
476
477 /* Insert the left and right keys, set the header information. */
478 h->linp[0] = h->upper = t->bt_psize - NRINTERNAL;
479 dest = (char *)h + h->upper;
480 WR_RINTERNAL(dest,
481 l->flags & P_RLEAF ? NEXTINDEX(l) : rec_total(l), l->pgno);
482
483 __PAST_END(h->linp, 1) = h->upper -= NRINTERNAL;
484 dest = (char *)h + h->upper;
485 WR_RINTERNAL(dest,
486 r->flags & P_RLEAF ? NEXTINDEX(r) : rec_total(r), r->pgno);
487
488 h->lower = BTDATAOFF + 2 * sizeof(indx_t);
489
490 /* Unpin the root page, set to recno internal page. */
491 h->flags &= ~P_TYPE;
492 h->flags |= P_RINTERNAL;
493 mpool_put(t->bt_mp, h, MPOOL_DIRTY);
494
495 return (RET_SUCCESS);
496 }
497
498 /*
499 * BT_BROOT -- Fix up the btree root page after it has been split.
500 *
501 * Parameters:
502 * t: tree
503 * h: root page
504 * l: left page
505 * r: right page
506 *
507 * Returns:
508 * RET_ERROR, RET_SUCCESS
509 */
510 static int
bt_broot(BTREE * t,PAGE * h,PAGE * l,PAGE * r)511 bt_broot(BTREE *t, PAGE *h, PAGE *l, PAGE *r)
512 {
513 BINTERNAL *bi;
514 BLEAF *bl;
515 uint32_t nbytes;
516 char *dest;
517
518 /*
519 * If the root page was a leaf page, change it into an internal page.
520 * We copy the key we split on (but not the key's data, in the case of
521 * a leaf page) to the new root page.
522 *
523 * The btree comparison code guarantees that the left-most key on any
524 * level of the tree is never used, so it doesn't need to be filled in.
525 */
526 nbytes = NBINTERNAL(0);
527 h->linp[0] = h->upper = t->bt_psize - nbytes;
528 dest = (char *)h + h->upper;
529 WR_BINTERNAL(dest, 0, l->pgno, 0);
530
531 switch (h->flags & P_TYPE) {
532 case P_BLEAF:
533 bl = GETBLEAF(r, 0);
534 nbytes = NBINTERNAL(bl->ksize);
535 __PAST_END(h->linp, 1) = h->upper -= nbytes;
536 dest = (char *)h + h->upper;
537 WR_BINTERNAL(dest, bl->ksize, r->pgno, 0);
538 memmove(dest, bl->bytes, bl->ksize);
539
540 /*
541 * If the key is on an overflow page, mark the overflow chain
542 * so it isn't deleted when the leaf copy of the key is deleted.
543 */
544 if (bl->flags & P_BIGKEY &&
545 bt_preserve(t, *(pgno_t *)bl->bytes) == RET_ERROR)
546 return (RET_ERROR);
547 break;
548 case P_BINTERNAL:
549 bi = GETBINTERNAL(r, 0);
550 nbytes = NBINTERNAL(bi->ksize);
551 __PAST_END(h->linp, 1) = h->upper -= nbytes;
552 dest = (char *)h + h->upper;
553 memmove(dest, bi, nbytes);
554 ((BINTERNAL *)dest)->pgno = r->pgno;
555 break;
556 default:
557 abort();
558 }
559
560 /* There are two keys on the page. */
561 h->lower = BTDATAOFF + 2 * sizeof(indx_t);
562
563 /* Unpin the root page, set to btree internal page. */
564 h->flags &= ~P_TYPE;
565 h->flags |= P_BINTERNAL;
566 mpool_put(t->bt_mp, h, MPOOL_DIRTY);
567
568 return (RET_SUCCESS);
569 }
570
571 /*
572 * BT_PSPLIT -- Do the real work of splitting the page.
573 *
574 * Parameters:
575 * t: tree
576 * h: page to be split
577 * l: page to put lower half of data
578 * r: page to put upper half of data
579 * pskip: pointer to index to leave open
580 * ilen: insert length
581 *
582 * Returns:
583 * Pointer to page in which to insert.
584 */
585 static PAGE *
bt_psplit(BTREE * t,PAGE * h,PAGE * l,PAGE * r,indx_t * pskip,size_t ilen)586 bt_psplit(BTREE *t, PAGE *h, PAGE *l, PAGE *r, indx_t *pskip, size_t ilen)
587 {
588 BINTERNAL *bi;
589 BLEAF *bl;
590 CURSOR *c;
591 RLEAF *rl;
592 PAGE *rval;
593 void *src;
594 indx_t full, half, nxt, off, skip, top, used;
595 uint32_t nbytes;
596 int bigkeycnt, isbigkey;
597
598 /*
599 * Split the data to the left and right pages. Leave the skip index
600 * open. Additionally, make some effort not to split on an overflow
601 * key. This makes internal page processing faster and can save
602 * space as overflow keys used by internal pages are never deleted.
603 */
604 bigkeycnt = 0;
605 skip = *pskip;
606 full = t->bt_psize - BTDATAOFF;
607 half = full / 2;
608 used = 0;
609 for (nxt = off = 0, top = NEXTINDEX(h); nxt < top; ++off) {
610 if (skip == off) {
611 nbytes = ilen;
612 isbigkey = 0; /* XXX: not really known. */
613 } else
614 switch (h->flags & P_TYPE) {
615 case P_BINTERNAL:
616 src = bi = GETBINTERNAL(h, nxt);
617 nbytes = NBINTERNAL(bi->ksize);
618 isbigkey = bi->flags & P_BIGKEY;
619 break;
620 case P_BLEAF:
621 src = bl = GETBLEAF(h, nxt);
622 nbytes = NBLEAF(bl);
623 isbigkey = bl->flags & P_BIGKEY;
624 break;
625 case P_RINTERNAL:
626 src = GETRINTERNAL(h, nxt);
627 nbytes = NRINTERNAL;
628 isbigkey = 0;
629 break;
630 case P_RLEAF:
631 src = rl = GETRLEAF(h, nxt);
632 nbytes = NRLEAF(rl);
633 isbigkey = 0;
634 break;
635 default:
636 abort();
637 }
638
639 /*
640 * If the key/data pairs are substantial fractions of the max
641 * possible size for the page, it's possible to get situations
642 * where we decide to try and copy too much onto the left page.
643 * Make sure that doesn't happen.
644 */
645 if ((skip <= off && used + nbytes + sizeof(indx_t) >= full) ||
646 nxt == top - 1) {
647 --off;
648 break;
649 }
650
651 /* Copy the key/data pair, if not the skipped index. */
652 if (skip != off) {
653 ++nxt;
654
655 l->linp[off] = l->upper -= nbytes;
656 memmove((char *)l + l->upper, src, nbytes);
657 }
658
659 used += nbytes + sizeof(indx_t);
660 if (used >= half) {
661 if (!isbigkey || bigkeycnt == 3)
662 break;
663 else
664 ++bigkeycnt;
665 }
666 }
667
668 /*
669 * Off is the last offset that's valid for the left page.
670 * Nxt is the first offset to be placed on the right page.
671 */
672 l->lower += (off + 1) * sizeof(indx_t);
673
674 /*
675 * If splitting the page that the cursor was on, the cursor has to be
676 * adjusted to point to the same record as before the split. If the
677 * cursor is at or past the skipped slot, the cursor is incremented by
678 * one. If the cursor is on the right page, it is decremented by the
679 * number of records split to the left page.
680 */
681 c = &t->bt_cursor;
682 if (F_ISSET(c, CURS_INIT) && c->pg.pgno == h->pgno) {
683 if (c->pg.index >= skip)
684 ++c->pg.index;
685 if (c->pg.index < nxt) /* Left page. */
686 c->pg.pgno = l->pgno;
687 else { /* Right page. */
688 c->pg.pgno = r->pgno;
689 c->pg.index -= nxt;
690 }
691 }
692
693 /*
694 * If the skipped index was on the left page, just return that page.
695 * Otherwise, adjust the skip index to reflect the new position on
696 * the right page.
697 */
698 if (skip <= off) {
699 skip = MAX_PAGE_OFFSET;
700 rval = l;
701 } else {
702 rval = r;
703 *pskip -= nxt;
704 }
705
706 for (off = 0; nxt < top; ++off) {
707 if (skip == nxt) {
708 ++off;
709 skip = MAX_PAGE_OFFSET;
710 }
711 switch (h->flags & P_TYPE) {
712 case P_BINTERNAL:
713 src = bi = GETBINTERNAL(h, nxt);
714 nbytes = NBINTERNAL(bi->ksize);
715 break;
716 case P_BLEAF:
717 src = bl = GETBLEAF(h, nxt);
718 nbytes = NBLEAF(bl);
719 break;
720 case P_RINTERNAL:
721 src = GETRINTERNAL(h, nxt);
722 nbytes = NRINTERNAL;
723 break;
724 case P_RLEAF:
725 src = rl = GETRLEAF(h, nxt);
726 nbytes = NRLEAF(rl);
727 break;
728 default:
729 abort();
730 }
731 ++nxt;
732 r->linp[off] = r->upper -= nbytes;
733 memmove((char *)r + r->upper, src, nbytes);
734 }
735 r->lower += off * sizeof(indx_t);
736
737 /* If the key is being appended to the page, adjust the index. */
738 if (skip == top)
739 r->lower += sizeof(indx_t);
740
741 return (rval);
742 }
743
744 /*
745 * BT_PRESERVE -- Mark a chain of pages as used by an internal node.
746 *
747 * Chains of indirect blocks pointed to by leaf nodes get reclaimed when the
748 * record that references them gets deleted. Chains pointed to by internal
749 * pages never get deleted. This routine marks a chain as pointed to by an
750 * internal page.
751 *
752 * Parameters:
753 * t: tree
754 * pg: page number of first page in the chain.
755 *
756 * Returns:
757 * RET_SUCCESS, RET_ERROR.
758 */
759 static int
bt_preserve(BTREE * t,pgno_t pg)760 bt_preserve(BTREE *t, pgno_t pg)
761 {
762 PAGE *h;
763
764 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
765 return (RET_ERROR);
766 h->flags |= P_PRESERVE;
767 mpool_put(t->bt_mp, h, MPOOL_DIRTY);
768 return (RET_SUCCESS);
769 }
770
771 /*
772 * REC_TOTAL -- Return the number of recno entries below a page.
773 *
774 * Parameters:
775 * h: page
776 *
777 * Returns:
778 * The number of recno entries below a page.
779 *
780 * XXX
781 * These values could be set by the bt_psplit routine. The problem is that the
782 * entry has to be popped off of the stack etc. or the values have to be passed
783 * all the way back to bt_split/bt_rroot and it's not very clean.
784 */
785 static recno_t
rec_total(PAGE * h)786 rec_total(PAGE *h)
787 {
788 recno_t recs;
789 indx_t nxt, top;
790
791 for (recs = 0, nxt = 0, top = NEXTINDEX(h); nxt < top; ++nxt)
792 recs += GETRINTERNAL(h, nxt)->nrecs;
793 return (recs);
794 }
795