1 /* $OpenBSD: bt_split.c,v 1.13 2005/08/05 13:03:00 espie Exp $ */
2
3 /*-
4 * Copyright (c) 1990, 1993, 1994
5 * The Regents of the University of California. All rights reserved.
6 *
7 * This code is derived from software contributed to Berkeley by
8 * Mike Olson.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its contributors
19 * may be used to endorse or promote products derived from this software
20 * without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 */
34
35 #include <sys/types.h>
36
37 #include <limits.h>
38 #include <stdio.h>
39 #include <stdlib.h>
40 #include <string.h>
41
42 #include <db.h>
43 #include "btree.h"
44
45 static int bt_broot(BTREE *, PAGE *, PAGE *, PAGE *);
46 static PAGE *bt_page(BTREE *, PAGE *, PAGE **, PAGE **, indx_t *, size_t);
47 static int bt_preserve(BTREE *, pgno_t);
48 static PAGE *bt_psplit(BTREE *, PAGE *, PAGE *, PAGE *, indx_t *, size_t);
49 static PAGE *bt_root(BTREE *, PAGE *, PAGE **, PAGE **, indx_t *, size_t);
50 static int bt_rroot(BTREE *, PAGE *, PAGE *, PAGE *);
51 static recno_t rec_total(PAGE *);
52
53 #ifdef STATISTICS
54 u_long bt_rootsplit, bt_split, bt_sortsplit, bt_pfxsaved;
55 #endif
56
57 /*
58 * __BT_SPLIT -- Split the tree.
59 *
60 * Parameters:
61 * t: tree
62 * sp: page to split
63 * key: key to insert
64 * data: data to insert
65 * flags: BIGKEY/BIGDATA flags
66 * ilen: insert length
67 * skip: index to leave open
68 *
69 * Returns:
70 * RET_ERROR, RET_SUCCESS
71 */
72 int
__bt_split(BTREE * t,PAGE * sp,const DBT * key,const DBT * data,int flags,size_t ilen,u_int32_t argskip)73 __bt_split(BTREE *t, PAGE *sp, const DBT *key, const DBT *data, int flags,
74 size_t ilen, u_int32_t argskip)
75 {
76 BINTERNAL *bi;
77 BLEAF *bl, *tbl;
78 DBT a, b;
79 EPGNO *parent;
80 PAGE *h, *l, *r, *lchild, *rchild;
81 indx_t nxtindex;
82 u_int16_t skip;
83 u_int32_t n, nbytes, nksize;
84 int parentsplit;
85 char *dest;
86
87 /*
88 * Split the page into two pages, l and r. The split routines return
89 * a pointer to the page into which the key should be inserted and with
90 * skip set to the offset which should be used. Additionally, l and r
91 * are pinned.
92 */
93 skip = argskip;
94 h = sp->pgno == P_ROOT ?
95 bt_root(t, sp, &l, &r, &skip, ilen) :
96 bt_page(t, sp, &l, &r, &skip, ilen);
97 if (h == NULL)
98 return (RET_ERROR);
99
100 /*
101 * Insert the new key/data pair into the leaf page. (Key inserts
102 * always cause a leaf page to split first.)
103 */
104 h->linp[skip] = h->upper -= ilen;
105 dest = (char *)h + h->upper;
106 if (F_ISSET(t, R_RECNO))
107 WR_RLEAF(dest, data, flags)
108 else
109 WR_BLEAF(dest, key, data, flags)
110
111 /* If the root page was split, make it look right. */
112 if (sp->pgno == P_ROOT &&
113 (F_ISSET(t, R_RECNO) ?
114 bt_rroot(t, sp, l, r) : bt_broot(t, sp, l, r)) == RET_ERROR)
115 goto err2;
116
117 /*
118 * Now we walk the parent page stack -- a LIFO stack of the pages that
119 * were traversed when we searched for the page that split. Each stack
120 * entry is a page number and a page index offset. The offset is for
121 * the page traversed on the search. We've just split a page, so we
122 * have to insert a new key into the parent page.
123 *
124 * If the insert into the parent page causes it to split, may have to
125 * continue splitting all the way up the tree. We stop if the root
126 * splits or the page inserted into didn't have to split to hold the
127 * new key. Some algorithms replace the key for the old page as well
128 * as the new page. We don't, as there's no reason to believe that the
129 * first key on the old page is any better than the key we have, and,
130 * in the case of a key being placed at index 0 causing the split, the
131 * key is unavailable.
132 *
133 * There are a maximum of 5 pages pinned at any time. We keep the left
134 * and right pages pinned while working on the parent. The 5 are the
135 * two children, left parent and right parent (when the parent splits)
136 * and the root page or the overflow key page when calling bt_preserve.
137 * This code must make sure that all pins are released other than the
138 * root page or overflow page which is unlocked elsewhere.
139 */
140 while ((parent = BT_POP(t)) != NULL) {
141 lchild = l;
142 rchild = r;
143
144 /* Get the parent page. */
145 if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL)
146 goto err2;
147
148 /*
149 * The new key goes ONE AFTER the index, because the split
150 * was to the right.
151 */
152 skip = parent->index + 1;
153
154 /*
155 * Calculate the space needed on the parent page.
156 *
157 * Prefix trees: space hack when inserting into BINTERNAL
158 * pages. Retain only what's needed to distinguish between
159 * the new entry and the LAST entry on the page to its left.
160 * If the keys compare equal, retain the entire key. Note,
161 * we don't touch overflow keys, and the entire key must be
162 * retained for the next-to-left most key on the leftmost
163 * page of each level, or the search will fail. Applicable
164 * ONLY to internal pages that have leaf pages as children.
165 * Further reduction of the key between pairs of internal
166 * pages loses too much information.
167 */
168 switch (rchild->flags & P_TYPE) {
169 case P_BINTERNAL:
170 bi = GETBINTERNAL(rchild, 0);
171 nbytes = NBINTERNAL(bi->ksize);
172 break;
173 case P_BLEAF:
174 bl = GETBLEAF(rchild, 0);
175 nbytes = NBINTERNAL(bl->ksize);
176 if (t->bt_pfx && !(bl->flags & P_BIGKEY) &&
177 (h->prevpg != P_INVALID || skip > 1)) {
178 tbl = GETBLEAF(lchild, NEXTINDEX(lchild) - 1);
179 a.size = tbl->ksize;
180 a.data = tbl->bytes;
181 b.size = bl->ksize;
182 b.data = bl->bytes;
183 nksize = t->bt_pfx(&a, &b);
184 n = NBINTERNAL(nksize);
185 if (n < nbytes) {
186 #ifdef STATISTICS
187 bt_pfxsaved += nbytes - n;
188 #endif
189 nbytes = n;
190 } else
191 nksize = 0;
192 } else
193 nksize = 0;
194 break;
195 case P_RINTERNAL:
196 case P_RLEAF:
197 nbytes = NRINTERNAL;
198 break;
199 default:
200 abort();
201 }
202
203 /* Split the parent page if necessary or shift the indices. */
204 if (h->upper - h->lower < nbytes + sizeof(indx_t)) {
205 sp = h;
206 h = h->pgno == P_ROOT ?
207 bt_root(t, h, &l, &r, &skip, nbytes) :
208 bt_page(t, h, &l, &r, &skip, nbytes);
209 if (h == NULL)
210 goto err1;
211 parentsplit = 1;
212 } else {
213 if (skip < (nxtindex = NEXTINDEX(h)))
214 memmove(h->linp + skip + 1, h->linp + skip,
215 (nxtindex - skip) * sizeof(indx_t));
216 h->lower += sizeof(indx_t);
217 parentsplit = 0;
218 }
219
220 /* Insert the key into the parent page. */
221 switch (rchild->flags & P_TYPE) {
222 case P_BINTERNAL:
223 h->linp[skip] = h->upper -= nbytes;
224 dest = (char *)h + h->linp[skip];
225 memmove(dest, bi, nbytes);
226 ((BINTERNAL *)dest)->pgno = rchild->pgno;
227 break;
228 case P_BLEAF:
229 h->linp[skip] = h->upper -= nbytes;
230 dest = (char *)h + h->linp[skip];
231 WR_BINTERNAL(dest, nksize ? nksize : bl->ksize,
232 rchild->pgno, bl->flags & P_BIGKEY);
233 memmove(dest, bl->bytes, nksize ? nksize : bl->ksize);
234 if (bl->flags & P_BIGKEY &&
235 bt_preserve(t, *(pgno_t *)bl->bytes) == RET_ERROR)
236 goto err1;
237 break;
238 case P_RINTERNAL:
239 /*
240 * Update the left page count. If split
241 * added at index 0, fix the correct page.
242 */
243 if (skip > 0)
244 dest = (char *)h + h->linp[skip - 1];
245 else
246 dest = (char *)l + l->linp[NEXTINDEX(l) - 1];
247 ((RINTERNAL *)dest)->nrecs = rec_total(lchild);
248 ((RINTERNAL *)dest)->pgno = lchild->pgno;
249
250 /* Update the right page count. */
251 h->linp[skip] = h->upper -= nbytes;
252 dest = (char *)h + h->linp[skip];
253 ((RINTERNAL *)dest)->nrecs = rec_total(rchild);
254 ((RINTERNAL *)dest)->pgno = rchild->pgno;
255 break;
256 case P_RLEAF:
257 /*
258 * Update the left page count. If split
259 * added at index 0, fix the correct page.
260 */
261 if (skip > 0)
262 dest = (char *)h + h->linp[skip - 1];
263 else
264 dest = (char *)l + l->linp[NEXTINDEX(l) - 1];
265 ((RINTERNAL *)dest)->nrecs = NEXTINDEX(lchild);
266 ((RINTERNAL *)dest)->pgno = lchild->pgno;
267
268 /* Update the right page count. */
269 h->linp[skip] = h->upper -= nbytes;
270 dest = (char *)h + h->linp[skip];
271 ((RINTERNAL *)dest)->nrecs = NEXTINDEX(rchild);
272 ((RINTERNAL *)dest)->pgno = rchild->pgno;
273 break;
274 default:
275 abort();
276 }
277
278 /* Unpin the held pages. */
279 if (!parentsplit) {
280 mpool_put(t->bt_mp, h, MPOOL_DIRTY);
281 break;
282 }
283
284 /* If the root page was split, make it look right. */
285 if (sp->pgno == P_ROOT &&
286 (F_ISSET(t, R_RECNO) ?
287 bt_rroot(t, sp, l, r) : bt_broot(t, sp, l, r)) == RET_ERROR)
288 goto err1;
289
290 mpool_put(t->bt_mp, lchild, MPOOL_DIRTY);
291 mpool_put(t->bt_mp, rchild, MPOOL_DIRTY);
292 }
293
294 /* Unpin the held pages. */
295 mpool_put(t->bt_mp, l, MPOOL_DIRTY);
296 mpool_put(t->bt_mp, r, MPOOL_DIRTY);
297
298 /* Clear any pages left on the stack. */
299 return (RET_SUCCESS);
300
301 /*
302 * If something fails in the above loop we were already walking back
303 * up the tree and the tree is now inconsistent. Nothing much we can
304 * do about it but release any memory we're holding.
305 */
306 err1: mpool_put(t->bt_mp, lchild, MPOOL_DIRTY);
307 mpool_put(t->bt_mp, rchild, MPOOL_DIRTY);
308
309 err2: mpool_put(t->bt_mp, l, 0);
310 mpool_put(t->bt_mp, r, 0);
311 __dbpanic(t->bt_dbp);
312 return (RET_ERROR);
313 }
314
315 /*
316 * BT_PAGE -- Split a non-root page of a btree.
317 *
318 * Parameters:
319 * t: tree
320 * h: root page
321 * lp: pointer to left page pointer
322 * rp: pointer to right page pointer
323 * skip: pointer to index to leave open
324 * ilen: insert length
325 *
326 * Returns:
327 * Pointer to page in which to insert or NULL on error.
328 */
329 static PAGE *
bt_page(BTREE * t,PAGE * h,PAGE ** lp,PAGE ** rp,indx_t * skip,size_t ilen)330 bt_page(BTREE *t, PAGE *h, PAGE **lp, PAGE **rp, indx_t *skip, size_t ilen)
331 {
332 PAGE *l, *r, *tp;
333 pgno_t npg;
334
335 #ifdef STATISTICS
336 ++bt_split;
337 #endif
338 /* Put the new right page for the split into place. */
339 if ((r = __bt_new(t, &npg)) == NULL)
340 return (NULL);
341 r->pgno = npg;
342 r->lower = BTDATAOFF;
343 r->upper = t->bt_psize;
344 r->nextpg = h->nextpg;
345 r->prevpg = h->pgno;
346 r->flags = h->flags & P_TYPE;
347
348 /*
349 * If we're splitting the last page on a level because we're appending
350 * a key to it (skip is NEXTINDEX()), it's likely that the data is
351 * sorted. Adding an empty page on the side of the level is less work
352 * and can push the fill factor much higher than normal. If we're
353 * wrong it's no big deal, we'll just do the split the right way next
354 * time. It may look like it's equally easy to do a similar hack for
355 * reverse sorted data, that is, split the tree left, but it's not.
356 * Don't even try.
357 */
358 if (h->nextpg == P_INVALID && *skip == NEXTINDEX(h)) {
359 #ifdef STATISTICS
360 ++bt_sortsplit;
361 #endif
362 h->nextpg = r->pgno;
363 r->lower = BTDATAOFF + sizeof(indx_t);
364 *skip = 0;
365 *lp = h;
366 *rp = r;
367 return (r);
368 }
369
370 /* Put the new left page for the split into place. */
371 if ((l = (PAGE *)malloc(t->bt_psize)) == NULL) {
372 mpool_put(t->bt_mp, r, 0);
373 return (NULL);
374 }
375 memset(l, 0xff, t->bt_psize);
376 l->pgno = h->pgno;
377 l->nextpg = r->pgno;
378 l->prevpg = h->prevpg;
379 l->lower = BTDATAOFF;
380 l->upper = t->bt_psize;
381 l->flags = h->flags & P_TYPE;
382
383 /* Fix up the previous pointer of the page after the split page. */
384 if (h->nextpg != P_INVALID) {
385 if ((tp = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL) {
386 free(l);
387 /* XXX mpool_free(t->bt_mp, r->pgno); */
388 return (NULL);
389 }
390 tp->prevpg = r->pgno;
391 mpool_put(t->bt_mp, tp, MPOOL_DIRTY);
392 }
393
394 /*
395 * Split right. The key/data pairs aren't sorted in the btree page so
396 * it's simpler to copy the data from the split page onto two new pages
397 * instead of copying half the data to the right page and compacting
398 * the left page in place. Since the left page can't change, we have
399 * to swap the original and the allocated left page after the split.
400 */
401 tp = bt_psplit(t, h, l, r, skip, ilen);
402
403 /* Move the new left page onto the old left page. */
404 memmove(h, l, t->bt_psize);
405 if (tp == l)
406 tp = h;
407 free(l);
408
409 *lp = h;
410 *rp = r;
411 return (tp);
412 }
413
414 /*
415 * BT_ROOT -- Split the root page of a btree.
416 *
417 * Parameters:
418 * t: tree
419 * h: root page
420 * lp: pointer to left page pointer
421 * rp: pointer to right page pointer
422 * skip: pointer to index to leave open
423 * ilen: insert length
424 *
425 * Returns:
426 * Pointer to page in which to insert or NULL on error.
427 */
428 static PAGE *
bt_root(BTREE * t,PAGE * h,PAGE ** lp,PAGE ** rp,indx_t * skip,size_t ilen)429 bt_root(BTREE *t, PAGE *h, PAGE **lp, PAGE **rp, indx_t *skip, size_t ilen)
430 {
431 PAGE *l, *r, *tp;
432 pgno_t lnpg, rnpg;
433
434 #ifdef STATISTICS
435 ++bt_split;
436 ++bt_rootsplit;
437 #endif
438 /* Put the new left and right pages for the split into place. */
439 if ((l = __bt_new(t, &lnpg)) == NULL ||
440 (r = __bt_new(t, &rnpg)) == NULL)
441 return (NULL);
442 l->pgno = lnpg;
443 r->pgno = rnpg;
444 l->nextpg = r->pgno;
445 r->prevpg = l->pgno;
446 l->prevpg = r->nextpg = P_INVALID;
447 l->lower = r->lower = BTDATAOFF;
448 l->upper = r->upper = t->bt_psize;
449 l->flags = r->flags = h->flags & P_TYPE;
450
451 /* Split the root page. */
452 tp = bt_psplit(t, h, l, r, skip, ilen);
453
454 *lp = l;
455 *rp = r;
456 return (tp);
457 }
458
459 /*
460 * BT_RROOT -- Fix up the recno root page after it has been split.
461 *
462 * Parameters:
463 * t: tree
464 * h: root page
465 * l: left page
466 * r: right page
467 *
468 * Returns:
469 * RET_ERROR, RET_SUCCESS
470 */
471 static int
bt_rroot(BTREE * t,PAGE * h,PAGE * l,PAGE * r)472 bt_rroot(BTREE *t, PAGE *h, PAGE *l, PAGE *r)
473 {
474 char *dest;
475
476 /* Insert the left and right keys, set the header information. */
477 h->linp[0] = h->upper = t->bt_psize - NRINTERNAL;
478 dest = (char *)h + h->upper;
479 WR_RINTERNAL(dest,
480 l->flags & P_RLEAF ? NEXTINDEX(l) : rec_total(l), l->pgno);
481
482 h->linp[1] = h->upper -= NRINTERNAL;
483 dest = (char *)h + h->upper;
484 WR_RINTERNAL(dest,
485 r->flags & P_RLEAF ? NEXTINDEX(r) : rec_total(r), r->pgno);
486
487 h->lower = BTDATAOFF + 2 * sizeof(indx_t);
488
489 /* Unpin the root page, set to recno internal page. */
490 h->flags &= ~P_TYPE;
491 h->flags |= P_RINTERNAL;
492 mpool_put(t->bt_mp, h, MPOOL_DIRTY);
493
494 return (RET_SUCCESS);
495 }
496
497 /*
498 * BT_BROOT -- Fix up the btree root page after it has been split.
499 *
500 * Parameters:
501 * t: tree
502 * h: root page
503 * l: left page
504 * r: right page
505 *
506 * Returns:
507 * RET_ERROR, RET_SUCCESS
508 */
509 static int
bt_broot(BTREE * t,PAGE * h,PAGE * l,PAGE * r)510 bt_broot(BTREE *t, PAGE *h, PAGE *l, PAGE *r)
511 {
512 BINTERNAL *bi;
513 BLEAF *bl;
514 u_int32_t nbytes;
515 char *dest;
516
517 /*
518 * If the root page was a leaf page, change it into an internal page.
519 * We copy the key we split on (but not the key's data, in the case of
520 * a leaf page) to the new root page.
521 *
522 * The btree comparison code guarantees that the left-most key on any
523 * level of the tree is never used, so it doesn't need to be filled in.
524 */
525 nbytes = NBINTERNAL(0);
526 h->linp[0] = h->upper = t->bt_psize - nbytes;
527 dest = (char *)h + h->upper;
528 WR_BINTERNAL(dest, 0, l->pgno, 0);
529
530 switch (h->flags & P_TYPE) {
531 case P_BLEAF:
532 bl = GETBLEAF(r, 0);
533 nbytes = NBINTERNAL(bl->ksize);
534 h->linp[1] = h->upper -= nbytes;
535 dest = (char *)h + h->upper;
536 WR_BINTERNAL(dest, bl->ksize, r->pgno, 0);
537 memmove(dest, bl->bytes, bl->ksize);
538
539 /*
540 * If the key is on an overflow page, mark the overflow chain
541 * so it isn't deleted when the leaf copy of the key is deleted.
542 */
543 if (bl->flags & P_BIGKEY &&
544 bt_preserve(t, *(pgno_t *)bl->bytes) == RET_ERROR)
545 return (RET_ERROR);
546 break;
547 case P_BINTERNAL:
548 bi = GETBINTERNAL(r, 0);
549 nbytes = NBINTERNAL(bi->ksize);
550 h->linp[1] = h->upper -= nbytes;
551 dest = (char *)h + h->upper;
552 memmove(dest, bi, nbytes);
553 ((BINTERNAL *)dest)->pgno = r->pgno;
554 break;
555 default:
556 abort();
557 }
558
559 /* There are two keys on the page. */
560 h->lower = BTDATAOFF + 2 * sizeof(indx_t);
561
562 /* Unpin the root page, set to btree internal page. */
563 h->flags &= ~P_TYPE;
564 h->flags |= P_BINTERNAL;
565 mpool_put(t->bt_mp, h, MPOOL_DIRTY);
566
567 return (RET_SUCCESS);
568 }
569
570 /*
571 * BT_PSPLIT -- Do the real work of splitting the page.
572 *
573 * Parameters:
574 * t: tree
575 * h: page to be split
576 * l: page to put lower half of data
577 * r: page to put upper half of data
578 * pskip: pointer to index to leave open
579 * ilen: insert length
580 *
581 * Returns:
582 * Pointer to page in which to insert.
583 */
584 static PAGE *
bt_psplit(BTREE * t,PAGE * h,PAGE * l,PAGE * r,indx_t * pskip,size_t ilen)585 bt_psplit(BTREE *t, PAGE *h, PAGE *l, PAGE *r, indx_t *pskip, size_t ilen)
586 {
587 BINTERNAL *bi;
588 BLEAF *bl;
589 CURSOR *c;
590 RLEAF *rl;
591 PAGE *rval;
592 void *src;
593 indx_t full, half, nxt, off, skip, top, used;
594 u_int32_t nbytes;
595 int bigkeycnt, isbigkey;
596
597 /*
598 * Split the data to the left and right pages. Leave the skip index
599 * open. Additionally, make some effort not to split on an overflow
600 * key. This makes internal page processing faster and can save
601 * space as overflow keys used by internal pages are never deleted.
602 */
603 bigkeycnt = 0;
604 skip = *pskip;
605 full = t->bt_psize - BTDATAOFF;
606 half = full / 2;
607 used = 0;
608 for (nxt = off = 0, top = NEXTINDEX(h); nxt < top; ++off) {
609 if (skip == off) {
610 nbytes = ilen;
611 isbigkey = 0; /* XXX: not really known. */
612 } else
613 switch (h->flags & P_TYPE) {
614 case P_BINTERNAL:
615 src = bi = GETBINTERNAL(h, nxt);
616 nbytes = NBINTERNAL(bi->ksize);
617 isbigkey = bi->flags & P_BIGKEY;
618 break;
619 case P_BLEAF:
620 src = bl = GETBLEAF(h, nxt);
621 nbytes = NBLEAF(bl);
622 isbigkey = bl->flags & P_BIGKEY;
623 break;
624 case P_RINTERNAL:
625 src = GETRINTERNAL(h, nxt);
626 nbytes = NRINTERNAL;
627 isbigkey = 0;
628 break;
629 case P_RLEAF:
630 src = rl = GETRLEAF(h, nxt);
631 nbytes = NRLEAF(rl);
632 isbigkey = 0;
633 break;
634 default:
635 abort();
636 }
637
638 /*
639 * If the key/data pairs are substantial fractions of the max
640 * possible size for the page, it's possible to get situations
641 * where we decide to try and copy too much onto the left page.
642 * Make sure that doesn't happen.
643 */
644 if ((skip <= off && used + nbytes + sizeof(indx_t) >= full) ||
645 nxt == top - 1) {
646 --off;
647 break;
648 }
649
650 /* Copy the key/data pair, if not the skipped index. */
651 if (skip != off) {
652 ++nxt;
653
654 l->linp[off] = l->upper -= nbytes;
655 memmove((char *)l + l->upper, src, nbytes);
656 }
657
658 used += nbytes + sizeof(indx_t);
659 if (used >= half) {
660 if (!isbigkey || bigkeycnt == 3)
661 break;
662 else
663 ++bigkeycnt;
664 }
665 }
666
667 /*
668 * Off is the last offset that's valid for the left page.
669 * Nxt is the first offset to be placed on the right page.
670 */
671 l->lower += (off + 1) * sizeof(indx_t);
672
673 /*
674 * If splitting the page that the cursor was on, the cursor has to be
675 * adjusted to point to the same record as before the split. If the
676 * cursor is at or past the skipped slot, the cursor is incremented by
677 * one. If the cursor is on the right page, it is decremented by the
678 * number of records split to the left page.
679 */
680 c = &t->bt_cursor;
681 if (F_ISSET(c, CURS_INIT) && c->pg.pgno == h->pgno) {
682 if (c->pg.index >= skip)
683 ++c->pg.index;
684 if (c->pg.index < nxt) /* Left page. */
685 c->pg.pgno = l->pgno;
686 else { /* Right page. */
687 c->pg.pgno = r->pgno;
688 c->pg.index -= nxt;
689 }
690 }
691
692 /*
693 * If the skipped index was on the left page, just return that page.
694 * Otherwise, adjust the skip index to reflect the new position on
695 * the right page.
696 */
697 if (skip <= off) {
698 skip = MAX_PAGE_OFFSET;
699 rval = l;
700 } else {
701 rval = r;
702 *pskip -= nxt;
703 }
704
705 for (off = 0; nxt < top; ++off) {
706 if (skip == nxt) {
707 ++off;
708 skip = MAX_PAGE_OFFSET;
709 }
710 switch (h->flags & P_TYPE) {
711 case P_BINTERNAL:
712 src = bi = GETBINTERNAL(h, nxt);
713 nbytes = NBINTERNAL(bi->ksize);
714 break;
715 case P_BLEAF:
716 src = bl = GETBLEAF(h, nxt);
717 nbytes = NBLEAF(bl);
718 break;
719 case P_RINTERNAL:
720 src = GETRINTERNAL(h, nxt);
721 nbytes = NRINTERNAL;
722 break;
723 case P_RLEAF:
724 src = rl = GETRLEAF(h, nxt);
725 nbytes = NRLEAF(rl);
726 break;
727 default:
728 abort();
729 }
730 ++nxt;
731 r->linp[off] = r->upper -= nbytes;
732 memmove((char *)r + r->upper, src, nbytes);
733 }
734 r->lower += off * sizeof(indx_t);
735
736 /* If the key is being appended to the page, adjust the index. */
737 if (skip == top)
738 r->lower += sizeof(indx_t);
739
740 return (rval);
741 }
742
743 /*
744 * BT_PRESERVE -- Mark a chain of pages as used by an internal node.
745 *
746 * Chains of indirect blocks pointed to by leaf nodes get reclaimed when the
747 * record that references them gets deleted. Chains pointed to by internal
748 * pages never get deleted. This routine marks a chain as pointed to by an
749 * internal page.
750 *
751 * Parameters:
752 * t: tree
753 * pg: page number of first page in the chain.
754 *
755 * Returns:
756 * RET_SUCCESS, RET_ERROR.
757 */
758 static int
bt_preserve(BTREE * t,pgno_t pg)759 bt_preserve(BTREE *t, pgno_t pg)
760 {
761 PAGE *h;
762
763 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
764 return (RET_ERROR);
765 h->flags |= P_PRESERVE;
766 mpool_put(t->bt_mp, h, MPOOL_DIRTY);
767 return (RET_SUCCESS);
768 }
769
770 /*
771 * REC_TOTAL -- Return the number of recno entries below a page.
772 *
773 * Parameters:
774 * h: page
775 *
776 * Returns:
777 * The number of recno entries below a page.
778 *
779 * XXX
780 * These values could be set by the bt_psplit routine. The problem is that the
781 * entry has to be popped off of the stack etc. or the values have to be passed
782 * all the way back to bt_split/bt_rroot and it's not very clean.
783 */
784 static recno_t
rec_total(PAGE * h)785 rec_total(PAGE *h)
786 {
787 recno_t recs;
788 indx_t nxt, top;
789
790 for (recs = 0, nxt = 0, top = NEXTINDEX(h); nxt < top; ++nxt)
791 recs += GETRINTERNAL(h, nxt)->nrecs;
792 return (recs);
793 }
794