1 /*- 2 * Copyright (c) 1990 The Regents of the University of California. 3 * All rights reserved. 4 * 5 * This code is derived from software contributed to Berkeley by 6 * Mike Olson. 7 * 8 * %sccs.include.redist.c% 9 */ 10 11 #if defined(LIBC_SCCS) && !defined(lint) 12 static char sccsid[] = "@(#)bt_split.c 5.2 (Berkeley) 02/22/91"; 13 #endif /* LIBC_SCCS and not lint */ 14 15 #include <sys/types.h> 16 #include <db.h> 17 #include <stdlib.h> 18 #include <string.h> 19 #include "btree.h" 20 21 /* 22 * _BT_SPLIT -- Split a page into two pages. 23 * 24 * Splits are caused by insertions, and propogate up the tree in 25 * the usual way. The root page is always page 1 in the file on 26 * disk, so root splits are handled specially. On entry to this 27 * routine, t->bt_curpage is the page to be split. 28 * 29 * Parameters: 30 * t -- btree in which to do split. 31 * 32 * Returns: 33 * RET_SUCCESS, RET_ERROR. 34 * 35 * Side Effects: 36 * Changes the notion of the current page. 37 */ 38 39 int 40 _bt_split(t) 41 BTREE_P t; 42 { 43 BTHEADER *h; 44 BTHEADER *left, *right; 45 pgno_t nextpgno, parent; 46 int nbytes, len; 47 IDATUM *id; 48 DATUM *d; 49 char *src; 50 IDATUM *new; 51 pgno_t oldchain; 52 u_char flags; 53 54 h = (BTHEADER *) t->bt_curpage; 55 56 /* split root page specially, since it must remain page 1 */ 57 if (h->h_pgno == P_ROOT) { 58 return (_bt_splitroot(t)); 59 } 60 61 /* 62 * This is a little complicated. We go to some trouble to 63 * figure out which of the three possible cases -- in-memory tree, 64 * disk tree (no cache), and disk tree (cache) -- we have, in order 65 * to avoid unnecessary copying. If we have a disk cache, then we 66 * have to do some extra copying, though, since the cache code 67 * manages buffers externally to this code. 68 */ 69 70 if (ISDISK(t) && ISCACHE(t)) { 71 if ((left = (BTHEADER *) malloc((unsigned) t->bt_psize)) 72 == (BTHEADER *) NULL) 73 return (RET_ERROR); 74 left->h_pgno = left->h_prevpg = left->h_nextpg = P_NONE; 75 left->h_flags = t->bt_curpage->h_flags; 76 left->h_lower = (index_t) 77 (((char *) &(left->h_linp[0])) - ((char *) left)); 78 left->h_upper = t->bt_psize; 79 80 } else { 81 if ((left = _bt_allocpg(t)) == (BTHEADER *) NULL) 82 return (RET_ERROR); 83 } 84 left->h_pgno = h->h_pgno; 85 86 if ((right = _bt_allocpg(t)) == (BTHEADER *) NULL) 87 return (RET_ERROR); 88 right->h_pgno = ++(t->bt_npages); 89 90 /* now do the split */ 91 if (_bt_dopsplit(t, left, right) == RET_ERROR) 92 return (RET_ERROR); 93 94 right->h_prevpg = left->h_pgno; 95 nextpgno = right->h_nextpg = h->h_nextpg; 96 left->h_nextpg = right->h_pgno; 97 left->h_prevpg = h->h_prevpg; 98 99 /* okay, now use the left half of the page as the new page */ 100 if (ISDISK(t) && ISCACHE(t)) { 101 (void) bcopy((char *) left, (char *) t->bt_curpage, 102 (int) t->bt_psize); 103 (void) free ((char *) left); 104 left = t->bt_curpage; 105 } else { 106 (void) free((char *) t->bt_curpage); 107 t->bt_curpage = left; 108 } 109 110 /* 111 * Write the new pages out. We need them to stay where they are 112 * until we're done updating the parent pages. 113 */ 114 115 if (_bt_write(t, left, NORELEASE) == RET_ERROR) 116 return (RET_ERROR); 117 if (_bt_write(t, right, NORELEASE) == RET_ERROR) 118 return (RET_ERROR); 119 120 /* update 'prev' pointer of old neighbor of left */ 121 if (nextpgno != P_NONE) { 122 if (_bt_getpage(t, nextpgno) == RET_ERROR) 123 return (RET_ERROR); 124 h = t->bt_curpage; 125 h->h_prevpg = right->h_pgno; 126 h->h_flags |= F_DIRTY; 127 } 128 129 if ((parent = _bt_pop(t)) != P_NONE) { 130 if (right->h_flags & F_LEAF) { 131 d = (DATUM *) GETDATUM(right, 0); 132 len = d->d_ksize; 133 if (d->d_flags & D_BIGKEY) { 134 bcopy(&(d->d_bytes[0]), 135 (char *) &oldchain, 136 sizeof(oldchain)); 137 if (_bt_markchain(t, oldchain) == RET_ERROR) 138 return (RET_ERROR); 139 src = (char *) &oldchain; 140 flags = D_BIGKEY; 141 } else { 142 src = (char *) &(d->d_bytes[0]); 143 flags = 0; 144 } 145 } else { 146 id = (IDATUM *) GETDATUM(right, 0); 147 len = id->i_size; 148 flags = id->i_flags; 149 src = (char *) &(id->i_bytes[0]); 150 } 151 nbytes = len + (sizeof(IDATUM) - sizeof(char)); 152 new = (IDATUM *) malloc((unsigned) nbytes); 153 if (new == (IDATUM *) NULL) 154 return (RET_ERROR); 155 new->i_size = len; 156 new->i_pgno = right->h_pgno; 157 new->i_flags = flags; 158 if (len > 0) 159 (void) bcopy(src, (char *) &(new->i_bytes[0]), len); 160 161 nbytes = LONGALIGN(nbytes) + sizeof(index_t); 162 if (_bt_getpage(t, parent) == RET_ERROR) 163 return (RET_ERROR); 164 165 h = t->bt_curpage; 166 167 /* 168 * Split the parent if we need to, then reposition the 169 * tree's current page pointer for the new datum. 170 */ 171 if ((h->h_upper - h->h_lower) < nbytes) { 172 if (_bt_split(t) == RET_ERROR) 173 return (RET_ERROR); 174 if (_bt_reposition(t, new, parent, right->h_prevpg) 175 == RET_ERROR) 176 return (RET_ERROR); 177 } 178 179 /* okay, now insert the new idatum */ 180 if (_bt_inserti(t, new, right->h_prevpg) == RET_ERROR) 181 return (RET_ERROR); 182 } 183 184 /* 185 * Okay, split is done; don't need right page stapled down anymore. 186 * The page we call 'left' above is the new version of the old 187 * (split) page, so we can't release it. 188 */ 189 190 if (_bt_release(t, right) == RET_ERROR) 191 return (RET_ERROR); 192 if (ISDISK(t) && !ISCACHE(t)) 193 (void) free((char *) right); 194 195 return (RET_SUCCESS); 196 } 197 198 /* 199 * _BT_REPOSITION -- Reposition the current page pointer of a btree. 200 * 201 * After splitting a node in the tree in order to make room for 202 * an insertion, we need to figure out which page after the split 203 * should get the item we want to insert. This routine positions 204 * the tree's current page pointer appropriately. 205 * 206 * Parameters: 207 * t -- tree to position 208 * new -- the item we want to insert 209 * parent -- parent of the node that we just split 210 * prev -- page number of item directly to the left of 211 * new's position in the tree. 212 * 213 * Returns: 214 * RET_SUCCESS, RET_ERROR. 215 * 216 * Side Effects: 217 * None. 218 */ 219 220 int 221 _bt_reposition(t, new, parent, prev) 222 BTREE_P t; 223 IDATUM *new; 224 pgno_t parent; 225 pgno_t prev; 226 { 227 index_t i, next; 228 IDATUM *idx; 229 230 if (parent == P_ROOT) { 231 232 /* 233 * If we just split the root page, then there are guaranteed 234 * to be exactly two IDATUMs on it. Look at both of them 235 * to see if they point to the page that we want. 236 */ 237 238 if (_bt_getpage(t, parent) == RET_ERROR) 239 return (RET_ERROR); 240 241 next = NEXTINDEX(t->bt_curpage); 242 for (i = 0; i < next; i++) { 243 idx = (IDATUM *) GETDATUM(t->bt_curpage, i); 244 if (_bt_getpage(t, idx->i_pgno) == RET_ERROR) 245 return (RET_ERROR); 246 if (_bt_isonpage(t, new, prev) == RET_SUCCESS) 247 return (RET_SUCCESS); 248 if (_bt_getpage(t, parent) == RET_ERROR) 249 return (RET_ERROR); 250 } 251 } else { 252 253 /* 254 * Get the parent page -- which is where the new item would 255 * have gone -- and figure out whether the new item now goes 256 * on the parent, or the page immediately to the right of 257 * the parent. 258 */ 259 260 if (_bt_getpage(t, parent) == RET_ERROR) 261 return (RET_ERROR); 262 if (_bt_isonpage(t, new, prev) == RET_SUCCESS) 263 return (RET_SUCCESS); 264 if (_bt_getpage(t, t->bt_curpage->h_nextpg) == RET_ERROR) 265 return (RET_ERROR); 266 if (_bt_isonpage(t, new, prev) == RET_SUCCESS) 267 return (RET_SUCCESS); 268 } 269 return (RET_ERROR); 270 } 271 272 /* 273 * _BT_ISONPAGE -- Is the IDATUM for a given page number on the current page? 274 * 275 * This routine is used by _bt_reposition to decide whether the current 276 * page is the correct one on which to insert a new item. 277 * 278 * Parameters: 279 * t -- tree to check 280 * new -- the item that will be inserted (used for binary search) 281 * prev -- page number of page whose IDATUM is immediately to 282 * the left of new's position in the tree. 283 * 284 * Returns: 285 * RET_SUCCESS, RET_ERROR (corresponding to TRUE, FALSE). 286 */ 287 288 int 289 _bt_isonpage(t, new, prev) 290 BTREE_P t; 291 IDATUM *new; 292 pgno_t prev; 293 { 294 BTHEADER *h = (BTHEADER *) t->bt_curpage; 295 index_t i, next; 296 IDATUM *idx; 297 298 i = _bt_binsrch(t, &(new->i_bytes[0])); 299 while (i != 0 && _bt_cmp(t, &(new->i_bytes[0]), i) == 0) 300 --i; 301 next = NEXTINDEX(h); 302 idx = (IDATUM *) GETDATUM(h, i); 303 while (i < next && idx->i_pgno != prev) { 304 i++; 305 idx = (IDATUM *) GETDATUM(h, i); 306 } 307 if (idx->i_pgno == prev) 308 return (RET_SUCCESS); 309 else 310 return (RET_ERROR); 311 } 312 313 /* 314 * _BT_SPLITROOT -- Split the root of a btree. 315 * 316 * The root page for a btree is always page one. This means that in 317 * order to split the root, we need to do extra work. 318 * 319 * Parameters: 320 * t -- tree to split 321 * 322 * Returns: 323 * RET_SUCCESS, RET_ERROR. 324 * 325 * Side Effects: 326 * Splits root upward in the usual way, adding two new pages 327 * to the tree (rather than just one, as in usual splits). 328 */ 329 330 int 331 _bt_splitroot(t) 332 BTREE_P t; 333 { 334 BTHEADER *h = t->bt_curpage; 335 BTHEADER *left, *right; 336 IDATUM *id; 337 BTHEADER *where_h; 338 char *src, *dest; 339 int len, nbytes; 340 u_long was_leaf; 341 pgno_t oldchain; 342 u_char flags; 343 344 /* get two new pages for the split */ 345 if ((left = _bt_allocpg(t)) == (BTHEADER *) NULL) 346 return (RET_ERROR); 347 left->h_pgno = ++(t->bt_npages); 348 if ((right = _bt_allocpg(t)) == (BTHEADER *) NULL) 349 return (RET_ERROR); 350 right->h_pgno = ++(t->bt_npages); 351 352 /* do the split */ 353 if (_bt_dopsplit(t, left, right) == RET_ERROR) 354 return (RET_ERROR); 355 356 /* connect the new pages correctly */ 357 right->h_prevpg = left->h_pgno; 358 left->h_nextpg = right->h_pgno; 359 360 /* 361 * Write the child pages out now. We need them to remain 362 * where they are until we finish updating parent pages, 363 * however. 364 */ 365 366 if (_bt_write(t, left, NORELEASE) == RET_ERROR) 367 return (RET_ERROR); 368 if (_bt_write(t, right, NORELEASE) == RET_ERROR) 369 return (RET_ERROR); 370 371 /* now change the root page into an internal page */ 372 was_leaf = (h->h_flags & F_LEAF); 373 h->h_flags &= ~F_LEAF; 374 h->h_lower = (index_t) (((char *) (&(h->h_linp[0]))) - ((char *) h)); 375 h->h_upper = (index_t) t->bt_psize; 376 (void) bzero((char *) &(h->h_linp[0]), (int) (h->h_upper - h->h_lower)); 377 378 /* put two new keys on root page */ 379 where_h = left; 380 while (where_h) { 381 DATUM *data; 382 IDATUM *idata; 383 384 if (was_leaf) { 385 data = (DATUM *) GETDATUM(where_h, 0); 386 387 if (where_h == left) { 388 len = 0; /* first key in tree is null */ 389 } else { 390 if (data->d_flags & D_BIGKEY) { 391 bcopy(&(data->d_bytes[0]), 392 (char *) &oldchain, 393 sizeof(oldchain)); 394 if (_bt_markchain(t, oldchain) == RET_ERROR) 395 return (RET_ERROR); 396 src = (char *) &oldchain; 397 flags = D_BIGKEY; 398 } else { 399 src = (char *) &(data->d_bytes[0]); 400 flags = 0; 401 } 402 len = data->d_ksize; 403 } 404 } else { 405 idata = (IDATUM *) GETDATUM(where_h, 0); 406 len = idata->i_size; 407 flags = idata->i_flags; 408 src = &(idata->i_bytes[0]); 409 } 410 dest = ((char *) h) + h->h_upper; 411 nbytes = len + (sizeof (IDATUM) - sizeof(char)); 412 dest -= LONGALIGN(nbytes); 413 id = (IDATUM *) dest; 414 id->i_size = len; 415 id->i_pgno = where_h->h_pgno; 416 id->i_flags = flags; 417 if (len > 0) 418 (void) bcopy((char *) src, (char *) &(id->i_bytes[0]), len); 419 dest -= ((int) h); 420 h->h_linp[NEXTINDEX(h)] = (index_t) dest; 421 h->h_upper = (index_t) dest; 422 h->h_lower += sizeof(index_t); 423 424 /* next page */ 425 if (where_h == left) 426 where_h = right; 427 else 428 where_h = (BTHEADER *) NULL; 429 } 430 431 if (_bt_release(t, left) == RET_ERROR) 432 return (RET_ERROR); 433 if (_bt_release(t, right) == RET_ERROR) 434 return (RET_ERROR); 435 436 /* 437 * That's it, split is done. If we're doing a non-cached disk 438 * btree, we can free up the pages we allocated, as they're on 439 * disk, now. 440 */ 441 442 if (ISDISK(t) && !ISCACHE(t)) { 443 (void) free ((char *) left); 444 (void) free ((char *) right); 445 } 446 447 h->h_flags |= F_DIRTY; 448 449 return (RET_SUCCESS); 450 } 451 452 /* 453 * _BT_DOPSPLIT -- Do the work of splitting a page 454 * 455 * This routine takes two page pointers and splits the data on the 456 * current page of the btree between them. 457 * 458 * We do a lot of work here to handle duplicate keys on a page; we 459 * have to place these keys carefully to guarantee that later searches 460 * will find them correctly. See comments in the code below for details. 461 * 462 * Parameters: 463 * t -- tree to split 464 * left -- pointer to page to get left half of the data 465 * right -- pointer to page to get right half of the data 466 * 467 * Returns: 468 * None. 469 */ 470 471 int 472 _bt_dopsplit(t, left, right) 473 BTREE_P t; 474 BTHEADER *left; 475 BTHEADER *right; 476 { 477 BTHEADER *h = t->bt_curpage; 478 size_t psize; 479 char *where; 480 BTHEADER *where_h; 481 index_t where_i; 482 int nbytes, dsize, fixedsize, freespc; 483 index_t i; 484 index_t save_lower, save_upper, save_i; 485 index_t switch_i; 486 char *save_key; 487 DATUM *d; 488 CURSOR *c; 489 index_t top; 490 int free_save; 491 pgno_t chain; 492 int ignore; 493 494 /* 495 * Our strategy is to put half the bytes on each page. We figure 496 * out how many bytes we have total, and then move items until 497 * the last item moved put at least 50% of the data on the left 498 * page. 499 */ 500 save_key = (char *) NULL; 501 psize = (int) t->bt_psize; 502 where = ((char *) left) + psize; 503 where_h = left; 504 where_i = 0; 505 nbytes = psize - (int) ((char *) &(h->h_linp[0]) - ((char *) h)); 506 freespc = nbytes; 507 508 top = NEXTINDEX(h); 509 if (h->h_flags & F_LEAF) 510 fixedsize = (sizeof(DATUM) - sizeof(char)); 511 else 512 fixedsize = (sizeof(IDATUM) - sizeof(char)); 513 514 save_key = (char *) NULL; 515 516 /* move data */ 517 for (i = 0; i < top; i++) { 518 519 /* 520 * Internal and leaf pages have different layouts for 521 * data items, but in both cases the first entry in the 522 * data item is a size_t. 523 */ 524 d = (DATUM *) GETDATUM(h,i); 525 if (h->h_flags & F_LEAF) { 526 dsize = d->d_ksize + d->d_dsize + fixedsize; 527 } else { 528 dsize = d->d_ksize + fixedsize; 529 } 530 531 /* 532 * If a page contains duplicate keys, we have to be 533 * careful about splits. A sequence of duplicate keys 534 * may not begin in the middle of one page and end in 535 * the middle of another; it must begin on a page boundary, 536 * in order for searches on the internal nodes to work 537 * correctly. 538 */ 539 if (where_h == left) { 540 if (save_key == (char *) NULL) { 541 if (h->h_flags & F_LEAF) { 542 if (d->d_flags & D_BIGKEY) { 543 free_save = TRUE; 544 bcopy(&(d->d_bytes[0]), 545 (char *) &chain, 546 sizeof(chain)); 547 if (_bt_getbig(t, chain, 548 &save_key, 549 &ignore) 550 == RET_ERROR) 551 return (RET_ERROR); 552 } else { 553 free_save = FALSE; 554 save_key = (char *) &(d->d_bytes[0]); 555 } 556 } else { 557 IDATUM *id = (IDATUM *) d; 558 559 if (id->i_flags & D_BIGKEY) { 560 free_save = TRUE; 561 bcopy(&(id->i_bytes[0]), 562 (char *) &chain, 563 sizeof(chain)); 564 if (_bt_getbig(t, chain, 565 &save_key, 566 &ignore) 567 == RET_ERROR) 568 return (RET_ERROR); 569 } else { 570 free_save = FALSE; 571 save_key = (char *) 572 &(id->i_bytes[0]); 573 } 574 } 575 save_i = 0; 576 save_lower = where_h->h_lower; 577 save_upper = where_h->h_upper; 578 } else { 579 if (_bt_cmp(t, save_key, i) != 0) { 580 save_lower = where_h->h_lower; 581 save_upper = where_h->h_upper; 582 save_i = i; 583 } 584 if (h->h_flags & F_LEAF) { 585 if (free_save) 586 (void) free(save_key); 587 if (d->d_flags & D_BIGKEY) { 588 free_save = TRUE; 589 bcopy(&(d->d_bytes[0]), 590 (char *) &chain, 591 sizeof(chain)); 592 if (_bt_getbig(t, chain, 593 &save_key, 594 &ignore) 595 == RET_ERROR) 596 return (RET_ERROR); 597 } else { 598 free_save = FALSE; 599 save_key = (char *) &(d->d_bytes[0]); 600 } 601 } else { 602 IDATUM *id = (IDATUM *) d; 603 604 if (id->i_flags & D_BIGKEY) { 605 free_save = TRUE; 606 bcopy(&(id->i_bytes[0]), 607 (char *) &chain, 608 sizeof(chain)); 609 if (_bt_getbig(t, chain, 610 &save_key, 611 &ignore) 612 == RET_ERROR) 613 return (RET_ERROR); 614 } else { 615 free_save = FALSE; 616 save_key = (char *) 617 &(id->i_bytes[0]); 618 } 619 } 620 } 621 } 622 623 /* copy data and update page state */ 624 where -= LONGALIGN(dsize); 625 (void) bcopy((char *) d, (char *) where, dsize); 626 where_h->h_upper = where_h->h_linp[where_i] = 627 (index_t) (where - (int) where_h); 628 where_h->h_lower += sizeof(index_t); 629 where_i++; 630 631 /* if we've moved half, switch to the right-hand page */ 632 nbytes -= LONGALIGN(dsize) + sizeof(index_t); 633 if ((freespc - nbytes) > nbytes) { 634 nbytes = 2 * freespc; 635 636 /* identical keys go on the same page */ 637 if (save_i > 0) { 638 /* i gets incremented at loop bottom... */ 639 i = save_i - 1; 640 where_h->h_lower = save_lower; 641 where_h->h_upper = save_upper; 642 } 643 where = ((char *) right) + psize; 644 where_h = right; 645 switch_i = where_i; 646 where_i = 0; 647 } 648 } 649 650 /* 651 * If there was an active scan on the database, and we just 652 * split the page that the cursor was on, we may need to 653 * adjust the cursor to point to the same entry as before the 654 * split. 655 */ 656 657 c = &(t->bt_cursor); 658 if ((t->bt_flags & BTF_SEQINIT) 659 && (c->c_pgno == h->h_pgno) 660 && (c->c_index >= switch_i)) { 661 c->c_pgno = where_h->h_pgno; 662 c->c_index -= where_i; 663 } 664 return (RET_SUCCESS); 665 } 666