1 /*- 2 * Copyright (c) 1990 The Regents of the University of California. 3 * All rights reserved. 4 * 5 * This code is derived from software contributed to Berkeley by 6 * Mike Olson. 7 * 8 * %sccs.include.redist.c% 9 */ 10 11 #if defined(LIBC_SCCS) && !defined(lint) 12 static char sccsid[] = "@(#)bt_split.c 5.1 (Berkeley) 01/23/91"; 13 #endif /* LIBC_SCCS and not lint */ 14 15 #include <sys/types.h> 16 #include <db.h> 17 #include "btree.h" 18 19 /* 20 * _BT_SPLIT -- Split a page into two pages. 21 * 22 * Splits are caused by insertions, and propogate up the tree in 23 * the usual way. The root page is always page 1 in the file on 24 * disk, so root splits are handled specially. On entry to this 25 * routine, t->bt_curpage is the page to be split. 26 * 27 * Parameters: 28 * t -- btree in which to do split. 29 * 30 * Returns: 31 * RET_SUCCESS, RET_ERROR. 32 * 33 * Side Effects: 34 * Changes the notion of the current page. 35 */ 36 37 int 38 _bt_split(t) 39 BTREE_P t; 40 { 41 BTHEADER *h; 42 BTHEADER *left, *right; 43 pgno_t nextpgno, parent; 44 int nbytes, len; 45 IDATUM *id; 46 DATUM *d; 47 char *src; 48 IDATUM *new; 49 pgno_t oldchain; 50 u_char flags; 51 52 h = (BTHEADER *) t->bt_curpage; 53 54 /* split root page specially, since it must remain page 1 */ 55 if (h->h_pgno == P_ROOT) { 56 return (_bt_splitroot(t)); 57 } 58 59 /* 60 * This is a little complicated. We go to some trouble to 61 * figure out which of the three possible cases -- in-memory tree, 62 * disk tree (no cache), and disk tree (cache) -- we have, in order 63 * to avoid unnecessary copying. If we have a disk cache, then we 64 * have to do some extra copying, though, since the cache code 65 * manages buffers externally to this code. 66 */ 67 68 if (ISDISK(t) && ISCACHE(t)) { 69 if ((left = (BTHEADER *) malloc((unsigned) t->bt_psize)) 70 == (BTHEADER *) NULL) 71 return (RET_ERROR); 72 left->h_pgno = left->h_prevpg = left->h_nextpg = P_NONE; 73 left->h_flags = t->bt_curpage->h_flags; 74 left->h_lower = (index_t) 75 (((char *) &(left->h_linp[0])) - ((char *) left)); 76 left->h_upper = t->bt_psize; 77 78 } else { 79 if ((left = _bt_allocpg(t)) == (BTHEADER *) NULL) 80 return (RET_ERROR); 81 } 82 left->h_pgno = h->h_pgno; 83 84 if ((right = _bt_allocpg(t)) == (BTHEADER *) NULL) 85 return (RET_ERROR); 86 right->h_pgno = ++(t->bt_npages); 87 88 /* now do the split */ 89 if (_bt_dopsplit(t, left, right) == RET_ERROR) 90 return (RET_ERROR); 91 92 right->h_prevpg = left->h_pgno; 93 nextpgno = right->h_nextpg = h->h_nextpg; 94 left->h_nextpg = right->h_pgno; 95 left->h_prevpg = h->h_prevpg; 96 97 /* okay, now use the left half of the page as the new page */ 98 if (ISDISK(t) && ISCACHE(t)) { 99 (void) bcopy((char *) left, (char *) t->bt_curpage, 100 (int) t->bt_psize); 101 (void) free ((char *) left); 102 left = t->bt_curpage; 103 } else { 104 (void) free((char *) t->bt_curpage); 105 t->bt_curpage = left; 106 } 107 108 /* 109 * Write the new pages out. We need them to stay where they are 110 * until we're done updating the parent pages. 111 */ 112 113 if (_bt_write(t, left, NORELEASE) == RET_ERROR) 114 return (RET_ERROR); 115 if (_bt_write(t, right, NORELEASE) == RET_ERROR) 116 return (RET_ERROR); 117 118 /* update 'prev' pointer of old neighbor of left */ 119 if (nextpgno != P_NONE) { 120 if (_bt_getpage(t, nextpgno) == RET_ERROR) 121 return (RET_ERROR); 122 h = t->bt_curpage; 123 h->h_prevpg = right->h_pgno; 124 h->h_flags |= F_DIRTY; 125 } 126 127 if ((parent = _bt_pop(t)) != P_NONE) { 128 if (right->h_flags & F_LEAF) { 129 d = (DATUM *) GETDATUM(right, 0); 130 len = d->d_ksize; 131 if (d->d_flags & D_BIGKEY) { 132 bcopy(&(d->d_bytes[0]), 133 (char *) &oldchain, 134 sizeof(oldchain)); 135 if (_bt_markchain(t, oldchain) == RET_ERROR) 136 return (RET_ERROR); 137 src = (char *) &oldchain; 138 flags = D_BIGKEY; 139 } else { 140 src = (char *) &(d->d_bytes[0]); 141 flags = 0; 142 } 143 } else { 144 id = (IDATUM *) GETDATUM(right, 0); 145 len = id->i_size; 146 flags = id->i_flags; 147 src = (char *) &(id->i_bytes[0]); 148 } 149 nbytes = len + (sizeof(IDATUM) - sizeof(char)); 150 new = (IDATUM *) malloc((unsigned) nbytes); 151 if (new == (IDATUM *) NULL) 152 return (RET_ERROR); 153 new->i_size = len; 154 new->i_pgno = right->h_pgno; 155 new->i_flags = flags; 156 if (len > 0) 157 (void) bcopy(src, (char *) &(new->i_bytes[0]), len); 158 159 nbytes = LONGALIGN(nbytes) + sizeof(index_t); 160 if (_bt_getpage(t, parent) == RET_ERROR) 161 return (RET_ERROR); 162 163 h = t->bt_curpage; 164 165 /* 166 * Split the parent if we need to, then reposition the 167 * tree's current page pointer for the new datum. 168 */ 169 if ((h->h_upper - h->h_lower) < nbytes) { 170 if (_bt_split(t) == RET_ERROR) 171 return (RET_ERROR); 172 if (_bt_reposition(t, new, parent, right->h_prevpg) 173 == RET_ERROR) 174 return (RET_ERROR); 175 } 176 177 /* okay, now insert the new idatum */ 178 if (_bt_inserti(t, new, right->h_prevpg) == RET_ERROR) 179 return (RET_ERROR); 180 } 181 182 /* 183 * Okay, split is done; don't need right page stapled down anymore. 184 * The page we call 'left' above is the new version of the old 185 * (split) page, so we can't release it. 186 */ 187 188 if (_bt_release(t, right) == RET_ERROR) 189 return (RET_ERROR); 190 if (ISDISK(t) && !ISCACHE(t)) 191 (void) free((char *) right); 192 193 return (RET_SUCCESS); 194 } 195 196 /* 197 * _BT_REPOSITION -- Reposition the current page pointer of a btree. 198 * 199 * After splitting a node in the tree in order to make room for 200 * an insertion, we need to figure out which page after the split 201 * should get the item we want to insert. This routine positions 202 * the tree's current page pointer appropriately. 203 * 204 * Parameters: 205 * t -- tree to position 206 * new -- the item we want to insert 207 * parent -- parent of the node that we just split 208 * prev -- page number of item directly to the left of 209 * new's position in the tree. 210 * 211 * Returns: 212 * RET_SUCCESS, RET_ERROR. 213 * 214 * Side Effects: 215 * None. 216 */ 217 218 int 219 _bt_reposition(t, new, parent, prev) 220 BTREE_P t; 221 IDATUM *new; 222 pgno_t parent; 223 pgno_t prev; 224 { 225 index_t i, next; 226 IDATUM *idx; 227 228 if (parent == P_ROOT) { 229 230 /* 231 * If we just split the root page, then there are guaranteed 232 * to be exactly two IDATUMs on it. Look at both of them 233 * to see if they point to the page that we want. 234 */ 235 236 if (_bt_getpage(t, parent) == RET_ERROR) 237 return (RET_ERROR); 238 239 next = NEXTINDEX(t->bt_curpage); 240 for (i = 0; i < next; i++) { 241 idx = (IDATUM *) GETDATUM(t->bt_curpage, i); 242 if (_bt_getpage(t, idx->i_pgno) == RET_ERROR) 243 return (RET_ERROR); 244 if (_bt_isonpage(t, new, prev) == RET_SUCCESS) 245 return (RET_SUCCESS); 246 if (_bt_getpage(t, parent) == RET_ERROR) 247 return (RET_ERROR); 248 } 249 } else { 250 251 /* 252 * Get the parent page -- which is where the new item would 253 * have gone -- and figure out whether the new item now goes 254 * on the parent, or the page immediately to the right of 255 * the parent. 256 */ 257 258 if (_bt_getpage(t, parent) == RET_ERROR) 259 return (RET_ERROR); 260 if (_bt_isonpage(t, new, prev) == RET_SUCCESS) 261 return (RET_SUCCESS); 262 if (_bt_getpage(t, t->bt_curpage->h_nextpg) == RET_ERROR) 263 return (RET_ERROR); 264 if (_bt_isonpage(t, new, prev) == RET_SUCCESS) 265 return (RET_SUCCESS); 266 } 267 return (RET_ERROR); 268 } 269 270 /* 271 * _BT_ISONPAGE -- Is the IDATUM for a given page number on the current page? 272 * 273 * This routine is used by _bt_reposition to decide whether the current 274 * page is the correct one on which to insert a new item. 275 * 276 * Parameters: 277 * t -- tree to check 278 * new -- the item that will be inserted (used for binary search) 279 * prev -- page number of page whose IDATUM is immediately to 280 * the left of new's position in the tree. 281 * 282 * Returns: 283 * RET_SUCCESS, RET_ERROR (corresponding to TRUE, FALSE). 284 */ 285 286 int 287 _bt_isonpage(t, new, prev) 288 BTREE_P t; 289 IDATUM *new; 290 pgno_t prev; 291 { 292 BTHEADER *h = (BTHEADER *) t->bt_curpage; 293 index_t i, next; 294 IDATUM *idx; 295 296 i = _bt_binsrch(t, &(new->i_bytes[0])); 297 while (i != 0 && _bt_cmp(t, &(new->i_bytes[0]), i) == 0) 298 --i; 299 next = NEXTINDEX(h); 300 idx = (IDATUM *) GETDATUM(h, i); 301 while (i < next && idx->i_pgno != prev) { 302 i++; 303 idx = (IDATUM *) GETDATUM(h, i); 304 } 305 if (idx->i_pgno == prev) 306 return (RET_SUCCESS); 307 else 308 return (RET_ERROR); 309 } 310 311 /* 312 * _BT_SPLITROOT -- Split the root of a btree. 313 * 314 * The root page for a btree is always page one. This means that in 315 * order to split the root, we need to do extra work. 316 * 317 * Parameters: 318 * t -- tree to split 319 * 320 * Returns: 321 * RET_SUCCESS, RET_ERROR. 322 * 323 * Side Effects: 324 * Splits root upward in the usual way, adding two new pages 325 * to the tree (rather than just one, as in usual splits). 326 */ 327 328 int 329 _bt_splitroot(t) 330 BTREE_P t; 331 { 332 BTHEADER *h = t->bt_curpage; 333 BTHEADER *left, *right; 334 IDATUM *id; 335 BTHEADER *where_h; 336 char *src, *dest; 337 int len, nbytes; 338 u_long was_leaf; 339 pgno_t oldchain; 340 u_char flags; 341 342 /* get two new pages for the split */ 343 if ((left = _bt_allocpg(t)) == (BTHEADER *) NULL) 344 return (RET_ERROR); 345 left->h_pgno = ++(t->bt_npages); 346 if ((right = _bt_allocpg(t)) == (BTHEADER *) NULL) 347 return (RET_ERROR); 348 right->h_pgno = ++(t->bt_npages); 349 350 /* do the split */ 351 if (_bt_dopsplit(t, left, right) == RET_ERROR) 352 return (RET_ERROR); 353 354 /* connect the new pages correctly */ 355 right->h_prevpg = left->h_pgno; 356 left->h_nextpg = right->h_pgno; 357 358 /* 359 * Write the child pages out now. We need them to remain 360 * where they are until we finish updating parent pages, 361 * however. 362 */ 363 364 if (_bt_write(t, left, NORELEASE) == RET_ERROR) 365 return (RET_ERROR); 366 if (_bt_write(t, right, NORELEASE) == RET_ERROR) 367 return (RET_ERROR); 368 369 /* now change the root page into an internal page */ 370 was_leaf = (h->h_flags & F_LEAF); 371 h->h_flags &= ~F_LEAF; 372 h->h_lower = (index_t) (((char *) (&(h->h_linp[0]))) - ((char *) h)); 373 h->h_upper = (index_t) t->bt_psize; 374 (void) bzero((char *) &(h->h_linp[0]), (int) (h->h_upper - h->h_lower)); 375 376 /* put two new keys on root page */ 377 where_h = left; 378 while (where_h) { 379 DATUM *data; 380 IDATUM *idata; 381 382 if (was_leaf) { 383 data = (DATUM *) GETDATUM(where_h, 0); 384 385 if (where_h == left) { 386 len = 0; /* first key in tree is null */ 387 } else { 388 if (data->d_flags & D_BIGKEY) { 389 bcopy(&(data->d_bytes[0]), 390 (char *) &oldchain, 391 sizeof(oldchain)); 392 if (_bt_markchain(t, oldchain) == RET_ERROR) 393 return (RET_ERROR); 394 src = (char *) &oldchain; 395 flags = D_BIGKEY; 396 } else { 397 src = (char *) &(data->d_bytes[0]); 398 flags = 0; 399 } 400 len = data->d_ksize; 401 } 402 } else { 403 idata = (IDATUM *) GETDATUM(where_h, 0); 404 len = idata->i_size; 405 flags = idata->i_flags; 406 src = &(idata->i_bytes[0]); 407 } 408 dest = ((char *) h) + h->h_upper; 409 nbytes = len + (sizeof (IDATUM) - sizeof(char)); 410 dest -= LONGALIGN(nbytes); 411 id = (IDATUM *) dest; 412 id->i_size = len; 413 id->i_pgno = where_h->h_pgno; 414 id->i_flags = flags; 415 if (len > 0) 416 (void) bcopy((char *) src, (char *) &(id->i_bytes[0]), len); 417 dest -= ((int) h); 418 h->h_linp[NEXTINDEX(h)] = (index_t) dest; 419 h->h_upper = (index_t) dest; 420 h->h_lower += sizeof(index_t); 421 422 /* next page */ 423 if (where_h == left) 424 where_h = right; 425 else 426 where_h = (BTHEADER *) NULL; 427 } 428 429 if (_bt_release(t, left) == RET_ERROR) 430 return (RET_ERROR); 431 if (_bt_release(t, right) == RET_ERROR) 432 return (RET_ERROR); 433 434 /* 435 * That's it, split is done. If we're doing a non-cached disk 436 * btree, we can free up the pages we allocated, as they're on 437 * disk, now. 438 */ 439 440 if (ISDISK(t) && !ISCACHE(t)) { 441 (void) free ((char *) left); 442 (void) free ((char *) right); 443 } 444 445 h->h_flags |= F_DIRTY; 446 447 return (RET_SUCCESS); 448 } 449 450 /* 451 * _BT_DOPSPLIT -- Do the work of splitting a page 452 * 453 * This routine takes two page pointers and splits the data on the 454 * current page of the btree between them. 455 * 456 * We do a lot of work here to handle duplicate keys on a page; we 457 * have to place these keys carefully to guarantee that later searches 458 * will find them correctly. See comments in the code below for details. 459 * 460 * Parameters: 461 * t -- tree to split 462 * left -- pointer to page to get left half of the data 463 * right -- pointer to page to get right half of the data 464 * 465 * Returns: 466 * None. 467 */ 468 469 int 470 _bt_dopsplit(t, left, right) 471 BTREE_P t; 472 BTHEADER *left; 473 BTHEADER *right; 474 { 475 BTHEADER *h = t->bt_curpage; 476 size_t psize; 477 char *where; 478 BTHEADER *where_h; 479 index_t where_i; 480 int nbytes, dsize, fixedsize, freespc; 481 index_t i; 482 index_t save_lower, save_upper, save_i; 483 index_t switch_i; 484 char *save_key; 485 DATUM *d; 486 CURSOR *c; 487 index_t top; 488 int free_save; 489 pgno_t chain; 490 int ignore; 491 492 /* 493 * Our strategy is to put half the bytes on each page. We figure 494 * out how many bytes we have total, and then move items until 495 * the last item moved put at least 50% of the data on the left 496 * page. 497 */ 498 save_key = (char *) NULL; 499 psize = (int) t->bt_psize; 500 where = ((char *) left) + psize; 501 where_h = left; 502 where_i = 0; 503 nbytes = psize - (int) ((char *) &(h->h_linp[0]) - ((char *) h)); 504 freespc = nbytes; 505 506 top = NEXTINDEX(h); 507 if (h->h_flags & F_LEAF) 508 fixedsize = (sizeof(DATUM) - sizeof(char)); 509 else 510 fixedsize = (sizeof(IDATUM) - sizeof(char)); 511 512 save_key = (char *) NULL; 513 514 /* move data */ 515 for (i = 0; i < top; i++) { 516 517 /* 518 * Internal and leaf pages have different layouts for 519 * data items, but in both cases the first entry in the 520 * data item is a size_t. 521 */ 522 d = (DATUM *) GETDATUM(h,i); 523 if (h->h_flags & F_LEAF) { 524 dsize = d->d_ksize + d->d_dsize + fixedsize; 525 } else { 526 dsize = d->d_ksize + fixedsize; 527 } 528 529 /* 530 * If a page contains duplicate keys, we have to be 531 * careful about splits. A sequence of duplicate keys 532 * may not begin in the middle of one page and end in 533 * the middle of another; it must begin on a page boundary, 534 * in order for searches on the internal nodes to work 535 * correctly. 536 */ 537 if (where_h == left) { 538 if (save_key == (char *) NULL) { 539 if (h->h_flags & F_LEAF) { 540 if (d->d_flags & D_BIGKEY) { 541 free_save = TRUE; 542 bcopy(&(d->d_bytes[0]), 543 (char *) &chain, 544 sizeof(chain)); 545 if (_bt_getbig(t, chain, 546 &save_key, 547 &ignore) 548 == RET_ERROR) 549 return (RET_ERROR); 550 } else { 551 free_save = FALSE; 552 save_key = (char *) &(d->d_bytes[0]); 553 } 554 } else { 555 IDATUM *id = (IDATUM *) d; 556 557 if (id->i_flags & D_BIGKEY) { 558 free_save = TRUE; 559 bcopy(&(id->i_bytes[0]), 560 (char *) &chain, 561 sizeof(chain)); 562 if (_bt_getbig(t, chain, 563 &save_key, 564 &ignore) 565 == RET_ERROR) 566 return (RET_ERROR); 567 } else { 568 free_save = FALSE; 569 save_key = (char *) 570 &(id->i_bytes[0]); 571 } 572 } 573 save_i = 0; 574 save_lower = where_h->h_lower; 575 save_upper = where_h->h_upper; 576 } else { 577 if (_bt_cmp(t, save_key, i) != 0) { 578 save_lower = where_h->h_lower; 579 save_upper = where_h->h_upper; 580 save_i = i; 581 } 582 if (h->h_flags & F_LEAF) { 583 if (free_save) 584 (void) free(save_key); 585 if (d->d_flags & D_BIGKEY) { 586 free_save = TRUE; 587 bcopy(&(d->d_bytes[0]), 588 (char *) &chain, 589 sizeof(chain)); 590 if (_bt_getbig(t, chain, 591 &save_key, 592 &ignore) 593 == RET_ERROR) 594 return (RET_ERROR); 595 } else { 596 free_save = FALSE; 597 save_key = (char *) &(d->d_bytes[0]); 598 } 599 } else { 600 IDATUM *id = (IDATUM *) d; 601 602 if (id->i_flags & D_BIGKEY) { 603 free_save = TRUE; 604 bcopy(&(id->i_bytes[0]), 605 (char *) &chain, 606 sizeof(chain)); 607 if (_bt_getbig(t, chain, 608 &save_key, 609 &ignore) 610 == RET_ERROR) 611 return (RET_ERROR); 612 } else { 613 free_save = FALSE; 614 save_key = (char *) 615 &(id->i_bytes[0]); 616 } 617 } 618 } 619 } 620 621 /* copy data and update page state */ 622 where -= LONGALIGN(dsize); 623 (void) bcopy((char *) d, (char *) where, dsize); 624 where_h->h_upper = where_h->h_linp[where_i] = 625 (index_t) (where - (int) where_h); 626 where_h->h_lower += sizeof(index_t); 627 where_i++; 628 629 /* if we've moved half, switch to the right-hand page */ 630 nbytes -= LONGALIGN(dsize) + sizeof(index_t); 631 if ((freespc - nbytes) > nbytes) { 632 nbytes = 2 * freespc; 633 634 /* identical keys go on the same page */ 635 if (save_i > 0) { 636 /* i gets incremented at loop bottom... */ 637 i = save_i - 1; 638 where_h->h_lower = save_lower; 639 where_h->h_upper = save_upper; 640 } 641 where = ((char *) right) + psize; 642 where_h = right; 643 switch_i = where_i; 644 where_i = 0; 645 } 646 } 647 648 /* 649 * If there was an active scan on the database, and we just 650 * split the page that the cursor was on, we may need to 651 * adjust the cursor to point to the same entry as before the 652 * split. 653 */ 654 655 c = &(t->bt_cursor); 656 if ((t->bt_flags & BTF_SEQINIT) 657 && (c->c_pgno == h->h_pgno) 658 && (c->c_index >= switch_i)) { 659 c->c_pgno = where_h->h_pgno; 660 c->c_index -= where_i; 661 } 662 return (RET_SUCCESS); 663 } 664