1 /*- 2 * Copyright (c) 1990, 1993, 1994 3 * The Regents of the University of California. All rights reserved. 4 * 5 * This code is derived from software contributed to Berkeley by 6 * Mike Olson. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. All advertising materials mentioning features or use of this software 17 * must display the following acknowledgement: 18 * This product includes software developed by the University of 19 * California, Berkeley and its contributors. 20 * 4. Neither the name of the University nor the names of its contributors 21 * may be used to endorse or promote products derived from this software 22 * without specific prior written permission. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 34 * SUCH DAMAGE. 35 * 36 * @(#)bt_delete.c 8.13 (Berkeley) 7/28/94 37 * $DragonFly: src/lib/libc/db/btree/bt_delete.c,v 1.4 2003/11/12 20:21:22 eirikn Exp $ 38 */ 39 40 #include <sys/types.h> 41 42 #include <errno.h> 43 #include <stdio.h> 44 #include <string.h> 45 46 #include <db.h> 47 #include "btree.h" 48 49 static int __bt_bdelete (BTREE *, const DBT *); 50 static int __bt_curdel (BTREE *, const DBT *, PAGE *, u_int); 51 static int __bt_pdelete (BTREE *, PAGE *); 52 static int __bt_relink (BTREE *, PAGE *); 53 static int __bt_stkacq (BTREE *, PAGE **, CURSOR *); 54 55 /* 56 * __bt_delete 57 * Delete the item(s) referenced by a key. 58 * 59 * Return RET_SPECIAL if the key is not found. 60 */ 61 int 62 __bt_delete(dbp, key, flags) 63 const DB *dbp; 64 const DBT *key; 65 u_int flags; 66 { 67 BTREE *t; 68 CURSOR *c; 69 PAGE *h; 70 int status; 71 72 t = dbp->internal; 73 74 /* Toss any page pinned across calls. */ 75 if (t->bt_pinned != NULL) { 76 mpool_put(t->bt_mp, t->bt_pinned, 0); 77 t->bt_pinned = NULL; 78 } 79 80 /* Check for change to a read-only tree. */ 81 if (F_ISSET(t, B_RDONLY)) { 82 errno = EPERM; 83 return (RET_ERROR); 84 } 85 86 switch (flags) { 87 case 0: 88 status = __bt_bdelete(t, key); 89 break; 90 case R_CURSOR: 91 /* 92 * If flags is R_CURSOR, delete the cursor. Must already 93 * have started a scan and not have already deleted it. 94 */ 95 c = &t->bt_cursor; 96 if (F_ISSET(c, CURS_INIT)) { 97 if (F_ISSET(c, CURS_ACQUIRE | CURS_AFTER | CURS_BEFORE)) 98 return (RET_SPECIAL); 99 if ((h = mpool_get(t->bt_mp, c->pg.pgno, 0)) == NULL) 100 return (RET_ERROR); 101 102 /* 103 * If the page is about to be emptied, we'll need to 104 * delete it, which means we have to acquire a stack. 105 */ 106 if (NEXTINDEX(h) == 1) 107 if (__bt_stkacq(t, &h, &t->bt_cursor)) 108 return (RET_ERROR); 109 110 status = __bt_dleaf(t, NULL, h, c->pg.index); 111 112 if (NEXTINDEX(h) == 0 && status == RET_SUCCESS) { 113 if (__bt_pdelete(t, h)) 114 return (RET_ERROR); 115 } else 116 mpool_put(t->bt_mp, 117 h, status == RET_SUCCESS ? MPOOL_DIRTY : 0); 118 break; 119 } 120 /* FALLTHROUGH */ 121 default: 122 errno = EINVAL; 123 return (RET_ERROR); 124 } 125 if (status == RET_SUCCESS) 126 F_SET(t, B_MODIFIED); 127 return (status); 128 } 129 130 /* 131 * __bt_stkacq -- 132 * Acquire a stack so we can delete a cursor entry. 133 * 134 * Parameters: 135 * t: tree 136 * hp: pointer to current, pinned PAGE pointer 137 * c: pointer to the cursor 138 * 139 * Returns: 140 * 0 on success, 1 on failure 141 */ 142 static int 143 __bt_stkacq(t, hp, c) 144 BTREE *t; 145 PAGE **hp; 146 CURSOR *c; 147 { 148 BINTERNAL *bi; 149 EPG *e; 150 EPGNO *parent; 151 PAGE *h; 152 indx_t index; 153 pgno_t pgno; 154 recno_t nextpg, prevpg; 155 int exact, level; 156 157 /* 158 * Find the first occurrence of the key in the tree. Toss the 159 * currently locked page so we don't hit an already-locked page. 160 */ 161 h = *hp; 162 mpool_put(t->bt_mp, h, 0); 163 if ((e = __bt_search(t, &c->key, &exact)) == NULL) 164 return (1); 165 h = e->page; 166 167 /* See if we got it in one shot. */ 168 if (h->pgno == c->pg.pgno) 169 goto ret; 170 171 /* 172 * Move right, looking for the page. At each move we have to move 173 * up the stack until we don't have to move to the next page. If 174 * we have to change pages at an internal level, we have to fix the 175 * stack back up. 176 */ 177 while (h->pgno != c->pg.pgno) { 178 if ((nextpg = h->nextpg) == P_INVALID) 179 break; 180 mpool_put(t->bt_mp, h, 0); 181 182 /* Move up the stack. */ 183 for (level = 0; (parent = BT_POP(t)) != NULL; ++level) { 184 /* Get the parent page. */ 185 if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL) 186 return (1); 187 188 /* Move to the next index. */ 189 if (parent->index != NEXTINDEX(h) - 1) { 190 index = parent->index + 1; 191 BT_PUSH(t, h->pgno, index); 192 break; 193 } 194 mpool_put(t->bt_mp, h, 0); 195 } 196 197 /* Restore the stack. */ 198 while (level--) { 199 /* Push the next level down onto the stack. */ 200 bi = GETBINTERNAL(h, index); 201 pgno = bi->pgno; 202 BT_PUSH(t, pgno, 0); 203 204 /* Lose the currently pinned page. */ 205 mpool_put(t->bt_mp, h, 0); 206 207 /* Get the next level down. */ 208 if ((h = mpool_get(t->bt_mp, pgno, 0)) == NULL) 209 return (1); 210 index = 0; 211 } 212 mpool_put(t->bt_mp, h, 0); 213 if ((h = mpool_get(t->bt_mp, nextpg, 0)) == NULL) 214 return (1); 215 } 216 217 if (h->pgno == c->pg.pgno) 218 goto ret; 219 220 /* Reacquire the original stack. */ 221 mpool_put(t->bt_mp, h, 0); 222 if ((e = __bt_search(t, &c->key, &exact)) == NULL) 223 return (1); 224 h = e->page; 225 226 /* 227 * Move left, looking for the page. At each move we have to move 228 * up the stack until we don't have to change pages to move to the 229 * next page. If we have to change pages at an internal level, we 230 * have to fix the stack back up. 231 */ 232 while (h->pgno != c->pg.pgno) { 233 if ((prevpg = h->prevpg) == P_INVALID) 234 break; 235 mpool_put(t->bt_mp, h, 0); 236 237 /* Move up the stack. */ 238 for (level = 0; (parent = BT_POP(t)) != NULL; ++level) { 239 /* Get the parent page. */ 240 if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL) 241 return (1); 242 243 /* Move to the next index. */ 244 if (parent->index != 0) { 245 index = parent->index - 1; 246 BT_PUSH(t, h->pgno, index); 247 break; 248 } 249 mpool_put(t->bt_mp, h, 0); 250 } 251 252 /* Restore the stack. */ 253 while (level--) { 254 /* Push the next level down onto the stack. */ 255 bi = GETBINTERNAL(h, index); 256 pgno = bi->pgno; 257 258 /* Lose the currently pinned page. */ 259 mpool_put(t->bt_mp, h, 0); 260 261 /* Get the next level down. */ 262 if ((h = mpool_get(t->bt_mp, pgno, 0)) == NULL) 263 return (1); 264 265 index = NEXTINDEX(h) - 1; 266 BT_PUSH(t, pgno, index); 267 } 268 mpool_put(t->bt_mp, h, 0); 269 if ((h = mpool_get(t->bt_mp, prevpg, 0)) == NULL) 270 return (1); 271 } 272 273 274 ret: mpool_put(t->bt_mp, h, 0); 275 return ((*hp = mpool_get(t->bt_mp, c->pg.pgno, 0)) == NULL); 276 } 277 278 /* 279 * __bt_bdelete -- 280 * Delete all key/data pairs matching the specified key. 281 * 282 * Parameters: 283 * t: tree 284 * key: key to delete 285 * 286 * Returns: 287 * RET_ERROR, RET_SUCCESS and RET_SPECIAL if the key not found. 288 */ 289 static int 290 __bt_bdelete(t, key) 291 BTREE *t; 292 const DBT *key; 293 { 294 EPG *e; 295 PAGE *h; 296 int deleted, exact, redo; 297 298 deleted = 0; 299 300 /* Find any matching record; __bt_search pins the page. */ 301 loop: if ((e = __bt_search(t, key, &exact)) == NULL) 302 return (deleted ? RET_SUCCESS : RET_ERROR); 303 if (!exact) { 304 mpool_put(t->bt_mp, e->page, 0); 305 return (deleted ? RET_SUCCESS : RET_SPECIAL); 306 } 307 308 /* 309 * Delete forward, then delete backward, from the found key. If 310 * there are duplicates and we reach either side of the page, do 311 * the key search again, so that we get them all. 312 */ 313 redo = 0; 314 h = e->page; 315 do { 316 if (__bt_dleaf(t, key, h, e->index)) { 317 mpool_put(t->bt_mp, h, 0); 318 return (RET_ERROR); 319 } 320 if (F_ISSET(t, B_NODUPS)) { 321 if (NEXTINDEX(h) == 0) { 322 if (__bt_pdelete(t, h)) 323 return (RET_ERROR); 324 } else 325 mpool_put(t->bt_mp, h, MPOOL_DIRTY); 326 return (RET_SUCCESS); 327 } 328 deleted = 1; 329 } while (e->index < NEXTINDEX(h) && __bt_cmp(t, key, e) == 0); 330 331 /* Check for right-hand edge of the page. */ 332 if (e->index == NEXTINDEX(h)) 333 redo = 1; 334 335 /* Delete from the key to the beginning of the page. */ 336 while (e->index-- > 0) { 337 if (__bt_cmp(t, key, e) != 0) 338 break; 339 if (__bt_dleaf(t, key, h, e->index) == RET_ERROR) { 340 mpool_put(t->bt_mp, h, 0); 341 return (RET_ERROR); 342 } 343 if (e->index == 0) 344 redo = 1; 345 } 346 347 /* Check for an empty page. */ 348 if (NEXTINDEX(h) == 0) { 349 if (__bt_pdelete(t, h)) 350 return (RET_ERROR); 351 goto loop; 352 } 353 354 /* Put the page. */ 355 mpool_put(t->bt_mp, h, MPOOL_DIRTY); 356 357 if (redo) 358 goto loop; 359 return (RET_SUCCESS); 360 } 361 362 /* 363 * __bt_pdelete -- 364 * Delete a single page from the tree. 365 * 366 * Parameters: 367 * t: tree 368 * h: leaf page 369 * 370 * Returns: 371 * RET_SUCCESS, RET_ERROR. 372 * 373 * Side-effects: 374 * mpool_put's the page 375 */ 376 static int 377 __bt_pdelete(t, h) 378 BTREE *t; 379 PAGE *h; 380 { 381 BINTERNAL *bi; 382 PAGE *pg; 383 EPGNO *parent; 384 indx_t cnt, index, *ip, offset; 385 u_int32_t nksize; 386 char *from; 387 388 /* 389 * Walk the parent page stack -- a LIFO stack of the pages that were 390 * traversed when we searched for the page where the delete occurred. 391 * Each stack entry is a page number and a page index offset. The 392 * offset is for the page traversed on the search. We've just deleted 393 * a page, so we have to delete the key from the parent page. 394 * 395 * If the delete from the parent page makes it empty, this process may 396 * continue all the way up the tree. We stop if we reach the root page 397 * (which is never deleted, it's just not worth the effort) or if the 398 * delete does not empty the page. 399 */ 400 while ((parent = BT_POP(t)) != NULL) { 401 /* Get the parent page. */ 402 if ((pg = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL) 403 return (RET_ERROR); 404 405 index = parent->index; 406 bi = GETBINTERNAL(pg, index); 407 408 /* Free any overflow pages. */ 409 if (bi->flags & P_BIGKEY && 410 __ovfl_delete(t, bi->bytes) == RET_ERROR) { 411 mpool_put(t->bt_mp, pg, 0); 412 return (RET_ERROR); 413 } 414 415 /* 416 * Free the parent if it has only the one key and it's not the 417 * root page. If it's the rootpage, turn it back into an empty 418 * leaf page. 419 */ 420 if (NEXTINDEX(pg) == 1) 421 if (pg->pgno == P_ROOT) { 422 pg->lower = BTDATAOFF; 423 pg->upper = t->bt_psize; 424 pg->flags = P_BLEAF; 425 } else { 426 if (__bt_relink(t, pg) || __bt_free(t, pg)) 427 return (RET_ERROR); 428 continue; 429 } 430 else { 431 /* Pack remaining key items at the end of the page. */ 432 nksize = NBINTERNAL(bi->ksize); 433 from = (char *)pg + pg->upper; 434 memmove(from + nksize, from, (char *)bi - from); 435 pg->upper += nksize; 436 437 /* Adjust indices' offsets, shift the indices down. */ 438 offset = pg->linp[index]; 439 for (cnt = index, ip = &pg->linp[0]; cnt--; ++ip) 440 if (ip[0] < offset) 441 ip[0] += nksize; 442 for (cnt = NEXTINDEX(pg) - index; --cnt; ++ip) 443 ip[0] = ip[1] < offset ? ip[1] + nksize : ip[1]; 444 pg->lower -= sizeof(indx_t); 445 } 446 447 mpool_put(t->bt_mp, pg, MPOOL_DIRTY); 448 break; 449 } 450 451 /* Free the leaf page, as long as it wasn't the root. */ 452 if (h->pgno == P_ROOT) { 453 mpool_put(t->bt_mp, h, MPOOL_DIRTY); 454 return (RET_SUCCESS); 455 } 456 return (__bt_relink(t, h) || __bt_free(t, h)); 457 } 458 459 /* 460 * __bt_dleaf -- 461 * Delete a single record from a leaf page. 462 * 463 * Parameters: 464 * t: tree 465 * key: referenced key 466 * h: page 467 * index: index on page to delete 468 * 469 * Returns: 470 * RET_SUCCESS, RET_ERROR. 471 */ 472 int 473 __bt_dleaf(t, key, h, index) 474 BTREE *t; 475 const DBT *key; 476 PAGE *h; 477 u_int index; 478 { 479 BLEAF *bl; 480 indx_t cnt, *ip, offset; 481 u_int32_t nbytes; 482 void *to; 483 char *from; 484 485 /* If this record is referenced by the cursor, delete the cursor. */ 486 if (F_ISSET(&t->bt_cursor, CURS_INIT) && 487 !F_ISSET(&t->bt_cursor, CURS_ACQUIRE) && 488 t->bt_cursor.pg.pgno == h->pgno && t->bt_cursor.pg.index == index && 489 __bt_curdel(t, key, h, index)) 490 return (RET_ERROR); 491 492 /* If the entry uses overflow pages, make them available for reuse. */ 493 to = bl = GETBLEAF(h, index); 494 if (bl->flags & P_BIGKEY && __ovfl_delete(t, bl->bytes) == RET_ERROR) 495 return (RET_ERROR); 496 if (bl->flags & P_BIGDATA && 497 __ovfl_delete(t, bl->bytes + bl->ksize) == RET_ERROR) 498 return (RET_ERROR); 499 500 /* Pack the remaining key/data items at the end of the page. */ 501 nbytes = NBLEAF(bl); 502 from = (char *)h + h->upper; 503 memmove(from + nbytes, from, (char *)to - from); 504 h->upper += nbytes; 505 506 /* Adjust the indices' offsets, shift the indices down. */ 507 offset = h->linp[index]; 508 for (cnt = index, ip = &h->linp[0]; cnt--; ++ip) 509 if (ip[0] < offset) 510 ip[0] += nbytes; 511 for (cnt = NEXTINDEX(h) - index; --cnt; ++ip) 512 ip[0] = ip[1] < offset ? ip[1] + nbytes : ip[1]; 513 h->lower -= sizeof(indx_t); 514 515 /* If the cursor is on this page, adjust it as necessary. */ 516 if (F_ISSET(&t->bt_cursor, CURS_INIT) && 517 !F_ISSET(&t->bt_cursor, CURS_ACQUIRE) && 518 t->bt_cursor.pg.pgno == h->pgno && t->bt_cursor.pg.index > index) 519 --t->bt_cursor.pg.index; 520 521 return (RET_SUCCESS); 522 } 523 524 /* 525 * __bt_curdel -- 526 * Delete the cursor. 527 * 528 * Parameters: 529 * t: tree 530 * key: referenced key (or NULL) 531 * h: page 532 * index: index on page to delete 533 * 534 * Returns: 535 * RET_SUCCESS, RET_ERROR. 536 */ 537 static int 538 __bt_curdel(t, key, h, index) 539 BTREE *t; 540 const DBT *key; 541 PAGE *h; 542 u_int index; 543 { 544 CURSOR *c; 545 EPG e; 546 PAGE *pg; 547 int curcopy, status; 548 549 /* 550 * If there are duplicates, move forward or backward to one. 551 * Otherwise, copy the key into the cursor area. 552 */ 553 c = &t->bt_cursor; 554 F_CLR(c, CURS_AFTER | CURS_BEFORE | CURS_ACQUIRE); 555 556 curcopy = 0; 557 if (!F_ISSET(t, B_NODUPS)) { 558 /* 559 * We're going to have to do comparisons. If we weren't 560 * provided a copy of the key, i.e. the user is deleting 561 * the current cursor position, get one. 562 */ 563 if (key == NULL) { 564 e.page = h; 565 e.index = index; 566 if ((status = __bt_ret(t, &e, 567 &c->key, &c->key, NULL, NULL, 1)) != RET_SUCCESS) 568 return (status); 569 curcopy = 1; 570 key = &c->key; 571 } 572 /* Check previous key, if not at the beginning of the page. */ 573 if (index > 0) { 574 e.page = h; 575 e.index = index - 1; 576 if (__bt_cmp(t, key, &e) == 0) { 577 F_SET(c, CURS_BEFORE); 578 goto dup2; 579 } 580 } 581 /* Check next key, if not at the end of the page. */ 582 if (index < NEXTINDEX(h) - 1) { 583 e.page = h; 584 e.index = index + 1; 585 if (__bt_cmp(t, key, &e) == 0) { 586 F_SET(c, CURS_AFTER); 587 goto dup2; 588 } 589 } 590 /* Check previous key if at the beginning of the page. */ 591 if (index == 0 && h->prevpg != P_INVALID) { 592 if ((pg = mpool_get(t->bt_mp, h->prevpg, 0)) == NULL) 593 return (RET_ERROR); 594 e.page = pg; 595 e.index = NEXTINDEX(pg) - 1; 596 if (__bt_cmp(t, key, &e) == 0) { 597 F_SET(c, CURS_BEFORE); 598 goto dup1; 599 } 600 mpool_put(t->bt_mp, pg, 0); 601 } 602 /* Check next key if at the end of the page. */ 603 if (index == NEXTINDEX(h) - 1 && h->nextpg != P_INVALID) { 604 if ((pg = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL) 605 return (RET_ERROR); 606 e.page = pg; 607 e.index = 0; 608 if (__bt_cmp(t, key, &e) == 0) { 609 F_SET(c, CURS_AFTER); 610 dup1: mpool_put(t->bt_mp, pg, 0); 611 dup2: c->pg.pgno = e.page->pgno; 612 c->pg.index = e.index; 613 return (RET_SUCCESS); 614 } 615 mpool_put(t->bt_mp, pg, 0); 616 } 617 } 618 e.page = h; 619 e.index = index; 620 if (curcopy || (status = 621 __bt_ret(t, &e, &c->key, &c->key, NULL, NULL, 1)) == RET_SUCCESS) { 622 F_SET(c, CURS_ACQUIRE); 623 return (RET_SUCCESS); 624 } 625 return (status); 626 } 627 628 /* 629 * __bt_relink -- 630 * Link around a deleted page. 631 * 632 * Parameters: 633 * t: tree 634 * h: page to be deleted 635 */ 636 static int 637 __bt_relink(t, h) 638 BTREE *t; 639 PAGE *h; 640 { 641 PAGE *pg; 642 643 if (h->nextpg != P_INVALID) { 644 if ((pg = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL) 645 return (RET_ERROR); 646 pg->prevpg = h->prevpg; 647 mpool_put(t->bt_mp, pg, MPOOL_DIRTY); 648 } 649 if (h->prevpg != P_INVALID) { 650 if ((pg = mpool_get(t->bt_mp, h->prevpg, 0)) == NULL) 651 return (RET_ERROR); 652 pg->nextpg = h->nextpg; 653 mpool_put(t->bt_mp, pg, MPOOL_DIRTY); 654 } 655 return (0); 656 } 657