1 /*- 2 * Copyright (c) 1990, 1993, 1994 3 * The Regents of the University of California. All rights reserved. 4 * 5 * This code is derived from software contributed to Berkeley by 6 * Mike Olson. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. All advertising materials mentioning features or use of this software 17 * must display the following acknowledgement: 18 * This product includes software developed by the University of 19 * California, Berkeley and its contributors. 20 * 4. Neither the name of the University nor the names of its contributors 21 * may be used to endorse or promote products derived from this software 22 * without specific prior written permission. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 34 * SUCH DAMAGE. 35 * 36 * @(#)bt_delete.c 8.13 (Berkeley) 7/28/94 37 * $DragonFly: src/lib/libc/db/btree/bt_delete.c,v 1.6 2005/03/16 06:19:09 joerg Exp $ 38 */ 39 40 #include <sys/types.h> 41 42 #include <errno.h> 43 #include <stdio.h> 44 #include <string.h> 45 46 #include <db.h> 47 #include "btree.h" 48 49 static int __bt_bdelete (BTREE *, const DBT *); 50 static int __bt_curdel (BTREE *, const DBT *, PAGE *, u_int); 51 static int __bt_pdelete (BTREE *, PAGE *); 52 static int __bt_relink (BTREE *, PAGE *); 53 static int __bt_stkacq (BTREE *, PAGE **, CURSOR *); 54 55 /* 56 * __bt_delete 57 * Delete the item(s) referenced by a key. 58 * 59 * Return RET_SPECIAL if the key is not found. 60 */ 61 int 62 __bt_delete(dbp, key, flags) 63 const DB *dbp; 64 const DBT *key; 65 u_int flags; 66 { 67 BTREE *t; 68 CURSOR *c; 69 PAGE *h; 70 int status; 71 72 t = dbp->internal; 73 74 /* Toss any page pinned across calls. */ 75 if (t->bt_pinned != NULL) { 76 mpool_put(t->bt_mp, t->bt_pinned, 0); 77 t->bt_pinned = NULL; 78 } 79 80 /* Check for change to a read-only tree. */ 81 if (F_ISSET(t, B_RDONLY)) { 82 errno = EPERM; 83 return (RET_ERROR); 84 } 85 86 switch (flags) { 87 case 0: 88 status = __bt_bdelete(t, key); 89 break; 90 case R_CURSOR: 91 /* 92 * If flags is R_CURSOR, delete the cursor. Must already 93 * have started a scan and not have already deleted it. 94 */ 95 c = &t->bt_cursor; 96 if (F_ISSET(c, CURS_INIT)) { 97 if (F_ISSET(c, CURS_ACQUIRE | CURS_AFTER | CURS_BEFORE)) 98 return (RET_SPECIAL); 99 if ((h = mpool_get(t->bt_mp, c->pg.pgno, 0)) == NULL) 100 return (RET_ERROR); 101 102 /* 103 * If the page is about to be emptied, we'll need to 104 * delete it, which means we have to acquire a stack. 105 */ 106 if (NEXTINDEX(h) == 1) 107 if (__bt_stkacq(t, &h, &t->bt_cursor)) 108 return (RET_ERROR); 109 110 status = __bt_dleaf(t, NULL, h, c->pg.index); 111 112 if (NEXTINDEX(h) == 0 && status == RET_SUCCESS) { 113 if (__bt_pdelete(t, h)) 114 return (RET_ERROR); 115 } else 116 mpool_put(t->bt_mp, 117 h, status == RET_SUCCESS ? MPOOL_DIRTY : 0); 118 break; 119 } 120 /* FALLTHROUGH */ 121 default: 122 errno = EINVAL; 123 return (RET_ERROR); 124 } 125 if (status == RET_SUCCESS) 126 F_SET(t, B_MODIFIED); 127 return (status); 128 } 129 130 /* 131 * __bt_stkacq -- 132 * Acquire a stack so we can delete a cursor entry. 133 * 134 * Parameters: 135 * t: tree 136 * hp: pointer to current, pinned PAGE pointer 137 * c: pointer to the cursor 138 * 139 * Returns: 140 * 0 on success, 1 on failure 141 */ 142 static int 143 __bt_stkacq(t, hp, c) 144 BTREE *t; 145 PAGE **hp; 146 CURSOR *c; 147 { 148 BINTERNAL *bi; 149 EPG *e; 150 EPGNO *parent; 151 PAGE *h; 152 indx_t idx = 0; 153 pgno_t pgno; 154 recno_t nextpg, prevpg; 155 int exact, level; 156 157 /* 158 * Find the first occurrence of the key in the tree. Toss the 159 * currently locked page so we don't hit an already-locked page. 160 */ 161 h = *hp; 162 mpool_put(t->bt_mp, h, 0); 163 if ((e = __bt_search(t, &c->key, &exact)) == NULL) 164 return (1); 165 h = e->page; 166 167 /* See if we got it in one shot. */ 168 if (h->pgno == c->pg.pgno) 169 goto ret; 170 171 /* 172 * Move right, looking for the page. At each move we have to move 173 * up the stack until we don't have to move to the next page. If 174 * we have to change pages at an internal level, we have to fix the 175 * stack back up. 176 */ 177 while (h->pgno != c->pg.pgno) { 178 if ((nextpg = h->nextpg) == P_INVALID) 179 break; 180 mpool_put(t->bt_mp, h, 0); 181 182 /* Move up the stack. */ 183 for (level = 0; (parent = BT_POP(t)) != NULL; ++level) { 184 /* Get the parent page. */ 185 if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL) 186 return (1); 187 188 /* Move to the next index. */ 189 if (parent->index != NEXTINDEX(h) - 1) { 190 idx = parent->index + 1; 191 BT_PUSH(t, h->pgno, idx); 192 break; 193 } 194 mpool_put(t->bt_mp, h, 0); 195 } 196 197 /* Restore the stack. */ 198 while (level--) { 199 /* Push the next level down onto the stack. */ 200 bi = GETBINTERNAL(h, idx); 201 pgno = bi->pgno; 202 BT_PUSH(t, pgno, 0); 203 204 /* Lose the currently pinned page. */ 205 mpool_put(t->bt_mp, h, 0); 206 207 /* Get the next level down. */ 208 if ((h = mpool_get(t->bt_mp, pgno, 0)) == NULL) 209 return (1); 210 idx = 0; 211 } 212 mpool_put(t->bt_mp, h, 0); 213 if ((h = mpool_get(t->bt_mp, nextpg, 0)) == NULL) 214 return (1); 215 } 216 217 if (h->pgno == c->pg.pgno) 218 goto ret; 219 220 /* Reacquire the original stack. */ 221 mpool_put(t->bt_mp, h, 0); 222 if ((e = __bt_search(t, &c->key, &exact)) == NULL) 223 return (1); 224 h = e->page; 225 226 /* 227 * Move left, looking for the page. At each move we have to move 228 * up the stack until we don't have to change pages to move to the 229 * next page. If we have to change pages at an internal level, we 230 * have to fix the stack back up. 231 */ 232 while (h->pgno != c->pg.pgno) { 233 if ((prevpg = h->prevpg) == P_INVALID) 234 break; 235 mpool_put(t->bt_mp, h, 0); 236 237 /* Move up the stack. */ 238 for (level = 0; (parent = BT_POP(t)) != NULL; ++level) { 239 /* Get the parent page. */ 240 if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL) 241 return (1); 242 243 /* Move to the next index. */ 244 if (parent->index != 0) { 245 idx = parent->index - 1; 246 BT_PUSH(t, h->pgno, idx); 247 break; 248 } 249 mpool_put(t->bt_mp, h, 0); 250 } 251 252 /* Restore the stack. */ 253 while (level--) { 254 /* Push the next level down onto the stack. */ 255 bi = GETBINTERNAL(h, idx); 256 pgno = bi->pgno; 257 258 /* Lose the currently pinned page. */ 259 mpool_put(t->bt_mp, h, 0); 260 261 /* Get the next level down. */ 262 if ((h = mpool_get(t->bt_mp, pgno, 0)) == NULL) 263 return (1); 264 265 idx = NEXTINDEX(h) - 1; 266 BT_PUSH(t, pgno, idx); 267 } 268 mpool_put(t->bt_mp, h, 0); 269 if ((h = mpool_get(t->bt_mp, prevpg, 0)) == NULL) 270 return (1); 271 } 272 273 274 ret: mpool_put(t->bt_mp, h, 0); 275 return ((*hp = mpool_get(t->bt_mp, c->pg.pgno, 0)) == NULL); 276 } 277 278 /* 279 * __bt_bdelete -- 280 * Delete all key/data pairs matching the specified key. 281 * 282 * Parameters: 283 * t: tree 284 * key: key to delete 285 * 286 * Returns: 287 * RET_ERROR, RET_SUCCESS and RET_SPECIAL if the key not found. 288 */ 289 static int 290 __bt_bdelete(t, key) 291 BTREE *t; 292 const DBT *key; 293 { 294 EPG *e; 295 PAGE *h; 296 int deleted, exact, redo; 297 298 deleted = 0; 299 300 /* Find any matching record; __bt_search pins the page. */ 301 loop: if ((e = __bt_search(t, key, &exact)) == NULL) 302 return (deleted ? RET_SUCCESS : RET_ERROR); 303 if (!exact) { 304 mpool_put(t->bt_mp, e->page, 0); 305 return (deleted ? RET_SUCCESS : RET_SPECIAL); 306 } 307 308 /* 309 * Delete forward, then delete backward, from the found key. If 310 * there are duplicates and we reach either side of the page, do 311 * the key search again, so that we get them all. 312 */ 313 redo = 0; 314 h = e->page; 315 do { 316 if (__bt_dleaf(t, key, h, e->index)) { 317 mpool_put(t->bt_mp, h, 0); 318 return (RET_ERROR); 319 } 320 if (F_ISSET(t, B_NODUPS)) { 321 if (NEXTINDEX(h) == 0) { 322 if (__bt_pdelete(t, h)) 323 return (RET_ERROR); 324 } else 325 mpool_put(t->bt_mp, h, MPOOL_DIRTY); 326 return (RET_SUCCESS); 327 } 328 deleted = 1; 329 } while (e->index < NEXTINDEX(h) && __bt_cmp(t, key, e) == 0); 330 331 /* Check for right-hand edge of the page. */ 332 if (e->index == NEXTINDEX(h)) 333 redo = 1; 334 335 /* Delete from the key to the beginning of the page. */ 336 while (e->index-- > 0) { 337 if (__bt_cmp(t, key, e) != 0) 338 break; 339 if (__bt_dleaf(t, key, h, e->index) == RET_ERROR) { 340 mpool_put(t->bt_mp, h, 0); 341 return (RET_ERROR); 342 } 343 if (e->index == 0) 344 redo = 1; 345 } 346 347 /* Check for an empty page. */ 348 if (NEXTINDEX(h) == 0) { 349 if (__bt_pdelete(t, h)) 350 return (RET_ERROR); 351 goto loop; 352 } 353 354 /* Put the page. */ 355 mpool_put(t->bt_mp, h, MPOOL_DIRTY); 356 357 if (redo) 358 goto loop; 359 return (RET_SUCCESS); 360 } 361 362 /* 363 * __bt_pdelete -- 364 * Delete a single page from the tree. 365 * 366 * Parameters: 367 * t: tree 368 * h: leaf page 369 * 370 * Returns: 371 * RET_SUCCESS, RET_ERROR. 372 * 373 * Side-effects: 374 * mpool_put's the page 375 */ 376 static int 377 __bt_pdelete(t, h) 378 BTREE *t; 379 PAGE *h; 380 { 381 BINTERNAL *bi; 382 PAGE *pg; 383 EPGNO *parent; 384 indx_t cnt, idx, *ip, offset; 385 u_int32_t nksize; 386 char *from; 387 388 /* 389 * Walk the parent page stack -- a LIFO stack of the pages that were 390 * traversed when we searched for the page where the delete occurred. 391 * Each stack entry is a page number and a page index offset. The 392 * offset is for the page traversed on the search. We've just deleted 393 * a page, so we have to delete the key from the parent page. 394 * 395 * If the delete from the parent page makes it empty, this process may 396 * continue all the way up the tree. We stop if we reach the root page 397 * (which is never deleted, it's just not worth the effort) or if the 398 * delete does not empty the page. 399 */ 400 while ((parent = BT_POP(t)) != NULL) { 401 /* Get the parent page. */ 402 if ((pg = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL) 403 return (RET_ERROR); 404 405 idx = parent->index; 406 bi = GETBINTERNAL(pg, idx); 407 408 /* Free any overflow pages. */ 409 if (bi->flags & P_BIGKEY && 410 __ovfl_delete(t, bi->bytes) == RET_ERROR) { 411 mpool_put(t->bt_mp, pg, 0); 412 return (RET_ERROR); 413 } 414 415 /* 416 * Free the parent if it has only the one key and it's not the 417 * root page. If it's the rootpage, turn it back into an empty 418 * leaf page. 419 */ 420 if (NEXTINDEX(pg) == 1) 421 if (pg->pgno == P_ROOT) { 422 pg->lower = BTDATAOFF; 423 pg->upper = t->bt_psize; 424 pg->flags = P_BLEAF; 425 } else { 426 if (__bt_relink(t, pg) || __bt_free(t, pg)) 427 return (RET_ERROR); 428 continue; 429 } 430 else { 431 /* Pack remaining key items at the end of the page. */ 432 nksize = NBINTERNAL(bi->ksize); 433 from = (char *)pg + pg->upper; 434 memmove(from + nksize, from, (char *)bi - from); 435 pg->upper += nksize; 436 437 /* Adjust indices' offsets, shift the indices down. */ 438 offset = pg->linp[idx]; 439 for (cnt = idx, ip = &pg->linp[0]; cnt--; ++ip) 440 if (ip[0] < offset) 441 ip[0] += nksize; 442 for (cnt = NEXTINDEX(pg) - idx; --cnt; ++ip) 443 ip[0] = ip[1] < offset ? ip[1] + nksize : ip[1]; 444 pg->lower -= sizeof(indx_t); 445 } 446 447 mpool_put(t->bt_mp, pg, MPOOL_DIRTY); 448 break; 449 } 450 451 /* Free the leaf page, as long as it wasn't the root. */ 452 if (h->pgno == P_ROOT) { 453 mpool_put(t->bt_mp, h, MPOOL_DIRTY); 454 return (RET_SUCCESS); 455 } 456 return (__bt_relink(t, h) || __bt_free(t, h)); 457 } 458 459 /* 460 * __bt_dleaf -- 461 * Delete a single record from a leaf page. 462 * 463 * Parameters: 464 * t: tree 465 * key: referenced key 466 * h: page 467 * idx: index on page to delete 468 * 469 * Returns: 470 * RET_SUCCESS, RET_ERROR. 471 */ 472 int 473 __bt_dleaf(BTREE *t, const DBT *key, PAGE *h, u_int idx) 474 { 475 BLEAF *bl; 476 indx_t cnt, *ip, offset; 477 u_int32_t nbytes; 478 void *to; 479 char *from; 480 481 /* If this record is referenced by the cursor, delete the cursor. */ 482 if (F_ISSET(&t->bt_cursor, CURS_INIT) && 483 !F_ISSET(&t->bt_cursor, CURS_ACQUIRE) && 484 t->bt_cursor.pg.pgno == h->pgno && t->bt_cursor.pg.index == idx && 485 __bt_curdel(t, key, h, idx)) 486 return (RET_ERROR); 487 488 /* If the entry uses overflow pages, make them available for reuse. */ 489 to = bl = GETBLEAF(h, idx); 490 if (bl->flags & P_BIGKEY && __ovfl_delete(t, bl->bytes) == RET_ERROR) 491 return (RET_ERROR); 492 if (bl->flags & P_BIGDATA && 493 __ovfl_delete(t, bl->bytes + bl->ksize) == RET_ERROR) 494 return (RET_ERROR); 495 496 /* Pack the remaining key/data items at the end of the page. */ 497 nbytes = NBLEAF(bl); 498 from = (char *)h + h->upper; 499 memmove(from + nbytes, from, (char *)to - from); 500 h->upper += nbytes; 501 502 /* Adjust the indices' offsets, shift the indices down. */ 503 offset = h->linp[idx]; 504 for (cnt = idx, ip = &h->linp[0]; cnt--; ++ip) 505 if (ip[0] < offset) 506 ip[0] += nbytes; 507 for (cnt = NEXTINDEX(h) - idx; --cnt; ++ip) 508 ip[0] = ip[1] < offset ? ip[1] + nbytes : ip[1]; 509 h->lower -= sizeof(indx_t); 510 511 /* If the cursor is on this page, adjust it as necessary. */ 512 if (F_ISSET(&t->bt_cursor, CURS_INIT) && 513 !F_ISSET(&t->bt_cursor, CURS_ACQUIRE) && 514 t->bt_cursor.pg.pgno == h->pgno && t->bt_cursor.pg.index > idx) 515 --t->bt_cursor.pg.index; 516 517 return (RET_SUCCESS); 518 } 519 520 /* 521 * __bt_curdel -- 522 * Delete the cursor. 523 * 524 * Parameters: 525 * t: tree 526 * key: referenced key (or NULL) 527 * h: page 528 * idx: index on page to delete 529 * 530 * Returns: 531 * RET_SUCCESS, RET_ERROR. 532 */ 533 static int 534 __bt_curdel(BTREE *t, const DBT *key, PAGE *h, u_int idx) 535 { 536 CURSOR *c; 537 EPG e; 538 PAGE *pg; 539 int curcopy, status; 540 541 /* 542 * If there are duplicates, move forward or backward to one. 543 * Otherwise, copy the key into the cursor area. 544 */ 545 c = &t->bt_cursor; 546 F_CLR(c, CURS_AFTER | CURS_BEFORE | CURS_ACQUIRE); 547 548 curcopy = 0; 549 if (!F_ISSET(t, B_NODUPS)) { 550 /* 551 * We're going to have to do comparisons. If we weren't 552 * provided a copy of the key, i.e. the user is deleting 553 * the current cursor position, get one. 554 */ 555 if (key == NULL) { 556 e.page = h; 557 e.index = idx; 558 if ((status = __bt_ret(t, &e, 559 &c->key, &c->key, NULL, NULL, 1)) != RET_SUCCESS) 560 return (status); 561 curcopy = 1; 562 key = &c->key; 563 } 564 /* Check previous key, if not at the beginning of the page. */ 565 if (idx > 0) { 566 e.page = h; 567 e.index = idx - 1; 568 if (__bt_cmp(t, key, &e) == 0) { 569 F_SET(c, CURS_BEFORE); 570 goto dup2; 571 } 572 } 573 /* Check next key, if not at the end of the page. */ 574 if (idx < NEXTINDEX(h) - 1) { 575 e.page = h; 576 e.index = idx + 1; 577 if (__bt_cmp(t, key, &e) == 0) { 578 F_SET(c, CURS_AFTER); 579 goto dup2; 580 } 581 } 582 /* Check previous key if at the beginning of the page. */ 583 if (idx == 0 && h->prevpg != P_INVALID) { 584 if ((pg = mpool_get(t->bt_mp, h->prevpg, 0)) == NULL) 585 return (RET_ERROR); 586 e.page = pg; 587 e.index = NEXTINDEX(pg) - 1; 588 if (__bt_cmp(t, key, &e) == 0) { 589 F_SET(c, CURS_BEFORE); 590 goto dup1; 591 } 592 mpool_put(t->bt_mp, pg, 0); 593 } 594 /* Check next key if at the end of the page. */ 595 if (idx == NEXTINDEX(h) - 1 && h->nextpg != P_INVALID) { 596 if ((pg = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL) 597 return (RET_ERROR); 598 e.page = pg; 599 e.index = 0; 600 if (__bt_cmp(t, key, &e) == 0) { 601 F_SET(c, CURS_AFTER); 602 dup1: mpool_put(t->bt_mp, pg, 0); 603 dup2: c->pg.pgno = e.page->pgno; 604 c->pg.index = e.index; 605 return (RET_SUCCESS); 606 } 607 mpool_put(t->bt_mp, pg, 0); 608 } 609 } 610 e.page = h; 611 e.index = idx; 612 if (curcopy || (status = 613 __bt_ret(t, &e, &c->key, &c->key, NULL, NULL, 1)) == RET_SUCCESS) { 614 F_SET(c, CURS_ACQUIRE); 615 return (RET_SUCCESS); 616 } 617 return (status); 618 } 619 620 /* 621 * __bt_relink -- 622 * Link around a deleted page. 623 * 624 * Parameters: 625 * t: tree 626 * h: page to be deleted 627 */ 628 static int 629 __bt_relink(t, h) 630 BTREE *t; 631 PAGE *h; 632 { 633 PAGE *pg; 634 635 if (h->nextpg != P_INVALID) { 636 if ((pg = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL) 637 return (RET_ERROR); 638 pg->prevpg = h->prevpg; 639 mpool_put(t->bt_mp, pg, MPOOL_DIRTY); 640 } 641 if (h->prevpg != P_INVALID) { 642 if ((pg = mpool_get(t->bt_mp, h->prevpg, 0)) == NULL) 643 return (RET_ERROR); 644 pg->nextpg = h->nextpg; 645 mpool_put(t->bt_mp, pg, MPOOL_DIRTY); 646 } 647 return (0); 648 } 649