1 /* 2 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 3 * Use is subject to license terms. 4 */ 5 6 /* 7 * Copyright (C) 2002 by the Massachusetts Institute of Technology. 8 * All rights reserved. 9 * 10 * Export of this software from the United States of America may 11 * require a specific license from the United States Government. 12 * It is the responsibility of any person or organization contemplating 13 * export to obtain such a license before exporting. 14 * 15 * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and 16 * distribute this software and its documentation for any purpose and 17 * without fee is hereby granted, provided that the above copyright 18 * notice appear in all copies and that both that copyright notice and 19 * this permission notice appear in supporting documentation, and that 20 * the name of M.I.T. not be used in advertising or publicity pertaining 21 * to distribution of the software without specific, written prior 22 * permission. Furthermore if you modify this software you must label 23 * your software as modified software and not distribute it in such a 24 * fashion that it might be confused with the original M.I.T. software. 25 * M.I.T. makes no representations about the suitability of 26 * this software for any purpose. It is provided "as is" without express 27 * or implied warranty. 28 */ 29 30 /*- 31 * Copyright (c) 1990, 1993, 1994 32 * The Regents of the University of California. All rights reserved. 33 * 34 * This code is derived from software contributed to Berkeley by 35 * Mike Olson. 36 * 37 * Redistribution and use in source and binary forms, with or without 38 * modification, are permitted provided that the following conditions 39 * are met: 40 * 1. Redistributions of source code must retain the above copyright 41 * notice, this list of conditions and the following disclaimer. 42 * 2. Redistributions in binary form must reproduce the above copyright 43 * notice, this list of conditions and the following disclaimer in the 44 * documentation and/or other materials provided with the distribution. 45 * 3. All advertising materials mentioning features or use of this software 46 * must display the following acknowledgement: 47 * This product includes software developed by the University of 48 * California, Berkeley and its contributors. 49 * 4. Neither the name of the University nor the names of its contributors 50 * may be used to endorse or promote products derived from this software 51 * without specific prior written permission. 52 * 53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 63 * SUCH DAMAGE. 64 */ 65 66 #if defined(LIBC_SCCS) && !defined(lint) 67 static char sccsid[] = "@(#)bt_seq.c 8.9 (Berkeley) 6/20/95"; 68 #endif /* LIBC_SCCS and not lint */ 69 70 #include <sys/types.h> 71 72 #include <errno.h> 73 #include <stddef.h> 74 #include <stdio.h> 75 #include <stdlib.h> 76 #include <string.h> 77 78 #include "db-int.h" 79 #include "btree.h" 80 81 static int __bt_first __P((BTREE *, const DBT *, EPG *, int *)); 82 static int __bt_seqadv __P((BTREE *, EPG *, int)); 83 static int __bt_seqset __P((BTREE *, EPG *, DBT *, int)); 84 85 /* 86 * Sequential scan support. 87 * 88 * The tree can be scanned sequentially, starting from either end of the 89 * tree or from any specific key. A scan request before any scanning is 90 * done is initialized as starting from the least node. 91 */ 92 93 /* 94 * __bt_seq -- 95 * Btree sequential scan interface. 96 * 97 * Parameters: 98 * dbp: pointer to access method 99 * key: key for positioning and return value 100 * data: data return value 101 * flags: R_CURSOR, R_FIRST, R_LAST, R_NEXT, R_PREV. 102 * 103 * Returns: 104 * RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key. 105 */ 106 int 107 __bt_seq(dbp, key, data, flags) 108 const DB *dbp; 109 DBT *key, *data; 110 u_int flags; 111 { 112 BTREE *t; 113 EPG e; 114 int status; 115 116 t = dbp->internal; 117 118 /* Toss any page pinned across calls. */ 119 if (t->bt_pinned != NULL) { 120 mpool_put(t->bt_mp, t->bt_pinned, 0); 121 t->bt_pinned = NULL; 122 } 123 124 /* 125 * If scan unitialized as yet, or starting at a specific record, set 126 * the scan to a specific key. Both __bt_seqset and __bt_seqadv pin 127 * the page the cursor references if they're successful. 128 */ 129 switch (flags) { 130 case R_NEXT: 131 case R_PREV: 132 if (F_ISSET(&t->bt_cursor, CURS_INIT)) { 133 status = __bt_seqadv(t, &e, flags); 134 break; 135 } 136 /* FALLTHROUGH */ 137 case R_FIRST: 138 case R_LAST: 139 case R_CURSOR: 140 status = __bt_seqset(t, &e, key, flags); 141 break; 142 default: 143 errno = EINVAL; 144 return (RET_ERROR); 145 } 146 147 if (status == RET_SUCCESS) { 148 __bt_setcur(t, e.page->pgno, e.index); 149 150 status = 151 __bt_ret(t, &e, key, &t->bt_rkey, data, &t->bt_rdata, 0); 152 153 /* 154 * If the user is doing concurrent access, we copied the 155 * key/data, toss the page. 156 */ 157 if (F_ISSET(t, B_DB_LOCK)) 158 mpool_put(t->bt_mp, e.page, 0); 159 else 160 t->bt_pinned = e.page; 161 } 162 return (status); 163 } 164 165 /* 166 * __bt_seqset -- 167 * Set the sequential scan to a specific key. 168 * 169 * Parameters: 170 * t: tree 171 * ep: storage for returned key 172 * key: key for initial scan position 173 * flags: R_CURSOR, R_FIRST, R_LAST, R_NEXT, R_PREV 174 * 175 * Side effects: 176 * Pins the page the cursor references. 177 * 178 * Returns: 179 * RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key. 180 */ 181 static int 182 __bt_seqset(t, ep, key, flags) 183 BTREE *t; 184 EPG *ep; 185 DBT *key; 186 int flags; 187 { 188 PAGE *h; 189 db_pgno_t pg; 190 int exact; 191 192 /* 193 * Find the first, last or specific key in the tree and point the 194 * cursor at it. The cursor may not be moved until a new key has 195 * been found. 196 */ 197 switch (flags) { 198 case R_CURSOR: /* Keyed scan. */ 199 /* 200 * Find the first instance of the key or the smallest key 201 * which is greater than or equal to the specified key. 202 */ 203 if (key->data == NULL || key->size == 0) { 204 errno = EINVAL; 205 return (RET_ERROR); 206 } 207 return (__bt_first(t, key, ep, &exact)); 208 case R_FIRST: /* First record. */ 209 case R_NEXT: 210 /* Walk down the left-hand side of the tree. */ 211 for (pg = P_ROOT;;) { 212 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) 213 return (RET_ERROR); 214 215 /* Check for an empty tree. */ 216 if (NEXTINDEX(h) == 0) { 217 mpool_put(t->bt_mp, h, 0); 218 return (RET_SPECIAL); 219 } 220 221 if (h->flags & (P_BLEAF | P_RLEAF)) 222 break; 223 pg = GETBINTERNAL(h, 0)->pgno; 224 mpool_put(t->bt_mp, h, 0); 225 } 226 ep->page = h; 227 ep->index = 0; 228 break; 229 case R_LAST: /* Last record. */ 230 case R_PREV: 231 /* Walk down the right-hand side of the tree. */ 232 for (pg = P_ROOT;;) { 233 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) 234 return (RET_ERROR); 235 236 /* Check for an empty tree. */ 237 if (NEXTINDEX(h) == 0) { 238 mpool_put(t->bt_mp, h, 0); 239 return (RET_SPECIAL); 240 } 241 242 if (h->flags & (P_BLEAF | P_RLEAF)) 243 break; 244 pg = GETBINTERNAL(h, NEXTINDEX(h) - 1)->pgno; 245 mpool_put(t->bt_mp, h, 0); 246 } 247 248 ep->page = h; 249 ep->index = NEXTINDEX(h) - 1; 250 break; 251 } 252 return (RET_SUCCESS); 253 } 254 255 /* 256 * __bt_seqadvance -- 257 * Advance the sequential scan. 258 * 259 * Parameters: 260 * t: tree 261 * flags: R_NEXT, R_PREV 262 * 263 * Side effects: 264 * Pins the page the new key/data record is on. 265 * 266 * Returns: 267 * RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key. 268 */ 269 static int 270 __bt_seqadv(t, ep, flags) 271 BTREE *t; 272 EPG *ep; 273 int flags; 274 { 275 CURSOR *c; 276 PAGE *h; 277 indx_t idx; 278 db_pgno_t pg; 279 int exact, rval; 280 281 /* 282 * There are a couple of states that we can be in. The cursor has 283 * been initialized by the time we get here, but that's all we know. 284 */ 285 c = &t->bt_cursor; 286 287 /* 288 * The cursor was deleted and there weren't any duplicate records, 289 * so the cursor's key was saved. Find out where that key would 290 * be in the current tree. If the returned key is an exact match, 291 * it means that a key/data pair was inserted into the tree after 292 * the delete. We could reasonably return the key, but the problem 293 * is that this is the access pattern we'll see if the user is 294 * doing seq(..., R_NEXT)/put(..., 0) pairs, i.e. the put deletes 295 * the cursor record and then replaces it, so the cursor was saved, 296 * and we'll simply return the same "new" record until the user 297 * notices and doesn't do a put() of it. Since the key is an exact 298 * match, we could as easily put the new record before the cursor, 299 * and we've made no guarantee to return it. So, move forward or 300 * back a record if it's an exact match. 301 * 302 * XXX 303 * In the current implementation, put's to the cursor are done with 304 * delete/add pairs. This has two consequences. First, it means 305 * that seq(..., R_NEXT)/put(..., R_CURSOR) pairs are going to exhibit 306 * the same behavior as above. Second, you can return the same key 307 * twice if you have duplicate records. The scenario is that the 308 * cursor record is deleted, moving the cursor forward or backward 309 * to a duplicate. The add then inserts the new record at a location 310 * ahead of the cursor because duplicates aren't sorted in any way, 311 * and the new record is later returned. This has to be fixed at some 312 * point. 313 */ 314 if (F_ISSET(c, CURS_ACQUIRE)) { 315 if ((rval = __bt_first(t, &c->key, ep, &exact)) == RET_ERROR) 316 return (RET_ERROR); 317 if (!exact) 318 return (rval); 319 /* 320 * XXX 321 * Kluge -- get, release, get the page. 322 */ 323 c->pg.pgno = ep->page->pgno; 324 c->pg.index = ep->index; 325 mpool_put(t->bt_mp, ep->page, 0); 326 } 327 328 /* Get the page referenced by the cursor. */ 329 if ((h = mpool_get(t->bt_mp, c->pg.pgno, 0)) == NULL) 330 return (RET_ERROR); 331 332 /* 333 * Find the next/previous record in the tree and point the cursor at 334 * it. The cursor may not be moved until a new key has been found. 335 */ 336 switch (flags) { 337 case R_NEXT: /* Next record. */ 338 /* 339 * The cursor was deleted in duplicate records, and moved 340 * forward to a record that has yet to be returned. Clear 341 * that flag, and return the record. 342 */ 343 if (F_ISSET(c, CURS_AFTER)) 344 goto usecurrent; 345 idx = c->pg.index; 346 if (++idx == NEXTINDEX(h)) { 347 pg = h->nextpg; 348 mpool_put(t->bt_mp, h, 0); 349 if (pg == P_INVALID) 350 return (RET_SPECIAL); 351 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) 352 return (RET_ERROR); 353 idx = 0; 354 } 355 break; 356 case R_PREV: /* Previous record. */ 357 /* 358 * The cursor was deleted in duplicate records, and moved 359 * backward to a record that has yet to be returned. Clear 360 * that flag, and return the record. 361 */ 362 if (F_ISSET(c, CURS_BEFORE)) { 363 usecurrent: F_CLR(c, CURS_AFTER | CURS_BEFORE); 364 ep->page = h; 365 ep->index = c->pg.index; 366 return (RET_SUCCESS); 367 } 368 idx = c->pg.index; 369 if (idx == 0) { 370 pg = h->prevpg; 371 mpool_put(t->bt_mp, h, 0); 372 if (pg == P_INVALID) 373 return (RET_SPECIAL); 374 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) 375 return (RET_ERROR); 376 idx = NEXTINDEX(h) - 1; 377 } else 378 --idx; 379 break; 380 } 381 382 ep->page = h; 383 ep->index = idx; 384 return (RET_SUCCESS); 385 } 386 387 /* 388 * __bt_first -- 389 * Find the first entry. 390 * 391 * Parameters: 392 * t: the tree 393 * key: the key 394 * erval: return EPG 395 * exactp: pointer to exact match flag 396 * 397 * Returns: 398 * The first entry in the tree greater than or equal to key, 399 * or RET_SPECIAL if no such key exists. 400 */ 401 static int 402 __bt_first(t, key, erval, exactp) 403 BTREE *t; 404 const DBT *key; 405 EPG *erval; 406 int *exactp; 407 { 408 PAGE *h; 409 EPG *ep, save; 410 db_pgno_t pg; 411 412 /* 413 * Find any matching record; __bt_search pins the page. 414 * 415 * If it's an exact match and duplicates are possible, walk backwards 416 * in the tree until we find the first one. Otherwise, make sure it's 417 * a valid key (__bt_search may return an index just past the end of a 418 * page) and return it. 419 */ 420 if ((ep = __bt_search(t, key, exactp)) == NULL) 421 return (RET_SPECIAL); 422 if (*exactp) { 423 if (F_ISSET(t, B_NODUPS)) { 424 *erval = *ep; 425 return (RET_SUCCESS); 426 } 427 428 /* 429 * Walk backwards, as long as the entry matches and there are 430 * keys left in the tree. Save a copy of each match in case 431 * we go too far. 432 */ 433 save = *ep; 434 h = ep->page; 435 do { 436 if (save.page->pgno != ep->page->pgno) { 437 mpool_put(t->bt_mp, save.page, 0); 438 save = *ep; 439 } else 440 save.index = ep->index; 441 442 /* 443 * Don't unpin the page the last (or original) match 444 * was on, but make sure it's unpinned if an error 445 * occurs. 446 */ 447 if (ep->index == 0) { 448 if (h->prevpg == P_INVALID) 449 break; 450 if (h->pgno != save.page->pgno) 451 mpool_put(t->bt_mp, h, 0); 452 if ((h = mpool_get(t->bt_mp, 453 h->prevpg, 0)) == NULL) { 454 if (h->pgno == save.page->pgno) 455 mpool_put(t->bt_mp, 456 save.page, 0); 457 return (RET_ERROR); 458 } 459 ep->page = h; 460 ep->index = NEXTINDEX(h); 461 } 462 --ep->index; 463 } while (__bt_cmp(t, key, ep) == 0); 464 465 /* 466 * Reach here with the last page that was looked at pinned, 467 * which may or may not be the same as the last (or original) 468 * match page. If it's not useful, release it. 469 */ 470 if (h->pgno != save.page->pgno) 471 mpool_put(t->bt_mp, h, 0); 472 473 *erval = save; 474 return (RET_SUCCESS); 475 } 476 477 /* If at the end of a page, find the next entry. */ 478 if (ep->index == NEXTINDEX(ep->page)) { 479 h = ep->page; 480 pg = h->nextpg; 481 mpool_put(t->bt_mp, h, 0); 482 if (pg == P_INVALID) 483 return (RET_SPECIAL); 484 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) 485 return (RET_ERROR); 486 ep->index = 0; 487 ep->page = h; 488 } 489 *erval = *ep; 490 return (RET_SUCCESS); 491 } 492 493 /* 494 * __bt_setcur -- 495 * Set the cursor to an entry in the tree. 496 * 497 * Parameters: 498 * t: the tree 499 * pgno: page number 500 * index: page index 501 */ 502 void 503 __bt_setcur(t, pgno, idx) 504 BTREE *t; 505 db_pgno_t pgno; 506 u_int idx; 507 { 508 /* Lose any already deleted key. */ 509 if (t->bt_cursor.key.data != NULL) { 510 free(t->bt_cursor.key.data); 511 t->bt_cursor.key.size = 0; 512 t->bt_cursor.key.data = NULL; 513 } 514 F_CLR(&t->bt_cursor, CURS_ACQUIRE | CURS_AFTER | CURS_BEFORE); 515 516 /* Update the cursor. */ 517 t->bt_cursor.pg.pgno = pgno; 518 t->bt_cursor.pg.index = idx; 519 F_SET(&t->bt_cursor, CURS_INIT); 520 } 521 522 /* Recursive descent cursor. */ 523 typedef struct rcursor_ { 524 CURSOR cursor; 525 size_t ssize; 526 EPGNO *stack; 527 EPGNO *sp; 528 } RCURSOR; 529 #define RCURSOR_MINSS 64 530 531 static int bt_rcinit(void **); 532 static void bt_rcdestroy(void **); 533 static int bt_rcpush(RCURSOR *, db_pgno_t, u_int); 534 static EPGNO *bt_rcpop(RCURSOR *); 535 static void bt_rcclr(RCURSOR *); 536 static int bt_rcgrowstk(RCURSOR *); 537 static int bt_rseqset(BTREE *, EPG *, DBT *, RCURSOR *, int); 538 static int bt_rseqadv(BTREE *, EPG *, RCURSOR *, int); 539 540 static int 541 bt_rcinit(curs) 542 void **curs; 543 { 544 RCURSOR *rc; 545 546 rc = *curs = malloc(sizeof(RCURSOR)); 547 if (rc == NULL) { 548 errno = ENOMEM; 549 return RET_ERROR; 550 } 551 memset(rc, 0, sizeof(*rc)); 552 553 rc->ssize = RCURSOR_MINSS; 554 rc->stack = malloc(rc->ssize * sizeof(EPGNO)); 555 if (rc->stack == NULL) { 556 free(rc); 557 errno = ENOMEM; 558 return RET_ERROR; 559 } 560 bt_rcclr(rc); 561 return RET_SUCCESS; 562 } 563 564 static void 565 bt_rcdestroy(curs) 566 void **curs; 567 { 568 RCURSOR *rc; 569 570 rc = *curs; 571 free(rc->stack); 572 free(rc); 573 *curs = NULL; 574 } 575 576 static int 577 bt_rcpush(rc, p, i) 578 RCURSOR *rc; 579 db_pgno_t p; 580 u_int i; 581 { 582 int status; 583 584 rc->sp->pgno = p; 585 rc->sp->index = i; 586 if (++rc->sp > rc->stack + rc->ssize) { 587 status = bt_rcgrowstk(rc); 588 if (status != RET_SUCCESS) 589 return status; 590 } 591 return RET_SUCCESS; 592 } 593 594 static EPGNO * 595 bt_rcpop(rc) 596 RCURSOR *rc; 597 { 598 return (rc->sp == rc->stack) ? NULL : --rc->sp; 599 } 600 601 static void 602 bt_rcclr(rc) 603 RCURSOR *rc; 604 { 605 rc->sp = rc->stack; 606 } 607 608 static int 609 bt_rcgrowstk(rc) 610 RCURSOR *rc; 611 { 612 size_t osize; 613 EPGNO *e; 614 615 osize = rc->ssize; 616 rc->ssize *= 2; 617 e = realloc(rc->stack, rc->ssize * sizeof(EPGNO)); 618 if (e == NULL) { 619 rc->ssize = osize; 620 errno = ENOMEM; 621 return RET_ERROR; 622 } 623 rc->stack = e; 624 return RET_SUCCESS; 625 } 626 627 /* 628 * bt_rseq -- 629 * Like __bt_seq but does recursive descent tree traversal 630 * instead of using the prev/next pointers. 631 */ 632 int 633 bt_rseq(dbp, key, data, curs, flags) 634 const DB *dbp; 635 DBT *key, *data; 636 void **curs; 637 u_int flags; 638 { 639 RCURSOR *rc; 640 BTREE *t; 641 EPG e; 642 int status; 643 644 t = dbp->internal; 645 646 /* Toss any page pinned across calls. */ 647 if (t->bt_pinned != NULL) { 648 mpool_put(t->bt_mp, t->bt_pinned, 0); 649 t->bt_pinned = NULL; 650 } 651 652 if (curs == NULL) { 653 errno = EINVAL; 654 return RET_ERROR; 655 } 656 if (*curs == NULL) { 657 status = bt_rcinit(curs); 658 if (status != RET_SUCCESS) 659 return RET_ERROR; 660 } 661 rc = *curs; 662 663 /* 664 * If scan unitialized as yet, or starting at a specific record, set 665 * the scan to a specific key. Both bt_rseqset and bt_rseqadv pin 666 * the page the cursor references if they're successful. 667 */ 668 switch (flags) { 669 case R_NEXT: 670 case R_PREV: 671 if (F_ISSET(&rc->cursor, CURS_INIT)) { 672 status = bt_rseqadv(t, &e, rc, flags); 673 break; 674 } 675 /* FALLTHROUGH */ 676 case R_FIRST: 677 case R_LAST: 678 case R_CURSOR: 679 status = bt_rseqset(t, &e, key, rc, flags); 680 break; 681 default: 682 errno = EINVAL; 683 return (RET_ERROR); 684 } 685 686 if (status == RET_SUCCESS) { 687 status = 688 __bt_ret(t, &e, key, &t->bt_rkey, data, &t->bt_rdata, 0); 689 690 /* 691 * If the user is doing concurrent access, we copied the 692 * key/data, toss the page. 693 */ 694 if (F_ISSET(t, B_DB_LOCK)) 695 mpool_put(t->bt_mp, e.page, 0); 696 else 697 t->bt_pinned = e.page; 698 } else if (status == RET_SPECIAL) 699 bt_rcdestroy(curs); 700 return (status); 701 } 702 703 /* 704 * bt_rseqset -- 705 * Set the sequential scan to a specific key. 706 * 707 * Parameters: 708 * t: tree 709 * ep: storage for returned key 710 * key: key for initial scan position 711 * rc: recursion cursor 712 * flags: R_CURSOR, R_FIRST, R_LAST, R_NEXT, R_PREV 713 * 714 * Side effects: 715 * Pins the page the cursor references. 716 * Updates rc's stack and cursor. 717 * 718 * Returns: 719 * RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key. 720 */ 721 static int 722 bt_rseqset(t, ep, key, rc, flags) 723 BTREE *t; 724 EPG *ep; 725 DBT *key; 726 RCURSOR *rc; 727 int flags; 728 { 729 PAGE *h; 730 db_pgno_t pg; 731 int status; 732 733 /* 734 * Find the first, last or specific key in the tree and point the 735 * cursor at it. The cursor may not be moved until a new key has 736 * been found. 737 */ 738 switch (flags) { 739 case R_CURSOR: /* Not implemented. */ 740 errno = EINVAL; 741 return RET_ERROR; 742 case R_FIRST: /* First record. */ 743 case R_NEXT: 744 bt_rcclr(rc); 745 /* Walk down the left-hand side of the tree. */ 746 for (pg = P_ROOT;;) { 747 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) 748 return (RET_ERROR); 749 750 /* Check for an empty tree. */ 751 if (NEXTINDEX(h) == 0) { 752 mpool_put(t->bt_mp, h, 0); 753 return (RET_SPECIAL); 754 } 755 756 if (h->flags & (P_BLEAF | P_RLEAF)) 757 break; 758 pg = GETBINTERNAL(h, 0)->pgno; 759 status = bt_rcpush(rc, h->pgno, 0); 760 mpool_put(t->bt_mp, h, 0); 761 if (status != RET_SUCCESS) 762 return status; 763 } 764 ep->page = h; 765 ep->index = 0; 766 break; 767 case R_LAST: /* Last record. */ 768 case R_PREV: 769 bt_rcclr(rc); 770 /* Walk down the right-hand side of the tree. */ 771 for (pg = P_ROOT;;) { 772 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) 773 return (RET_ERROR); 774 775 /* Check for an empty tree. */ 776 if (NEXTINDEX(h) == 0) { 777 mpool_put(t->bt_mp, h, 0); 778 return (RET_SPECIAL); 779 } 780 781 if (h->flags & (P_BLEAF | P_RLEAF)) 782 break; 783 pg = GETBINTERNAL(h, NEXTINDEX(h) - 1)->pgno; 784 status = bt_rcpush(rc, h->pgno, NEXTINDEX(h) - 1); 785 mpool_put(t->bt_mp, h, 0); 786 if (status != RET_SUCCESS) 787 return status; 788 } 789 ep->page = h; 790 ep->index = NEXTINDEX(h) - 1; 791 break; 792 } 793 rc->cursor.pg.pgno = ep->page->pgno; 794 rc->cursor.pg.index = ep->index; 795 F_CLR(&rc->cursor, CURS_ACQUIRE | CURS_AFTER | CURS_BEFORE); 796 F_SET(&rc->cursor, CURS_INIT); 797 return (RET_SUCCESS); 798 } 799 800 /* 801 * bt_rseqadvance -- 802 * Advance the sequential scan. 803 * 804 * Parameters: 805 * t: tree 806 * ep: return page 807 * rc: recursion cursor 808 * flags: R_NEXT, R_PREV 809 * 810 * Side effects: 811 * Pins the page the new key/data record is on. 812 * Updates rc's stack and cursor. 813 * 814 * Returns: 815 * RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key. 816 */ 817 static int 818 bt_rseqadv(t, ep, rc, flags) 819 BTREE *t; 820 EPG *ep; 821 RCURSOR *rc; 822 int flags; 823 { 824 CURSOR *c; 825 PAGE *h; 826 indx_t idx; 827 db_pgno_t pg; 828 int status; 829 EPGNO *e; 830 831 /* 832 * There are a couple of states that we can be in. The cursor has 833 * been initialized by the time we get here, but that's all we know. 834 */ 835 c = &rc->cursor; 836 837 /* Get the page referenced by the cursor. */ 838 if ((h = mpool_get(t->bt_mp, c->pg.pgno, 0)) == NULL) 839 return (RET_ERROR); 840 841 /* 842 * Find the next/previous record in the tree and point the cursor at 843 * it. The cursor may not be moved until a new key has been found. 844 */ 845 switch (flags) { 846 case R_NEXT: /* Next record. */ 847 idx = c->pg.index; 848 while (++idx == NEXTINDEX(h)) { 849 /* Crawl up if we hit the right edge. */ 850 e = bt_rcpop(rc); 851 mpool_put(t->bt_mp, h, 0); 852 if (e == NULL) /* Hit the right edge of root. */ 853 return RET_SPECIAL; 854 idx = e->index; 855 pg = e->pgno; 856 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) 857 return (RET_ERROR); 858 } 859 while (!(h->flags & (P_BLEAF | P_RLEAF))) { 860 /* Crawl down the left until we hit a leaf. */ 861 status = bt_rcpush(rc, h->pgno, idx); 862 pg = GETBINTERNAL(h, idx)->pgno; 863 mpool_put(t->bt_mp, h, 0); 864 if (status != RET_SUCCESS) 865 return status; 866 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) 867 return (RET_ERROR); 868 idx = 0; 869 } 870 break; 871 case R_PREV: /* Previous record. */ 872 idx = c->pg.index; 873 while (!idx) { 874 /* Crawl up if we hit the left edge. */ 875 e = bt_rcpop(rc); 876 mpool_put(t->bt_mp, h, 0); 877 if (e == NULL) /* Hit the left edge of root. */ 878 return RET_SPECIAL; 879 idx = e->index; 880 pg = e->pgno; 881 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) 882 return (RET_ERROR); 883 } 884 idx--; 885 while (!(h->flags & (P_BLEAF | P_RLEAF))) { 886 /* Crawl down the right until we hit a leaf. */ 887 status = bt_rcpush(rc, h->pgno, idx); 888 pg = GETBINTERNAL(h, idx)->pgno; 889 mpool_put(t->bt_mp, h, 0); 890 if (status != RET_SUCCESS) 891 return status; 892 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) 893 return (RET_ERROR); 894 idx = NEXTINDEX(h) - 1; 895 } 896 break; 897 } 898 899 ep->page = h; 900 ep->index = idx; 901 c->pg.pgno = h->pgno; 902 c->pg.index = idx; 903 F_CLR(c, CURS_ACQUIRE | CURS_AFTER | CURS_BEFORE); 904 F_SET(c, CURS_INIT); 905 return (RET_SUCCESS); 906 } 907