1 /*- 2 * Copyright (c) 1990 The Regents of the University of California. 3 * All rights reserved. 4 * 5 * This code is derived from software contributed to Berkeley by 6 * Mike Olson. 7 * 8 * %sccs.include.redist.c% 9 */ 10 11 #if defined(LIBC_SCCS) && !defined(lint) 12 static char sccsid[] = "@(#)bt_open.c 5.9 (Berkeley) 05/07/91"; 13 #endif /* LIBC_SCCS and not lint */ 14 15 /* 16 * btree.c -- implementation of btree access method for 4.4BSD. 17 * 18 * The design here is based on that of the btree access method used 19 * in the Postgres database system at UC Berkeley. The implementation 20 * is wholly independent of the Postgres code. 21 * 22 * This implementation supports btrees on disk (supply a filename) or 23 * in memory (don't). Public interfaces defined here are: 24 * 25 * btree_open() -- wrapper; returns a filled DB struct for 26 * a btree. 27 * 28 * bt_open() -- open a new or existing btree. 29 * bt_get() -- fetch data from a tree by key. 30 * bt_seq() -- do a sequential scan on a tree. 31 * bt_put() -- add data to a tree by key. 32 * bt_delete() -- remove data from a tree by key. 33 * bt_close() -- close a btree. 34 * bt_sync() -- force btree pages to disk (disk trees only). 35 */ 36 37 #include <sys/param.h> 38 #include <sys/stat.h> 39 #include <signal.h> 40 #include <errno.h> 41 #include <fcntl.h> 42 #include <db.h> 43 #include <stdlib.h> 44 #include <string.h> 45 #include <unistd.h> 46 #include "btree.h" 47 48 BTREEINFO _DefaultBTInfo = { 49 0, /* flags */ 50 0, /* cachesize */ 51 0, /* psize */ 52 strcmp, /* compare */ 53 0 54 }; 55 56 /* 57 * BTREE_OPEN -- Wrapper routine to open a btree. 58 * 59 * Creates and fills a DB struct, and calls the routine that actually 60 * opens the btree. 61 * 62 * Parameters: 63 * f: filename to open 64 * flags: flag bits passed to open 65 * mode: permission bits, used if O_CREAT specified 66 * b: BTREEINFO pointer 67 * 68 * Returns: 69 * Filled-in DBT on success; NULL on failure, with errno 70 * set as appropriate. 71 * 72 * Side Effects: 73 * Allocates memory for the DB struct. 74 */ 75 76 DB * 77 btree_open(f, flags, mode, b) 78 const char *f; 79 int flags; 80 int mode; 81 const BTREEINFO *b; 82 { 83 DB *db; 84 BTREE t; 85 86 if ((db = (DB *) malloc((unsigned) sizeof(DB))) == (DB *) NULL) 87 return ((DB *) NULL); 88 89 if ((t = bt_open(f, flags, mode, b)) == (BTREE) NULL) { 90 (void) free ((char *) db); 91 return ((DB *) NULL); 92 } 93 94 db->internal = (char *) t; 95 db->close = bt_close; 96 db->del = bt_delete; 97 db->get = bt_get; 98 db->put = bt_put; 99 db->seq = bt_seq; 100 db->sync = bt_sync; 101 db->type = DB_BTREE; 102 103 return (db); 104 } 105 106 /* 107 * BT_OPEN -- Open a btree. 108 * 109 * This routine creates the correct kind (disk or in-memory) of 110 * btree and initializes it as required. 111 * 112 * Parameters: 113 * f -- name of btree (NULL for in-memory btrees) 114 * flags -- flags passed to open() 115 * mode -- mode passed to open () 116 * b -- BTREEINFO structure, describing btree 117 * 118 * Returns: 119 * (Opaque) pointer to the btree. On failure, returns NULL 120 * with errno set as appropriate. 121 * 122 * Side Effects: 123 * Allocates memory, opens files. 124 */ 125 126 BTREE 127 bt_open(f, flags, mode, b) 128 char *f; 129 int flags; 130 int mode; 131 BTREEINFO *b; 132 { 133 BTREE_P t; 134 HTABLE ht; 135 int nbytes; 136 int fd; 137 CURSOR *c; 138 BTMETA m; 139 struct stat buf; 140 141 /* use the default info if none was provided */ 142 if (b == (BTREEINFO *) NULL) 143 b = &_DefaultBTInfo; 144 145 if ((t = (BTREE_P) malloc((unsigned) sizeof *t)) == (BTREE_P) NULL) 146 return ((BTREE) NULL); 147 148 if (b->compare) 149 t->bt_compare = b->compare; 150 else 151 t->bt_compare = strcmp; 152 153 t->bt_fname = f; 154 t->bt_curpage = (BTHEADER *) NULL; 155 t->bt_free = P_NONE; 156 c = &(t->bt_cursor); 157 c->c_pgno = P_NONE; 158 c->c_index = 0; 159 c->c_flags = (u_char) NULL; 160 c->c_key = (char *) NULL; 161 t->bt_stack = (BTSTACK *) NULL; 162 t->bt_flags = 0; 163 164 /* 165 * If no file name was supplied, this is an in-memory btree. 166 * Otherwise, it's a disk-based btree. 167 */ 168 if (f == (char *) NULL) { 169 /* in memory */ 170 if ((t->bt_psize = b->psize) < MINPSIZE) { 171 if (t->bt_psize != 0) { 172 (void) free ((char *) t); 173 errno = EINVAL; 174 return ((BTREE) NULL); 175 } 176 t->bt_psize = getpagesize(); 177 } 178 179 nbytes = HTSIZE * sizeof(HTBUCKET *); 180 if ((ht = (HTABLE) malloc((unsigned) nbytes)) 181 == (HTABLE) NULL) { 182 (void) free((char *) t); 183 return ((BTREE) NULL); 184 } 185 (void) bzero((char *) ht, nbytes); 186 t->bt_s.bt_ht = ht; 187 t->bt_npages = 0; 188 t->bt_lorder = BYTE_ORDER; 189 if (!(b->flags & R_DUP)) 190 t->bt_flags |= BTF_NODUPS; 191 } else { 192 /* on disk */ 193 if ((fd = open(f, O_RDONLY, 0)) < 0) { 194 /* doesn't exist yet, be sure page is big enough */ 195 if ((t->bt_psize = b->psize) < sizeof(BTHEADER) 196 && b->psize != 0) { 197 (void) free((char *) t); 198 errno = EINVAL; 199 return ((BTREE) NULL); 200 } 201 if (b->lorder == 0) 202 b->lorder = BYTE_ORDER; 203 204 if (b->lorder != BIG_ENDIAN 205 && b->lorder != LITTLE_ENDIAN) { 206 (void) free((char *) t); 207 errno = EINVAL; 208 return ((BTREE) NULL); 209 } 210 t->bt_lorder = b->lorder; 211 if (!(b->flags & R_DUP)) 212 t->bt_flags |= BTF_NODUPS; 213 } else { 214 /* exists, get page size from file */ 215 if (read(fd, &m, sizeof(m)) < sizeof(m)) { 216 (void) close(fd); 217 (void) free((char *) t); 218 errno = EINVAL; 219 return ((BTREE) NULL); 220 } 221 222 /* lorder always stored in host-independent format */ 223 NTOHL(m.m_lorder); 224 if (m.m_lorder != BIG_ENDIAN 225 && m.m_lorder != LITTLE_ENDIAN) { 226 (void) free((char *) t); 227 errno = EINVAL; 228 return ((BTREE) NULL); 229 } 230 t->bt_lorder = m.m_lorder; 231 232 if (t->bt_lorder != BYTE_ORDER) { 233 BLSWAP(m.m_magic); 234 BLSWAP(m.m_version); 235 BLSWAP(m.m_psize); 236 BLSWAP(m.m_free); 237 BLSWAP(m.m_flags); 238 } 239 240 if (m.m_magic != BTREEMAGIC 241 || m.m_version != BTREEVERSION 242 || m.m_psize < MINPSIZE) { 243 (void) close(fd); 244 (void) free((char *) t); 245 #ifndef EFTYPE 246 #define EFTYPE -100 247 #endif 248 errno = EFTYPE; 249 return ((BTREE) NULL); 250 } 251 t->bt_psize = m.m_psize; 252 t->bt_free = m.m_free; 253 t->bt_flags |= (m.m_flags & BTF_NODUPS) | BTF_METAOK; 254 (void) close(fd); 255 } 256 257 /* now open the file the way the user wanted it */ 258 if ((t->bt_s.bt_d.d_fd = open(f, flags, mode)) < 0) { 259 (void) free ((char *) t); 260 return ((BTREE) NULL); 261 } 262 263 /* access method files are always close-on-exec */ 264 if (fcntl(t->bt_s.bt_d.d_fd, F_SETFL, 1) == -1) { 265 (void) close(t->bt_s.bt_d.d_fd); 266 (void) free ((char *) t); 267 return ((BTREE) NULL); 268 } 269 270 /* get number of pages, page size if necessary */ 271 (void) fstat(t->bt_s.bt_d.d_fd, &buf); 272 if (t->bt_psize == 0) 273 t->bt_psize = buf.st_blksize; 274 t->bt_npages = (pgno_t) (buf.st_size / t->bt_psize); 275 276 /* page zero is metadata, doesn't count */ 277 if (t->bt_npages > 0) 278 --(t->bt_npages); 279 280 if (b->cachesize == 0) 281 b->cachesize = DEFCACHE; 282 283 /* get an lru buffer cache, if the user asked for one */ 284 if ((b->cachesize / t->bt_psize) > 0) { 285 BTDISK *d = &(t->bt_s.bt_d); 286 287 d->d_cache = lruinit(d->d_fd, 288 (int) (b->cachesize / t->bt_psize), 289 (int) t->bt_psize, 290 (char *) t->bt_lorder, 291 _bt_pgin, _bt_pgout); 292 293 if (d->d_cache == (char *) NULL) { 294 (void) free((char *) t); 295 return ((BTREE) NULL); 296 } 297 } 298 } 299 300 /* remember if tree was opened for write */ 301 if (((flags & O_WRONLY) == O_WRONLY) 302 || ((flags & O_RDWR) == O_RDWR)) 303 t->bt_flags |= BTF_ISWRITE; 304 305 return ((BTREE) t); 306 } 307 308 /* 309 * BT_GET -- Get an entry from a btree. 310 * 311 * Does a key lookup in the tree to find the specified key, and returns 312 * the key/data pair found. 313 * 314 * Parameters: 315 * tree -- btree in which to do lookup 316 * key -- key to find 317 * data -- pointer to DBT in which to return data 318 * flag -- ignored 319 * 320 * Returns: 321 * RET_SUCCESS, RET_ERROR, or RET_SPECIAL if the key is not 322 * found. If key is not found, nothing is stored in the 323 * return DBT 'data'. 324 * 325 * Side Effects: 326 * None. 327 * 328 * Warnings: 329 * Return data is statically allocated, and will be overwritten 330 * at the next call. 331 */ 332 333 int 334 bt_get(dbp, key, data, flag) 335 DB *dbp; 336 DBT *key, *data; 337 u_long flag; 338 { 339 BTREE_P t = (BTREE_P) (dbp->internal); 340 BTHEADER *h; 341 DATUM *d; 342 BTITEM *item; 343 344 #ifdef lint 345 flag = flag; 346 #endif /* lint */ 347 348 /* lookup */ 349 item = _bt_search(t, key); 350 351 /* clear parent pointer stack */ 352 while (_bt_pop(t) != P_NONE) 353 continue; 354 355 if (item == (BTITEM *) NULL) 356 return (RET_ERROR); 357 358 h = (BTHEADER *) t->bt_curpage; 359 data->size = 0; 360 data->data = (u_char *) NULL; 361 362 /* match? */ 363 if (VALIDITEM(t, item) 364 && _bt_cmp(t, key->data, item->bti_index) == 0) { 365 d = (DATUM *) GETDATUM(h, item->bti_index); 366 return (_bt_buildret(t, d, data, key)); 367 } 368 369 /* nope */ 370 return (RET_SPECIAL); 371 } 372 373 /* 374 * BT_PUT -- Add an entry to a btree. 375 * 376 * The specified (key, data) pair is added to the tree. If the tree 377 * was created for unique keys only, then duplicates will not be 378 * entered. If the requested key exists in the tree, it will be over- 379 * written unless the flags parameter is R_NOOVERWRITE, in which case 380 * the update will not be done. If duplicate keys are permitted in the 381 * tree, duplicates will be inserted and will not overwrite existing 382 * keys. Nodes are split as required. 383 * 384 * Parameters: 385 * tree -- btree in which to put the new entry 386 * key -- key component to add 387 * data -- data corresponding to key 388 * flag -- R_NOOVERWRITE or zero. 389 * 390 * Returns: 391 * RET_SUCCESS, RET_ERROR, or RET_SPECIAL if the 392 * NOOVERWRITE flag was set and the specified key exists 393 * in the database. 394 * 395 * Side Effects: 396 * None. 397 */ 398 399 int 400 bt_put(dbp, key, data, flag) 401 DB *dbp; 402 DBT *key, *data; 403 u_long flag; 404 { 405 BTREE_P t; 406 BTITEM *item; 407 408 t = (BTREE_P)dbp->internal; 409 410 /* look for this key in the tree */ 411 item = _bt_search(t, key); 412 413 /* 414 * If this tree was originally created without R_DUP, then duplicate 415 * keys are not allowed. We need to check this at insertion time. 416 */ 417 418 if (VALIDITEM(t, item) && _bt_cmp(t, key->data, item->bti_index) == 0) { 419 if ((t->bt_flags & BTF_NODUPS) && flag == R_NOOVERWRITE) { 420 if (_bt_delone(t, item->bti_index) == RET_ERROR) { 421 while (_bt_pop(t) != P_NONE) 422 continue; 423 return (RET_ERROR); 424 } 425 } 426 } 427 428 return (_bt_insert(t, item, key, data, flag)); 429 } 430 431 /* 432 * BT_DELETE -- delete a key from the tree. 433 * 434 * Deletes all keys (and their associated data items) matching the 435 * supplied key from the tree. If the flags entry is R_CURSOR, then 436 * the current item in the active scan is deleted. 437 * 438 * Parameters: 439 * tree -- btree from which to delete key 440 * key -- key to delete 441 * flags -- R_CURSOR or zero 442 * 443 * Returns: 444 * RET_SUCCESS, RET_ERROR, or RET_SPECIAL if the specified 445 * key was not in the tree. 446 * 447 * Side Effects: 448 * None. 449 */ 450 451 int 452 bt_delete(dbp, key, flags) 453 DB *dbp; 454 DBT *key; 455 u_long flags; 456 { 457 BTREE_P t; 458 BTHEADER *h; 459 BTITEM *item; 460 int ndel = 0; 461 462 t = (BTREE_P)dbp->internal; 463 464 if (flags == R_CURSOR) 465 return (_bt_crsrdel(t)); 466 467 /* find the first matching key in the tree */ 468 item = _bt_first(t, key); 469 h = t->bt_curpage; 470 471 /* don't need the descent stack for deletes */ 472 while (_bt_pop(t) != P_NONE) 473 continue; 474 475 /* delete all matching keys */ 476 for (;;) { 477 while (VALIDITEM(t, item) 478 && (_bt_cmp(t, key->data, item->bti_index) == 0)) { 479 if (_bt_delone(t, item->bti_index) == RET_ERROR) 480 return (RET_ERROR); 481 ndel++; 482 } 483 484 if (VALIDITEM(t, item) || h->h_nextpg == P_NONE) 485 break; 486 487 /* next page, if necessary */ 488 do { 489 if (_bt_getpage(t, h->h_nextpg) == RET_ERROR) 490 return (RET_ERROR); 491 h = t->bt_curpage; 492 } while (NEXTINDEX(h) == 0 && h->h_nextpg != P_NONE); 493 494 item->bti_pgno = h->h_pgno; 495 item->bti_index = 0; 496 497 if (!VALIDITEM(t, item) 498 || _bt_cmp(t, key->data, item->bti_index) != 0) 499 break; 500 } 501 502 /* flush changes to disk */ 503 if (ISDISK(t)) { 504 if (h->h_flags & F_DIRTY) { 505 if (_bt_write(t, t->bt_curpage, NORELEASE) == RET_ERROR) 506 return (RET_ERROR); 507 } 508 } 509 510 if (ndel == 0) 511 return (RET_SPECIAL); 512 513 return (RET_SUCCESS); 514 } 515 516 /* 517 * BT_SYNC -- sync the btree to disk. 518 * 519 * Parameters: 520 * tree -- btree to sync. 521 * 522 * Returns: 523 * RET_SUCCESS, RET_ERROR. 524 */ 525 526 bt_sync(dbp) 527 DB *dbp; 528 { 529 BTREE_P t; 530 BTHEADER *h; 531 pgno_t pgno; 532 533 t = (BTREE_P)dbp->internal; 534 535 /* if this is an in-memory btree, syncing is a no-op */ 536 if (!ISDISK(t)) 537 return (RET_SUCCESS); 538 539 h = (BTHEADER *) t->bt_curpage; 540 h->h_flags &= ~F_DIRTY; 541 542 if (ISCACHE(t)) { 543 pgno = t->bt_curpage->h_pgno; 544 if (_bt_write(t, h, RELEASE) == RET_ERROR) 545 return(RET_ERROR); 546 if (lrusync(t->bt_s.bt_d.d_cache) < RET_ERROR) 547 return (RET_ERROR); 548 if (_bt_getpage(t, pgno) == RET_ERROR) 549 return (RET_ERROR); 550 } else { 551 if (_bt_write(t, h, NORELEASE) == RET_ERROR) 552 return (RET_ERROR); 553 } 554 555 return (fsync(t->bt_s.bt_d.d_fd)); 556 } 557 558 /* 559 * BT_SEQ -- Sequential scan interface. 560 * 561 * This routine supports sequential scans on the btree. If called with 562 * flags set to R_CURSOR, or if no seq scan has been initialized in the 563 * current tree, we initialize the scan. Otherwise, we advance the scan 564 * and return the next item. 565 * 566 * Scans can be either keyed or non-keyed. Keyed scans basically have 567 * a starting point somewhere in the middle of the tree. Non-keyed 568 * scans start at an endpoint. Also, scans can be backward (descending 569 * order), forward (ascending order), or no movement (keep returning 570 * the same item). 571 * 572 * Flags is checked every time we enter the routine, so the user can 573 * change directions on an active scan if desired. The key argument 574 * is examined only when we initialize the scan, in order to position 575 * it properly. 576 * 577 * Items are returned via the key and data arguments passed in. 578 * 579 * Parameters: 580 * tree -- btree in which to do scan 581 * key -- key, used to position scan on initialization, and 582 * used to return key components to the user. 583 * data -- used to return data components to the user. 584 * flags -- specify R_CURSOR, R_FIRST, R_LAST, R_NEXT, or 585 * R_PREV. 586 * 587 * Returns: 588 * RET_SUCCESS, RET_ERROR, or RET_SPECIAL if no more data 589 * exists in the tree in the specified direction. 590 * 591 * Side Effects: 592 * Changes the btree's notion of the current position in the 593 * scan. 594 * 595 * Warnings: 596 * The key and data items returned are static and will be 597 * overwritten by the next bt_get or bt_seq. 598 */ 599 600 int 601 bt_seq(dbp, key, data, flags) 602 DB *dbp; 603 DBT *key, *data; 604 u_long flags; 605 { 606 BTREE_P t; 607 BTHEADER *h; 608 DATUM *d; 609 int status; 610 611 t = (BTREE_P)dbp->internal; 612 613 /* do we need to initialize the scan? */ 614 if (flags == R_CURSOR || flags == R_LAST || flags == R_FIRST 615 || !(t->bt_flags & BTF_SEQINIT)) { 616 617 /* initialize it */ 618 status = _bt_seqinit(t, key, flags); 619 } else { 620 /* just advance the current scan pointer */ 621 status = _bt_seqadvance(t, flags); 622 } 623 624 key->size = data->size = 0; 625 key->data = data->data = (u_char *) NULL; 626 627 h = t->bt_curpage; 628 629 /* is there a valid item at the current scan location? */ 630 if (status == RET_SPECIAL) { 631 if (flags == R_NEXT) { 632 if (t->bt_cursor.c_index >= NEXTINDEX(h)) { 633 if (NEXTINDEX(h) > 0) 634 t->bt_cursor.c_index = NEXTINDEX(h) - 1; 635 else 636 t->bt_cursor.c_index = 0; 637 } 638 } else { 639 t->bt_cursor.c_index = 0; 640 } 641 return (RET_SPECIAL); 642 } else if (status == RET_ERROR) 643 return (RET_ERROR); 644 645 /* okay, return the data */ 646 d = (DATUM *) GETDATUM(h, t->bt_cursor.c_index); 647 648 return (_bt_buildret(t, d, data, key)); 649 } 650 651 /* 652 * BT_CLOSE -- Close a btree 653 * 654 * Parameters: 655 * tree -- tree to close 656 * 657 * Returns: 658 * RET_SUCCESS, RET_ERROR. 659 * 660 * Side Effects: 661 * Frees space occupied by the tree. 662 */ 663 664 int 665 bt_close(dbp) 666 DB *dbp; 667 { 668 struct HTBUCKET *b, *sb; 669 BTREE_P t; 670 BTHEADER *h; 671 HTABLE ht; 672 int fd, i; 673 char *cache; 674 675 t = (BTREE_P)dbp->internal; 676 677 if (t->bt_cursor.c_key != (char *) NULL) 678 (void) free(t->bt_cursor.c_key); 679 680 if (!ISDISK(t)) { 681 /* in-memory tree, release hash table memory */ 682 ht = t->bt_s.bt_ht; 683 for (i = 0; i < HTSIZE; i++) { 684 if ((b = ht[i]) == (struct HTBUCKET *) NULL) 685 break; 686 do { 687 sb = b; 688 (void) free((char *) b->ht_page); 689 b = b->ht_next; 690 (void) free((char *) sb); 691 } while (b != (struct HTBUCKET *) NULL); 692 } 693 (void) free ((char *) ht); 694 (void) free ((char *) t); 695 return (RET_SUCCESS); 696 } 697 698 if ((t->bt_flags & BTF_ISWRITE) && !(t->bt_flags & BTF_METAOK)) { 699 if (_bt_wrtmeta(t) == RET_ERROR) 700 return (RET_ERROR); 701 } 702 703 if (t->bt_curpage != (BTHEADER *) NULL) { 704 h = t->bt_curpage; 705 if (h->h_flags & F_DIRTY) { 706 if (_bt_write(t, h, RELEASE) == RET_ERROR) 707 return (RET_ERROR); 708 } else { 709 if (_bt_release(t, h) == RET_ERROR) 710 return (RET_ERROR); 711 } 712 713 /* flush and free the cache, if there is one */ 714 if (ISCACHE(t)) { 715 cache = t->bt_s.bt_d.d_cache; 716 if (lrusync(cache) == RET_ERROR) 717 return (RET_ERROR); 718 lrufree(cache); 719 } 720 (void) free ((char *) h); 721 } 722 723 fd = t->bt_s.bt_d.d_fd; 724 (void) free ((char *) t); 725 return (close(fd)); 726 } 727