1 /*- 2 * Copyright (c) 1990 The Regents of the University of California. 3 * All rights reserved. 4 * 5 * This code is derived from software contributed to Berkeley by 6 * Mike Olson. 7 * 8 * %sccs.include.redist.c% 9 */ 10 11 #if defined(LIBC_SCCS) && !defined(lint) 12 static char sccsid[] = "@(#)bt_open.c 5.7 (Berkeley) 03/03/91"; 13 #endif /* LIBC_SCCS and not lint */ 14 15 /* 16 * btree.c -- implementation of btree access method for 4.4BSD. 17 * 18 * The design here is based on that of the btree access method used 19 * in the Postgres database system at UC Berkeley. The implementation 20 * is wholly independent of the Postgres code. 21 * 22 * This implementation supports btrees on disk (supply a filename) or 23 * in memory (don't). Public interfaces defined here are: 24 * 25 * btree_open() -- wrapper; returns a filled DB struct for 26 * a btree. 27 * 28 * bt_open() -- open a new or existing btree. 29 * bt_get() -- fetch data from a tree by key. 30 * bt_seq() -- do a sequential scan on a tree. 31 * bt_put() -- add data to a tree by key. 32 * bt_delete() -- remove data from a tree by key. 33 * bt_close() -- close a btree. 34 * bt_sync() -- force btree pages to disk (disk trees only). 35 */ 36 37 #include <sys/param.h> 38 #include <sys/stat.h> 39 #include <signal.h> 40 #include <errno.h> 41 #include <fcntl.h> 42 #include <db.h> 43 #include <stdlib.h> 44 #include <string.h> 45 #include <unistd.h> 46 #include "btree.h" 47 48 BTREEINFO _DefaultBTInfo = { 49 0, /* flags */ 50 0, /* cachesize */ 51 0, /* psize */ 52 strcmp, /* compare */ 53 0 54 }; 55 56 /* 57 * BTREE_OPEN -- Wrapper routine to open a btree. 58 * 59 * Creates and fills a DB struct, and calls the routine that actually 60 * opens the btree. 61 * 62 * Parameters: 63 * f: filename to open 64 * flags: flag bits passed to open 65 * mode: permission bits, used if O_CREAT specified 66 * b: BTREEINFO pointer 67 * 68 * Returns: 69 * Filled-in DBT on success; NULL on failure, with errno 70 * set as appropriate. 71 * 72 * Side Effects: 73 * Allocates memory for the DB struct. 74 */ 75 76 DB * 77 btree_open(f, flags, mode, b) 78 const char *f; 79 int flags; 80 int mode; 81 const BTREEINFO *b; 82 { 83 DB *db; 84 BTREE t; 85 86 if ((db = (DB *) malloc((unsigned) sizeof(DB))) == (DB *) NULL) 87 return ((DB *) NULL); 88 89 if ((t = bt_open(f, flags, mode, b)) == (BTREE) NULL) { 90 (void) free ((char *) db); 91 return ((DB *) NULL); 92 } 93 94 db->internal = (char *) t; 95 db->close = bt_close; 96 db->del = bt_delete; 97 db->get = bt_get; 98 db->put = bt_put; 99 db->seq = bt_seq; 100 db->sync = bt_sync; 101 102 return (db); 103 } 104 105 /* 106 * BT_OPEN -- Open a btree. 107 * 108 * This routine creates the correct kind (disk or in-memory) of 109 * btree and initializes it as required. 110 * 111 * Parameters: 112 * f -- name of btree (NULL for in-memory btrees) 113 * flags -- flags passed to open() 114 * mode -- mode passed to open () 115 * b -- BTREEINFO structure, describing btree 116 * 117 * Returns: 118 * (Opaque) pointer to the btree. On failure, returns NULL 119 * with errno set as appropriate. 120 * 121 * Side Effects: 122 * Allocates memory, opens files. 123 */ 124 125 BTREE 126 bt_open(f, flags, mode, b) 127 char *f; 128 int flags; 129 int mode; 130 BTREEINFO *b; 131 { 132 BTREE_P t; 133 HTABLE ht; 134 int nbytes; 135 int fd; 136 CURSOR *c; 137 BTMETA m; 138 struct stat buf; 139 140 /* use the default info if none was provided */ 141 if (b == (BTREEINFO *) NULL) 142 b = &_DefaultBTInfo; 143 144 if ((t = (BTREE_P) malloc((unsigned) sizeof *t)) == (BTREE_P) NULL) 145 return ((BTREE) NULL); 146 147 if (b->compare) 148 t->bt_compare = b->compare; 149 else 150 t->bt_compare = strcmp; 151 152 t->bt_fname = f; 153 t->bt_curpage = (BTHEADER *) NULL; 154 t->bt_free = P_NONE; 155 c = &(t->bt_cursor); 156 c->c_pgno = P_NONE; 157 c->c_index = 0; 158 c->c_flags = (u_char) NULL; 159 c->c_key = (char *) NULL; 160 t->bt_stack = (BTSTACK *) NULL; 161 t->bt_flags = 0; 162 163 /* 164 * If no file name was supplied, this is an in-memory btree. 165 * Otherwise, it's a disk-based btree. 166 */ 167 if (f == (char *) NULL) { 168 /* in memory */ 169 if ((t->bt_psize = b->psize) < MINPSIZE) { 170 if (t->bt_psize != 0) { 171 (void) free ((char *) t); 172 errno = EINVAL; 173 return ((BTREE) NULL); 174 } 175 t->bt_psize = getpagesize(); 176 } 177 178 nbytes = HTSIZE * sizeof(HTBUCKET *); 179 if ((ht = (HTABLE) malloc((unsigned) nbytes)) 180 == (HTABLE) NULL) { 181 (void) free((char *) t); 182 return ((BTREE) NULL); 183 } 184 (void) bzero((char *) ht, nbytes); 185 t->bt_s.bt_ht = ht; 186 t->bt_npages = 0; 187 t->bt_lorder = BYTE_ORDER; 188 if (!(b->flags & R_DUP)) 189 t->bt_flags |= BTF_NODUPS; 190 } else { 191 /* on disk */ 192 if ((fd = open(f, O_RDONLY, 0)) < 0) { 193 /* doesn't exist yet, be sure page is big enough */ 194 if ((t->bt_psize = b->psize) < sizeof(BTHEADER) 195 && b->psize != 0) { 196 (void) free((char *) t); 197 errno = EINVAL; 198 return ((BTREE) NULL); 199 } 200 if (b->lorder == 0) 201 b->lorder = BYTE_ORDER; 202 203 if (b->lorder != BIG_ENDIAN 204 && b->lorder != LITTLE_ENDIAN) { 205 (void) free((char *) t); 206 errno = EINVAL; 207 return ((BTREE) NULL); 208 } 209 t->bt_lorder = b->lorder; 210 if (!(b->flags & R_DUP)) 211 t->bt_flags |= BTF_NODUPS; 212 } else { 213 /* exists, get page size from file */ 214 if (read(fd, &m, sizeof(m)) < sizeof(m)) { 215 (void) close(fd); 216 (void) free((char *) t); 217 errno = EINVAL; 218 return ((BTREE) NULL); 219 } 220 221 /* lorder always stored in host-independent format */ 222 NTOHL(m.m_lorder); 223 if (m.m_lorder != BIG_ENDIAN 224 && m.m_lorder != LITTLE_ENDIAN) { 225 (void) free((char *) t); 226 errno = EINVAL; 227 return ((BTREE) NULL); 228 } 229 t->bt_lorder = m.m_lorder; 230 231 if (t->bt_lorder != BYTE_ORDER) { 232 BLSWAP(m.m_magic); 233 BLSWAP(m.m_version); 234 BLSWAP(m.m_psize); 235 BLSWAP(m.m_free); 236 BLSWAP(m.m_flags); 237 } 238 239 if (m.m_magic != BTREEMAGIC 240 || m.m_version != BTREEVERSION 241 || m.m_psize < MINPSIZE) { 242 (void) close(fd); 243 (void) free((char *) t); 244 #ifndef EFTYPE 245 #define EFTYPE -100 246 #endif 247 errno = EFTYPE; 248 return ((BTREE) NULL); 249 } 250 t->bt_psize = m.m_psize; 251 t->bt_free = m.m_free; 252 t->bt_flags |= (m.m_flags & BTF_NODUPS) | BTF_METAOK; 253 (void) close(fd); 254 } 255 256 /* now open the file the way the user wanted it */ 257 if ((t->bt_s.bt_d.d_fd = open(f, flags, mode)) < 0) { 258 (void) free ((char *) t); 259 return ((BTREE) NULL); 260 } 261 262 /* access method files are always close-on-exec */ 263 if (fcntl(t->bt_s.bt_d.d_fd, F_SETFL, 1) == -1) { 264 (void) close(t->bt_s.bt_d.d_fd); 265 (void) free ((char *) t); 266 return ((BTREE) NULL); 267 } 268 269 /* get number of pages, page size if necessary */ 270 (void) fstat(t->bt_s.bt_d.d_fd, &buf); 271 if (t->bt_psize == 0) 272 t->bt_psize = buf.st_blksize; 273 t->bt_npages = (pgno_t) (buf.st_size / t->bt_psize); 274 275 /* page zero is metadata, doesn't count */ 276 if (t->bt_npages > 0) 277 --(t->bt_npages); 278 279 if (b->cachesize == 0) 280 b->cachesize = DEFCACHE; 281 282 /* get an lru buffer cache, if the user asked for one */ 283 if ((b->cachesize / t->bt_psize) > 0) { 284 BTDISK *d = &(t->bt_s.bt_d); 285 286 d->d_cache = lruinit(d->d_fd, 287 (int) (b->cachesize / t->bt_psize), 288 (int) t->bt_psize, 289 (char *) t->bt_lorder, 290 _bt_pgin, _bt_pgout); 291 292 if (d->d_cache == (char *) NULL) { 293 (void) free((char *) t); 294 return ((BTREE) NULL); 295 } 296 } 297 } 298 299 /* remember if tree was opened for write */ 300 if (((flags & O_WRONLY) == O_WRONLY) 301 || ((flags & O_RDWR) == O_RDWR)) 302 t->bt_flags |= BTF_ISWRITE; 303 304 return ((BTREE) t); 305 } 306 307 /* 308 * BT_GET -- Get an entry from a btree. 309 * 310 * Does a key lookup in the tree to find the specified key, and returns 311 * the key/data pair found. 312 * 313 * Parameters: 314 * tree -- btree in which to do lookup 315 * key -- key to find 316 * data -- pointer to DBT in which to return data 317 * flag -- ignored 318 * 319 * Returns: 320 * RET_SUCCESS, RET_ERROR, or RET_SPECIAL if the key is not 321 * found. If key is not found, nothing is stored in the 322 * return DBT 'data'. 323 * 324 * Side Effects: 325 * None. 326 * 327 * Warnings: 328 * Return data is statically allocated, and will be overwritten 329 * at the next call. 330 */ 331 332 int 333 bt_get(tree, key, data, flag) 334 BTREE tree; 335 DBT *key; 336 DBT *data; 337 u_long flag; 338 { 339 BTREE_P t = (BTREE_P) tree; 340 BTHEADER *h; 341 DATUM *d; 342 BTITEM *item; 343 344 #ifdef lint 345 flag = flag; 346 #endif /* lint */ 347 348 /* lookup */ 349 item = _bt_search(t, key); 350 351 /* clear parent pointer stack */ 352 while (_bt_pop(t) != P_NONE) 353 continue; 354 355 if (item == (BTITEM *) NULL) 356 return (RET_ERROR); 357 358 h = (BTHEADER *) t->bt_curpage; 359 data->size = 0; 360 data->data = (u_char *) NULL; 361 362 /* match? */ 363 if (VALIDITEM(t, item) 364 && _bt_cmp(t, key->data, item->bti_index) == 0) { 365 d = (DATUM *) GETDATUM(h, item->bti_index); 366 return (_bt_buildret(t, d, data, key)); 367 } 368 369 /* nope */ 370 return (RET_SPECIAL); 371 } 372 373 /* 374 * BT_PUT -- Add an entry to a btree. 375 * 376 * The specified (key, data) pair is added to the tree. If the tree 377 * was created for unique keys only, then duplicates will not be 378 * entered. If the requested key exists in the tree, it will be over- 379 * written unless the flags parameter is R_NOOVERWRITE, in which case 380 * the update will not be done. If duplicate keys are permitted in the 381 * tree, duplicates will be inserted and will not overwrite existing 382 * keys. Nodes are split as required. 383 * 384 * Parameters: 385 * tree -- btree in which to put the new entry 386 * key -- key component to add 387 * data -- data corresponding to key 388 * flag -- R_NOOVERWRITE or zero. 389 * 390 * Returns: 391 * RET_SUCCESS, RET_ERROR, or RET_SPECIAL if the 392 * NOOVERWRITE flag was set and the specified key exists 393 * in the database. 394 * 395 * Side Effects: 396 * None. 397 */ 398 399 int 400 bt_put(tree, key, data, flag) 401 BTREE tree; 402 DBT *key; 403 DBT *data; 404 u_long flag; 405 { 406 BTREE_P t = (BTREE_P) tree; 407 BTITEM *item; 408 409 /* look for this key in the tree */ 410 item = _bt_search(t, key); 411 412 /* 413 * If this tree was originally created without R_DUP, then duplicate 414 * keys are not allowed. We need to check this at insertion time. 415 */ 416 417 if (VALIDITEM(t, item) && _bt_cmp(t, key->data, item->bti_index) == 0) { 418 if ((t->bt_flags & BTF_NODUPS) && flag == R_NOOVERWRITE) { 419 if (_bt_delone(t, item->bti_index) == RET_ERROR) { 420 while (_bt_pop(t) != P_NONE) 421 continue; 422 return (RET_ERROR); 423 } 424 } 425 } 426 427 return (_bt_insert(t, item, key, data, flag)); 428 } 429 430 /* 431 * BT_DELETE -- delete a key from the tree. 432 * 433 * Deletes all keys (and their associated data items) matching the 434 * supplied key from the tree. If the flags entry is R_CURSOR, then 435 * the current item in the active scan is deleted. 436 * 437 * Parameters: 438 * tree -- btree from which to delete key 439 * key -- key to delete 440 * flags -- R_CURSOR or zero 441 * 442 * Returns: 443 * RET_SUCCESS, RET_ERROR, or RET_SPECIAL if the specified 444 * key was not in the tree. 445 * 446 * Side Effects: 447 * None. 448 */ 449 450 int 451 bt_delete(tree, key, flags) 452 BTREE tree; 453 DBT *key; 454 u_long flags; 455 { 456 BTREE_P t = (BTREE_P) tree; 457 BTHEADER *h; 458 BTITEM *item; 459 int ndel = 0; 460 461 if (flags == R_CURSOR) 462 return (_bt_crsrdel(t)); 463 464 /* find the first matching key in the tree */ 465 item = _bt_first(t, key); 466 h = t->bt_curpage; 467 468 /* don't need the descent stack for deletes */ 469 while (_bt_pop(t) != P_NONE) 470 continue; 471 472 /* delete all matching keys */ 473 for (;;) { 474 while (VALIDITEM(t, item) 475 && (_bt_cmp(t, key->data, item->bti_index) == 0)) { 476 if (_bt_delone(t, item->bti_index) == RET_ERROR) 477 return (RET_ERROR); 478 ndel++; 479 } 480 481 if (VALIDITEM(t, item) || h->h_nextpg == P_NONE) 482 break; 483 484 /* next page, if necessary */ 485 do { 486 if (_bt_getpage(t, h->h_nextpg) == RET_ERROR) 487 return (RET_ERROR); 488 h = t->bt_curpage; 489 } while (NEXTINDEX(h) == 0 && h->h_nextpg != P_NONE); 490 491 item->bti_pgno = h->h_pgno; 492 item->bti_index = 0; 493 494 if (!VALIDITEM(t, item) 495 || _bt_cmp(t, key->data, item->bti_index) != 0) 496 break; 497 } 498 499 /* flush changes to disk */ 500 if (ISDISK(t)) { 501 if (h->h_flags & F_DIRTY) { 502 if (_bt_write(t, t->bt_curpage, NORELEASE) == RET_ERROR) 503 return (RET_ERROR); 504 } 505 } 506 507 if (ndel == 0) 508 return (RET_SPECIAL); 509 510 return (RET_SUCCESS); 511 } 512 513 /* 514 * BT_SYNC -- sync the btree to disk. 515 * 516 * Parameters: 517 * tree -- btree to sync. 518 * 519 * Returns: 520 * RET_SUCCESS, RET_ERROR. 521 */ 522 523 bt_sync(tree) 524 BTREE tree; 525 { 526 BTREE_P t = (BTREE_P) tree; 527 BTHEADER *h; 528 pgno_t pgno; 529 530 /* if this is an in-memory btree, syncing is a no-op */ 531 if (!ISDISK(t)) 532 return (RET_SUCCESS); 533 534 h = (BTHEADER *) t->bt_curpage; 535 h->h_flags &= ~F_DIRTY; 536 537 if (ISCACHE(t)) { 538 pgno = t->bt_curpage->h_pgno; 539 if (_bt_write(t, h, RELEASE) == RET_ERROR) 540 return(RET_ERROR); 541 if (lrusync(t->bt_s.bt_d.d_cache) < RET_ERROR) 542 return (RET_ERROR); 543 if (_bt_getpage(t, pgno) == RET_ERROR) 544 return (RET_ERROR); 545 } else { 546 if (_bt_write(t, h, NORELEASE) == RET_ERROR) 547 return (RET_ERROR); 548 } 549 550 return (fsync(t->bt_s.bt_d.d_fd)); 551 } 552 553 /* 554 * BT_SEQ -- Sequential scan interface. 555 * 556 * This routine supports sequential scans on the btree. If called with 557 * flags set to R_CURSOR, or if no seq scan has been initialized in the 558 * current tree, we initialize the scan. Otherwise, we advance the scan 559 * and return the next item. 560 * 561 * Scans can be either keyed or non-keyed. Keyed scans basically have 562 * a starting point somewhere in the middle of the tree. Non-keyed 563 * scans start at an endpoint. Also, scans can be backward (descending 564 * order), forward (ascending order), or no movement (keep returning 565 * the same item). 566 * 567 * Flags is checked every time we enter the routine, so the user can 568 * change directions on an active scan if desired. The key argument 569 * is examined only when we initialize the scan, in order to position 570 * it properly. 571 * 572 * Items are returned via the key and data arguments passed in. 573 * 574 * Parameters: 575 * tree -- btree in which to do scan 576 * key -- key, used to position scan on initialization, and 577 * used to return key components to the user. 578 * data -- used to return data components to the user. 579 * flags -- specify R_CURSOR, R_FIRST, R_LAST, R_NEXT, or 580 * R_PREV. 581 * 582 * Returns: 583 * RET_SUCCESS, RET_ERROR, or RET_SPECIAL if no more data 584 * exists in the tree in the specified direction. 585 * 586 * Side Effects: 587 * Changes the btree's notion of the current position in the 588 * scan. 589 * 590 * Warnings: 591 * The key and data items returned are static and will be 592 * overwritten by the next bt_get or bt_seq. 593 */ 594 595 int 596 bt_seq(tree, key, data, flags) 597 BTREE tree; 598 DBT *key; 599 DBT *data; 600 u_long flags; 601 { 602 BTREE_P t = (BTREE_P) tree; 603 BTHEADER *h; 604 DATUM *d; 605 int status; 606 607 /* do we need to initialize the scan? */ 608 if (flags == R_CURSOR || flags == R_LAST || flags == R_FIRST 609 || !(t->bt_flags & BTF_SEQINIT)) { 610 611 /* initialize it */ 612 status = _bt_seqinit(t, key, flags); 613 } else { 614 /* just advance the current scan pointer */ 615 status = _bt_seqadvance(t, flags); 616 } 617 618 key->size = data->size = 0; 619 key->data = data->data = (u_char *) NULL; 620 621 h = t->bt_curpage; 622 623 /* is there a valid item at the current scan location? */ 624 if (status == RET_SPECIAL) { 625 if (flags == R_NEXT) { 626 if (t->bt_cursor.c_index >= NEXTINDEX(h)) { 627 if (NEXTINDEX(h) > 0) 628 t->bt_cursor.c_index = NEXTINDEX(h) - 1; 629 else 630 t->bt_cursor.c_index = 0; 631 } 632 } else { 633 t->bt_cursor.c_index = 0; 634 } 635 return (RET_SPECIAL); 636 } else if (status == RET_ERROR) 637 return (RET_ERROR); 638 639 /* okay, return the data */ 640 d = (DATUM *) GETDATUM(h, t->bt_cursor.c_index); 641 642 return (_bt_buildret(t, d, data, key)); 643 } 644 645 /* 646 * BT_CLOSE -- Close a btree 647 * 648 * Parameters: 649 * tree -- tree to close 650 * 651 * Returns: 652 * RET_SUCCESS, RET_ERROR. 653 * 654 * Side Effects: 655 * Frees space occupied by the tree. 656 */ 657 658 int 659 bt_close(tree) 660 BTREE tree; 661 { 662 BTREE_P t = (BTREE_P) tree; 663 int i; 664 BTHEADER *h; 665 char *cache; 666 struct HTBUCKET *b, *sb; 667 HTABLE ht; 668 int fd; 669 670 if (t->bt_cursor.c_key != (char *) NULL) 671 (void) free(t->bt_cursor.c_key); 672 673 if (!ISDISK(t)) { 674 /* in-memory tree, release hash table memory */ 675 ht = t->bt_s.bt_ht; 676 for (i = 0; i < HTSIZE; i++) { 677 if ((b = ht[i]) == (struct HTBUCKET *) NULL) 678 break; 679 do { 680 sb = b; 681 (void) free((char *) b->ht_page); 682 b = b->ht_next; 683 (void) free((char *) sb); 684 } while (b != (struct HTBUCKET *) NULL); 685 } 686 (void) free ((char *) ht); 687 (void) free ((char *) t); 688 return (RET_SUCCESS); 689 } 690 691 if ((t->bt_flags & BTF_ISWRITE) && !(t->bt_flags & BTF_METAOK)) { 692 if (_bt_wrtmeta(t) == RET_ERROR) 693 return (RET_ERROR); 694 } 695 696 if (t->bt_curpage != (BTHEADER *) NULL) { 697 h = t->bt_curpage; 698 if (h->h_flags & F_DIRTY) { 699 if (_bt_write(t, h, RELEASE) == RET_ERROR) 700 return (RET_ERROR); 701 } else { 702 if (_bt_release(t, h) == RET_ERROR) 703 return (RET_ERROR); 704 } 705 706 /* flush and free the cache, if there is one */ 707 if (ISCACHE(t)) { 708 cache = t->bt_s.bt_d.d_cache; 709 if (lrusync(cache) == RET_ERROR) 710 return (RET_ERROR); 711 lrufree(cache); 712 } 713 (void) free ((char *) h); 714 } 715 716 fd = t->bt_s.bt_d.d_fd; 717 (void) free ((char *) t); 718 return (close(fd)); 719 } 720