1 /*- 2 * Copyright (c) 1990 The Regents of the University of California. 3 * All rights reserved. 4 * 5 * This code is derived from software contributed to Berkeley by 6 * Mike Olson. 7 * 8 * %sccs.include.redist.c% 9 */ 10 11 #if defined(LIBC_SCCS) && !defined(lint) 12 static char sccsid[] = "@(#)bt_open.c 5.8 (Berkeley) 04/02/91"; 13 #endif /* LIBC_SCCS and not lint */ 14 15 /* 16 * btree.c -- implementation of btree access method for 4.4BSD. 17 * 18 * The design here is based on that of the btree access method used 19 * in the Postgres database system at UC Berkeley. The implementation 20 * is wholly independent of the Postgres code. 21 * 22 * This implementation supports btrees on disk (supply a filename) or 23 * in memory (don't). Public interfaces defined here are: 24 * 25 * btree_open() -- wrapper; returns a filled DB struct for 26 * a btree. 27 * 28 * bt_open() -- open a new or existing btree. 29 * bt_get() -- fetch data from a tree by key. 30 * bt_seq() -- do a sequential scan on a tree. 31 * bt_put() -- add data to a tree by key. 32 * bt_delete() -- remove data from a tree by key. 33 * bt_close() -- close a btree. 34 * bt_sync() -- force btree pages to disk (disk trees only). 35 */ 36 37 #include <sys/param.h> 38 #include <sys/stat.h> 39 #include <signal.h> 40 #include <errno.h> 41 #include <fcntl.h> 42 #include <db.h> 43 #include <stdlib.h> 44 #include <string.h> 45 #include <unistd.h> 46 #include "btree.h" 47 48 BTREEINFO _DefaultBTInfo = { 49 0, /* flags */ 50 0, /* cachesize */ 51 0, /* psize */ 52 strcmp, /* compare */ 53 0 54 }; 55 56 /* 57 * BTREE_OPEN -- Wrapper routine to open a btree. 58 * 59 * Creates and fills a DB struct, and calls the routine that actually 60 * opens the btree. 61 * 62 * Parameters: 63 * f: filename to open 64 * flags: flag bits passed to open 65 * mode: permission bits, used if O_CREAT specified 66 * b: BTREEINFO pointer 67 * 68 * Returns: 69 * Filled-in DBT on success; NULL on failure, with errno 70 * set as appropriate. 71 * 72 * Side Effects: 73 * Allocates memory for the DB struct. 74 */ 75 76 DB * 77 btree_open(f, flags, mode, b) 78 const char *f; 79 int flags; 80 int mode; 81 const BTREEINFO *b; 82 { 83 DB *db; 84 BTREE t; 85 86 if ((db = (DB *) malloc((unsigned) sizeof(DB))) == (DB *) NULL) 87 return ((DB *) NULL); 88 89 if ((t = bt_open(f, flags, mode, b)) == (BTREE) NULL) { 90 (void) free ((char *) db); 91 return ((DB *) NULL); 92 } 93 94 db->internal = (char *) t; 95 db->close = bt_close; 96 db->del = bt_delete; 97 db->get = bt_get; 98 db->put = bt_put; 99 db->seq = bt_seq; 100 db->sync = bt_sync; 101 db->type = DB_BTREE; 102 103 return (db); 104 } 105 106 /* 107 * BT_OPEN -- Open a btree. 108 * 109 * This routine creates the correct kind (disk or in-memory) of 110 * btree and initializes it as required. 111 * 112 * Parameters: 113 * f -- name of btree (NULL for in-memory btrees) 114 * flags -- flags passed to open() 115 * mode -- mode passed to open () 116 * b -- BTREEINFO structure, describing btree 117 * 118 * Returns: 119 * (Opaque) pointer to the btree. On failure, returns NULL 120 * with errno set as appropriate. 121 * 122 * Side Effects: 123 * Allocates memory, opens files. 124 */ 125 126 BTREE 127 bt_open(f, flags, mode, b) 128 char *f; 129 int flags; 130 int mode; 131 BTREEINFO *b; 132 { 133 BTREE_P t; 134 HTABLE ht; 135 int nbytes; 136 int fd; 137 CURSOR *c; 138 BTMETA m; 139 struct stat buf; 140 141 /* use the default info if none was provided */ 142 if (b == (BTREEINFO *) NULL) 143 b = &_DefaultBTInfo; 144 145 if ((t = (BTREE_P) malloc((unsigned) sizeof *t)) == (BTREE_P) NULL) 146 return ((BTREE) NULL); 147 148 if (b->compare) 149 t->bt_compare = b->compare; 150 else 151 t->bt_compare = strcmp; 152 153 t->bt_fname = f; 154 t->bt_curpage = (BTHEADER *) NULL; 155 t->bt_free = P_NONE; 156 c = &(t->bt_cursor); 157 c->c_pgno = P_NONE; 158 c->c_index = 0; 159 c->c_flags = (u_char) NULL; 160 c->c_key = (char *) NULL; 161 t->bt_stack = (BTSTACK *) NULL; 162 t->bt_flags = 0; 163 164 /* 165 * If no file name was supplied, this is an in-memory btree. 166 * Otherwise, it's a disk-based btree. 167 */ 168 if (f == (char *) NULL) { 169 /* in memory */ 170 if ((t->bt_psize = b->psize) < MINPSIZE) { 171 if (t->bt_psize != 0) { 172 (void) free ((char *) t); 173 errno = EINVAL; 174 return ((BTREE) NULL); 175 } 176 t->bt_psize = getpagesize(); 177 } 178 179 nbytes = HTSIZE * sizeof(HTBUCKET *); 180 if ((ht = (HTABLE) malloc((unsigned) nbytes)) 181 == (HTABLE) NULL) { 182 (void) free((char *) t); 183 return ((BTREE) NULL); 184 } 185 (void) bzero((char *) ht, nbytes); 186 t->bt_s.bt_ht = ht; 187 t->bt_npages = 0; 188 t->bt_lorder = BYTE_ORDER; 189 if (!(b->flags & R_DUP)) 190 t->bt_flags |= BTF_NODUPS; 191 } else { 192 /* on disk */ 193 if ((fd = open(f, O_RDONLY, 0)) < 0) { 194 /* doesn't exist yet, be sure page is big enough */ 195 if ((t->bt_psize = b->psize) < sizeof(BTHEADER) 196 && b->psize != 0) { 197 (void) free((char *) t); 198 errno = EINVAL; 199 return ((BTREE) NULL); 200 } 201 if (b->lorder == 0) 202 b->lorder = BYTE_ORDER; 203 204 if (b->lorder != BIG_ENDIAN 205 && b->lorder != LITTLE_ENDIAN) { 206 (void) free((char *) t); 207 errno = EINVAL; 208 return ((BTREE) NULL); 209 } 210 t->bt_lorder = b->lorder; 211 if (!(b->flags & R_DUP)) 212 t->bt_flags |= BTF_NODUPS; 213 } else { 214 /* exists, get page size from file */ 215 if (read(fd, &m, sizeof(m)) < sizeof(m)) { 216 (void) close(fd); 217 (void) free((char *) t); 218 errno = EINVAL; 219 return ((BTREE) NULL); 220 } 221 222 /* lorder always stored in host-independent format */ 223 NTOHL(m.m_lorder); 224 if (m.m_lorder != BIG_ENDIAN 225 && m.m_lorder != LITTLE_ENDIAN) { 226 (void) free((char *) t); 227 errno = EINVAL; 228 return ((BTREE) NULL); 229 } 230 t->bt_lorder = m.m_lorder; 231 232 if (t->bt_lorder != BYTE_ORDER) { 233 BLSWAP(m.m_magic); 234 BLSWAP(m.m_version); 235 BLSWAP(m.m_psize); 236 BLSWAP(m.m_free); 237 BLSWAP(m.m_flags); 238 } 239 240 if (m.m_magic != BTREEMAGIC 241 || m.m_version != BTREEVERSION 242 || m.m_psize < MINPSIZE) { 243 (void) close(fd); 244 (void) free((char *) t); 245 #ifndef EFTYPE 246 #define EFTYPE -100 247 #endif 248 errno = EFTYPE; 249 return ((BTREE) NULL); 250 } 251 t->bt_psize = m.m_psize; 252 t->bt_free = m.m_free; 253 t->bt_flags |= (m.m_flags & BTF_NODUPS) | BTF_METAOK; 254 (void) close(fd); 255 } 256 257 /* now open the file the way the user wanted it */ 258 if ((t->bt_s.bt_d.d_fd = open(f, flags, mode)) < 0) { 259 (void) free ((char *) t); 260 return ((BTREE) NULL); 261 } 262 263 /* access method files are always close-on-exec */ 264 if (fcntl(t->bt_s.bt_d.d_fd, F_SETFL, 1) == -1) { 265 (void) close(t->bt_s.bt_d.d_fd); 266 (void) free ((char *) t); 267 return ((BTREE) NULL); 268 } 269 270 /* get number of pages, page size if necessary */ 271 (void) fstat(t->bt_s.bt_d.d_fd, &buf); 272 if (t->bt_psize == 0) 273 t->bt_psize = buf.st_blksize; 274 t->bt_npages = (pgno_t) (buf.st_size / t->bt_psize); 275 276 /* page zero is metadata, doesn't count */ 277 if (t->bt_npages > 0) 278 --(t->bt_npages); 279 280 if (b->cachesize == 0) 281 b->cachesize = DEFCACHE; 282 283 /* get an lru buffer cache, if the user asked for one */ 284 if ((b->cachesize / t->bt_psize) > 0) { 285 BTDISK *d = &(t->bt_s.bt_d); 286 287 d->d_cache = lruinit(d->d_fd, 288 (int) (b->cachesize / t->bt_psize), 289 (int) t->bt_psize, 290 (char *) t->bt_lorder, 291 _bt_pgin, _bt_pgout); 292 293 if (d->d_cache == (char *) NULL) { 294 (void) free((char *) t); 295 return ((BTREE) NULL); 296 } 297 } 298 } 299 300 /* remember if tree was opened for write */ 301 if (((flags & O_WRONLY) == O_WRONLY) 302 || ((flags & O_RDWR) == O_RDWR)) 303 t->bt_flags |= BTF_ISWRITE; 304 305 return ((BTREE) t); 306 } 307 308 /* 309 * BT_GET -- Get an entry from a btree. 310 * 311 * Does a key lookup in the tree to find the specified key, and returns 312 * the key/data pair found. 313 * 314 * Parameters: 315 * tree -- btree in which to do lookup 316 * key -- key to find 317 * data -- pointer to DBT in which to return data 318 * flag -- ignored 319 * 320 * Returns: 321 * RET_SUCCESS, RET_ERROR, or RET_SPECIAL if the key is not 322 * found. If key is not found, nothing is stored in the 323 * return DBT 'data'. 324 * 325 * Side Effects: 326 * None. 327 * 328 * Warnings: 329 * Return data is statically allocated, and will be overwritten 330 * at the next call. 331 */ 332 333 int 334 bt_get(tree, key, data, flag) 335 BTREE tree; 336 DBT *key; 337 DBT *data; 338 u_long flag; 339 { 340 BTREE_P t = (BTREE_P) tree; 341 BTHEADER *h; 342 DATUM *d; 343 BTITEM *item; 344 345 #ifdef lint 346 flag = flag; 347 #endif /* lint */ 348 349 /* lookup */ 350 item = _bt_search(t, key); 351 352 /* clear parent pointer stack */ 353 while (_bt_pop(t) != P_NONE) 354 continue; 355 356 if (item == (BTITEM *) NULL) 357 return (RET_ERROR); 358 359 h = (BTHEADER *) t->bt_curpage; 360 data->size = 0; 361 data->data = (u_char *) NULL; 362 363 /* match? */ 364 if (VALIDITEM(t, item) 365 && _bt_cmp(t, key->data, item->bti_index) == 0) { 366 d = (DATUM *) GETDATUM(h, item->bti_index); 367 return (_bt_buildret(t, d, data, key)); 368 } 369 370 /* nope */ 371 return (RET_SPECIAL); 372 } 373 374 /* 375 * BT_PUT -- Add an entry to a btree. 376 * 377 * The specified (key, data) pair is added to the tree. If the tree 378 * was created for unique keys only, then duplicates will not be 379 * entered. If the requested key exists in the tree, it will be over- 380 * written unless the flags parameter is R_NOOVERWRITE, in which case 381 * the update will not be done. If duplicate keys are permitted in the 382 * tree, duplicates will be inserted and will not overwrite existing 383 * keys. Nodes are split as required. 384 * 385 * Parameters: 386 * tree -- btree in which to put the new entry 387 * key -- key component to add 388 * data -- data corresponding to key 389 * flag -- R_NOOVERWRITE or zero. 390 * 391 * Returns: 392 * RET_SUCCESS, RET_ERROR, or RET_SPECIAL if the 393 * NOOVERWRITE flag was set and the specified key exists 394 * in the database. 395 * 396 * Side Effects: 397 * None. 398 */ 399 400 int 401 bt_put(tree, key, data, flag) 402 BTREE tree; 403 DBT *key; 404 DBT *data; 405 u_long flag; 406 { 407 BTREE_P t = (BTREE_P) tree; 408 BTITEM *item; 409 410 /* look for this key in the tree */ 411 item = _bt_search(t, key); 412 413 /* 414 * If this tree was originally created without R_DUP, then duplicate 415 * keys are not allowed. We need to check this at insertion time. 416 */ 417 418 if (VALIDITEM(t, item) && _bt_cmp(t, key->data, item->bti_index) == 0) { 419 if ((t->bt_flags & BTF_NODUPS) && flag == R_NOOVERWRITE) { 420 if (_bt_delone(t, item->bti_index) == RET_ERROR) { 421 while (_bt_pop(t) != P_NONE) 422 continue; 423 return (RET_ERROR); 424 } 425 } 426 } 427 428 return (_bt_insert(t, item, key, data, flag)); 429 } 430 431 /* 432 * BT_DELETE -- delete a key from the tree. 433 * 434 * Deletes all keys (and their associated data items) matching the 435 * supplied key from the tree. If the flags entry is R_CURSOR, then 436 * the current item in the active scan is deleted. 437 * 438 * Parameters: 439 * tree -- btree from which to delete key 440 * key -- key to delete 441 * flags -- R_CURSOR or zero 442 * 443 * Returns: 444 * RET_SUCCESS, RET_ERROR, or RET_SPECIAL if the specified 445 * key was not in the tree. 446 * 447 * Side Effects: 448 * None. 449 */ 450 451 int 452 bt_delete(tree, key, flags) 453 BTREE tree; 454 DBT *key; 455 u_long flags; 456 { 457 BTREE_P t = (BTREE_P) tree; 458 BTHEADER *h; 459 BTITEM *item; 460 int ndel = 0; 461 462 if (flags == R_CURSOR) 463 return (_bt_crsrdel(t)); 464 465 /* find the first matching key in the tree */ 466 item = _bt_first(t, key); 467 h = t->bt_curpage; 468 469 /* don't need the descent stack for deletes */ 470 while (_bt_pop(t) != P_NONE) 471 continue; 472 473 /* delete all matching keys */ 474 for (;;) { 475 while (VALIDITEM(t, item) 476 && (_bt_cmp(t, key->data, item->bti_index) == 0)) { 477 if (_bt_delone(t, item->bti_index) == RET_ERROR) 478 return (RET_ERROR); 479 ndel++; 480 } 481 482 if (VALIDITEM(t, item) || h->h_nextpg == P_NONE) 483 break; 484 485 /* next page, if necessary */ 486 do { 487 if (_bt_getpage(t, h->h_nextpg) == RET_ERROR) 488 return (RET_ERROR); 489 h = t->bt_curpage; 490 } while (NEXTINDEX(h) == 0 && h->h_nextpg != P_NONE); 491 492 item->bti_pgno = h->h_pgno; 493 item->bti_index = 0; 494 495 if (!VALIDITEM(t, item) 496 || _bt_cmp(t, key->data, item->bti_index) != 0) 497 break; 498 } 499 500 /* flush changes to disk */ 501 if (ISDISK(t)) { 502 if (h->h_flags & F_DIRTY) { 503 if (_bt_write(t, t->bt_curpage, NORELEASE) == RET_ERROR) 504 return (RET_ERROR); 505 } 506 } 507 508 if (ndel == 0) 509 return (RET_SPECIAL); 510 511 return (RET_SUCCESS); 512 } 513 514 /* 515 * BT_SYNC -- sync the btree to disk. 516 * 517 * Parameters: 518 * tree -- btree to sync. 519 * 520 * Returns: 521 * RET_SUCCESS, RET_ERROR. 522 */ 523 524 bt_sync(tree) 525 BTREE tree; 526 { 527 BTREE_P t = (BTREE_P) tree; 528 BTHEADER *h; 529 pgno_t pgno; 530 531 /* if this is an in-memory btree, syncing is a no-op */ 532 if (!ISDISK(t)) 533 return (RET_SUCCESS); 534 535 h = (BTHEADER *) t->bt_curpage; 536 h->h_flags &= ~F_DIRTY; 537 538 if (ISCACHE(t)) { 539 pgno = t->bt_curpage->h_pgno; 540 if (_bt_write(t, h, RELEASE) == RET_ERROR) 541 return(RET_ERROR); 542 if (lrusync(t->bt_s.bt_d.d_cache) < RET_ERROR) 543 return (RET_ERROR); 544 if (_bt_getpage(t, pgno) == RET_ERROR) 545 return (RET_ERROR); 546 } else { 547 if (_bt_write(t, h, NORELEASE) == RET_ERROR) 548 return (RET_ERROR); 549 } 550 551 return (fsync(t->bt_s.bt_d.d_fd)); 552 } 553 554 /* 555 * BT_SEQ -- Sequential scan interface. 556 * 557 * This routine supports sequential scans on the btree. If called with 558 * flags set to R_CURSOR, or if no seq scan has been initialized in the 559 * current tree, we initialize the scan. Otherwise, we advance the scan 560 * and return the next item. 561 * 562 * Scans can be either keyed or non-keyed. Keyed scans basically have 563 * a starting point somewhere in the middle of the tree. Non-keyed 564 * scans start at an endpoint. Also, scans can be backward (descending 565 * order), forward (ascending order), or no movement (keep returning 566 * the same item). 567 * 568 * Flags is checked every time we enter the routine, so the user can 569 * change directions on an active scan if desired. The key argument 570 * is examined only when we initialize the scan, in order to position 571 * it properly. 572 * 573 * Items are returned via the key and data arguments passed in. 574 * 575 * Parameters: 576 * tree -- btree in which to do scan 577 * key -- key, used to position scan on initialization, and 578 * used to return key components to the user. 579 * data -- used to return data components to the user. 580 * flags -- specify R_CURSOR, R_FIRST, R_LAST, R_NEXT, or 581 * R_PREV. 582 * 583 * Returns: 584 * RET_SUCCESS, RET_ERROR, or RET_SPECIAL if no more data 585 * exists in the tree in the specified direction. 586 * 587 * Side Effects: 588 * Changes the btree's notion of the current position in the 589 * scan. 590 * 591 * Warnings: 592 * The key and data items returned are static and will be 593 * overwritten by the next bt_get or bt_seq. 594 */ 595 596 int 597 bt_seq(tree, key, data, flags) 598 BTREE tree; 599 DBT *key; 600 DBT *data; 601 u_long flags; 602 { 603 BTREE_P t = (BTREE_P) tree; 604 BTHEADER *h; 605 DATUM *d; 606 int status; 607 608 /* do we need to initialize the scan? */ 609 if (flags == R_CURSOR || flags == R_LAST || flags == R_FIRST 610 || !(t->bt_flags & BTF_SEQINIT)) { 611 612 /* initialize it */ 613 status = _bt_seqinit(t, key, flags); 614 } else { 615 /* just advance the current scan pointer */ 616 status = _bt_seqadvance(t, flags); 617 } 618 619 key->size = data->size = 0; 620 key->data = data->data = (u_char *) NULL; 621 622 h = t->bt_curpage; 623 624 /* is there a valid item at the current scan location? */ 625 if (status == RET_SPECIAL) { 626 if (flags == R_NEXT) { 627 if (t->bt_cursor.c_index >= NEXTINDEX(h)) { 628 if (NEXTINDEX(h) > 0) 629 t->bt_cursor.c_index = NEXTINDEX(h) - 1; 630 else 631 t->bt_cursor.c_index = 0; 632 } 633 } else { 634 t->bt_cursor.c_index = 0; 635 } 636 return (RET_SPECIAL); 637 } else if (status == RET_ERROR) 638 return (RET_ERROR); 639 640 /* okay, return the data */ 641 d = (DATUM *) GETDATUM(h, t->bt_cursor.c_index); 642 643 return (_bt_buildret(t, d, data, key)); 644 } 645 646 /* 647 * BT_CLOSE -- Close a btree 648 * 649 * Parameters: 650 * tree -- tree to close 651 * 652 * Returns: 653 * RET_SUCCESS, RET_ERROR. 654 * 655 * Side Effects: 656 * Frees space occupied by the tree. 657 */ 658 659 int 660 bt_close(tree) 661 BTREE tree; 662 { 663 BTREE_P t = (BTREE_P) tree; 664 int i; 665 BTHEADER *h; 666 char *cache; 667 struct HTBUCKET *b, *sb; 668 HTABLE ht; 669 int fd; 670 671 if (t->bt_cursor.c_key != (char *) NULL) 672 (void) free(t->bt_cursor.c_key); 673 674 if (!ISDISK(t)) { 675 /* in-memory tree, release hash table memory */ 676 ht = t->bt_s.bt_ht; 677 for (i = 0; i < HTSIZE; i++) { 678 if ((b = ht[i]) == (struct HTBUCKET *) NULL) 679 break; 680 do { 681 sb = b; 682 (void) free((char *) b->ht_page); 683 b = b->ht_next; 684 (void) free((char *) sb); 685 } while (b != (struct HTBUCKET *) NULL); 686 } 687 (void) free ((char *) ht); 688 (void) free ((char *) t); 689 return (RET_SUCCESS); 690 } 691 692 if ((t->bt_flags & BTF_ISWRITE) && !(t->bt_flags & BTF_METAOK)) { 693 if (_bt_wrtmeta(t) == RET_ERROR) 694 return (RET_ERROR); 695 } 696 697 if (t->bt_curpage != (BTHEADER *) NULL) { 698 h = t->bt_curpage; 699 if (h->h_flags & F_DIRTY) { 700 if (_bt_write(t, h, RELEASE) == RET_ERROR) 701 return (RET_ERROR); 702 } else { 703 if (_bt_release(t, h) == RET_ERROR) 704 return (RET_ERROR); 705 } 706 707 /* flush and free the cache, if there is one */ 708 if (ISCACHE(t)) { 709 cache = t->bt_s.bt_d.d_cache; 710 if (lrusync(cache) == RET_ERROR) 711 return (RET_ERROR); 712 lrufree(cache); 713 } 714 (void) free ((char *) h); 715 } 716 717 fd = t->bt_s.bt_d.d_fd; 718 (void) free ((char *) t); 719 return (close(fd)); 720 } 721