xref: /original-bsd/lib/libc/db/btree/bt_open.c (revision 6884d44a)
1 /*-
2  * Copyright (c) 1990 The Regents of the University of California.
3  * All rights reserved.
4  *
5  * This code is derived from software contributed to Berkeley by
6  * Mike Olson.
7  *
8  * %sccs.include.redist.c%
9  */
10 
11 #if defined(LIBC_SCCS) && !defined(lint)
12 static char sccsid[] = "@(#)bt_open.c	5.8 (Berkeley) 04/02/91";
13 #endif /* LIBC_SCCS and not lint */
14 
15 /*
16  *  btree.c -- implementation of btree access method for 4.4BSD.
17  *
18  *	The design here is based on that of the btree access method used
19  *	in the Postgres database system at UC Berkeley.  The implementation
20  *	is wholly independent of the Postgres code.
21  *
22  *	This implementation supports btrees on disk (supply a filename) or
23  *	in memory (don't).  Public interfaces defined here are:
24  *
25  *		btree_open()	-- wrapper; returns a filled DB struct for
26  *				   a btree.
27  *
28  *		bt_open()	-- open a new or existing btree.
29  *		bt_get()	-- fetch data from a tree by key.
30  *		bt_seq()	-- do a sequential scan on a tree.
31  *		bt_put()	-- add data to a tree by key.
32  *		bt_delete()	-- remove data from a tree by key.
33  *		bt_close()	-- close a btree.
34  *		bt_sync()	-- force btree pages to disk (disk trees only).
35  */
36 
37 #include <sys/param.h>
38 #include <sys/stat.h>
39 #include <signal.h>
40 #include <errno.h>
41 #include <fcntl.h>
42 #include <db.h>
43 #include <stdlib.h>
44 #include <string.h>
45 #include <unistd.h>
46 #include "btree.h"
47 
48 BTREEINFO _DefaultBTInfo = {
49 	0,	/* flags */
50 	0,	/* cachesize */
51 	0,	/* psize */
52 	strcmp,	/* compare */
53 	0
54 };
55 
56 /*
57  *  BTREE_OPEN -- Wrapper routine to open a btree.
58  *
59  *	Creates and fills a DB struct, and calls the routine that actually
60  *	opens the btree.
61  *
62  *	Parameters:
63  *		f:  filename to open
64  *		flags:  flag bits passed to open
65  *		mode:  permission bits, used if O_CREAT specified
66  *		b:  BTREEINFO pointer
67  *
68  *	Returns:
69  *		Filled-in DBT on success; NULL on failure, with errno
70  *		set as appropriate.
71  *
72  *	Side Effects:
73  *		Allocates memory for the DB struct.
74  */
75 
76 DB *
77 btree_open(f, flags, mode, b)
78 	const char *f;
79 	int flags;
80 	int mode;
81 	const BTREEINFO *b;
82 {
83 	DB *db;
84 	BTREE t;
85 
86 	if ((db = (DB *) malloc((unsigned) sizeof(DB))) == (DB *) NULL)
87 		return ((DB *) NULL);
88 
89 	if ((t = bt_open(f, flags, mode, b)) == (BTREE) NULL) {
90 		(void) free ((char *) db);
91 		return ((DB *) NULL);
92 	}
93 
94 	db->internal = (char *) t;
95 	db->close = bt_close;
96 	db->del = bt_delete;
97 	db->get = bt_get;
98 	db->put = bt_put;
99 	db->seq = bt_seq;
100 	db->sync = bt_sync;
101 	db->type = DB_BTREE;
102 
103 	return (db);
104 }
105 
106 /*
107  *  BT_OPEN -- Open a btree.
108  *
109  *	This routine creates the correct kind (disk or in-memory) of
110  *	btree and initializes it as required.
111  *
112  *	Parameters:
113  *		f -- name of btree (NULL for in-memory btrees)
114  *		flags -- flags passed to open()
115  *		mode -- mode passed to open ()
116  *		b -- BTREEINFO structure, describing btree
117  *
118  *	Returns:
119  *		(Opaque) pointer to the btree.  On failure, returns NULL
120  *		with errno set as appropriate.
121  *
122  *	Side Effects:
123  *		Allocates memory, opens files.
124  */
125 
126 BTREE
127 bt_open(f, flags, mode, b)
128 	char *f;
129 	int flags;
130 	int mode;
131 	BTREEINFO *b;
132 {
133 	BTREE_P t;
134 	HTABLE ht;
135 	int nbytes;
136 	int fd;
137 	CURSOR *c;
138 	BTMETA m;
139 	struct stat buf;
140 
141 	/* use the default info if none was provided */
142 	if (b == (BTREEINFO *) NULL)
143 		b = &_DefaultBTInfo;
144 
145 	if ((t = (BTREE_P) malloc((unsigned) sizeof *t)) == (BTREE_P) NULL)
146 		return ((BTREE) NULL);
147 
148 	if (b->compare)
149 		t->bt_compare = b->compare;
150 	else
151 		t->bt_compare = strcmp;
152 
153 	t->bt_fname = f;
154 	t->bt_curpage = (BTHEADER *) NULL;
155 	t->bt_free = P_NONE;
156 	c = &(t->bt_cursor);
157 	c->c_pgno = P_NONE;
158 	c->c_index = 0;
159 	c->c_flags = (u_char) NULL;
160 	c->c_key = (char *) NULL;
161 	t->bt_stack = (BTSTACK *) NULL;
162 	t->bt_flags = 0;
163 
164 	/*
165 	 *  If no file name was supplied, this is an in-memory btree.
166 	 *  Otherwise, it's a disk-based btree.
167 	 */
168 	if (f == (char *) NULL) {
169 		/* in memory */
170 		if ((t->bt_psize = b->psize) < MINPSIZE) {
171 			if (t->bt_psize != 0) {
172 				(void) free ((char *) t);
173 				errno = EINVAL;
174 				return ((BTREE) NULL);
175 			}
176 			t->bt_psize = getpagesize();
177 		}
178 
179 		nbytes = HTSIZE * sizeof(HTBUCKET *);
180 		if ((ht = (HTABLE) malloc((unsigned) nbytes))
181 		    == (HTABLE) NULL) {
182 			(void) free((char *) t);
183 			return ((BTREE) NULL);
184 		}
185 		(void) bzero((char *) ht, nbytes);
186 		t->bt_s.bt_ht = ht;
187 		t->bt_npages = 0;
188 		t->bt_lorder = BYTE_ORDER;
189 		if (!(b->flags & R_DUP))
190 			t->bt_flags |= BTF_NODUPS;
191 	} else {
192 		/* on disk */
193 		if ((fd = open(f, O_RDONLY, 0)) < 0) {
194 			/* doesn't exist yet, be sure page is big enough */
195 			if ((t->bt_psize = b->psize) < sizeof(BTHEADER)
196 			    && b->psize != 0) {
197 				(void) free((char *) t);
198 				errno = EINVAL;
199 				return ((BTREE) NULL);
200 			}
201 			if (b->lorder == 0)
202 				b->lorder = BYTE_ORDER;
203 
204 			if (b->lorder != BIG_ENDIAN
205 			    && b->lorder != LITTLE_ENDIAN) {
206 				(void) free((char *) t);
207 				errno = EINVAL;
208 				return ((BTREE) NULL);
209 			}
210 			t->bt_lorder = b->lorder;
211 			if (!(b->flags & R_DUP))
212 				t->bt_flags |= BTF_NODUPS;
213 		} else {
214 			/* exists, get page size from file */
215 			if (read(fd, &m, sizeof(m)) < sizeof(m)) {
216 				(void) close(fd);
217 				(void) free((char *) t);
218 				errno = EINVAL;
219 				return ((BTREE) NULL);
220 			}
221 
222 			/* lorder always stored in host-independent format */
223 			NTOHL(m.m_lorder);
224 			if (m.m_lorder != BIG_ENDIAN
225 			    && m.m_lorder != LITTLE_ENDIAN) {
226 				(void) free((char *) t);
227 				errno = EINVAL;
228 				return ((BTREE) NULL);
229 			}
230 			t->bt_lorder = m.m_lorder;
231 
232 			if (t->bt_lorder != BYTE_ORDER) {
233 				BLSWAP(m.m_magic);
234 				BLSWAP(m.m_version);
235 				BLSWAP(m.m_psize);
236 				BLSWAP(m.m_free);
237 				BLSWAP(m.m_flags);
238 			}
239 
240 			if (m.m_magic != BTREEMAGIC
241 			    || m.m_version != BTREEVERSION
242 			    || m.m_psize < MINPSIZE) {
243 				(void) close(fd);
244 				(void) free((char *) t);
245 #ifndef EFTYPE
246 #define EFTYPE	-100
247 #endif
248 				errno = EFTYPE;
249 				return ((BTREE) NULL);
250 			}
251 			t->bt_psize = m.m_psize;
252 			t->bt_free = m.m_free;
253 			t->bt_flags |= (m.m_flags & BTF_NODUPS) | BTF_METAOK;
254 			(void) close(fd);
255 		}
256 
257 		/* now open the file the way the user wanted it */
258 		if ((t->bt_s.bt_d.d_fd = open(f, flags, mode)) < 0) {
259 			(void) free ((char *) t);
260 			return ((BTREE) NULL);
261 		}
262 
263 		/* access method files are always close-on-exec */
264 		if (fcntl(t->bt_s.bt_d.d_fd, F_SETFL, 1) == -1) {
265 			(void) close(t->bt_s.bt_d.d_fd);
266 			(void) free ((char *) t);
267 			return ((BTREE) NULL);
268 		}
269 
270 		/* get number of pages, page size if necessary */
271 		(void) fstat(t->bt_s.bt_d.d_fd, &buf);
272 		if (t->bt_psize == 0)
273 			t->bt_psize = buf.st_blksize;
274 		t->bt_npages = (pgno_t) (buf.st_size / t->bt_psize);
275 
276 		/* page zero is metadata, doesn't count */
277 		if (t->bt_npages > 0)
278 			--(t->bt_npages);
279 
280 		if (b->cachesize == 0)
281 			b->cachesize = DEFCACHE;
282 
283 		/* get an lru buffer cache, if the user asked for one */
284 		if ((b->cachesize / t->bt_psize) > 0) {
285 			BTDISK *d = &(t->bt_s.bt_d);
286 
287 			d->d_cache = lruinit(d->d_fd,
288 					     (int) (b->cachesize / t->bt_psize),
289 					     (int) t->bt_psize,
290 					     (char *) t->bt_lorder,
291 					     _bt_pgin, _bt_pgout);
292 
293 			if (d->d_cache == (char *) NULL) {
294 				(void) free((char *) t);
295 				return ((BTREE) NULL);
296 			}
297 		}
298 	}
299 
300 	/* remember if tree was opened for write */
301 	if (((flags & O_WRONLY) == O_WRONLY)
302 	    || ((flags & O_RDWR) == O_RDWR))
303 		t->bt_flags |= BTF_ISWRITE;
304 
305 	return ((BTREE) t);
306 }
307 
308 /*
309  *  BT_GET -- Get an entry from a btree.
310  *
311  *	Does a key lookup in the tree to find the specified key, and returns
312  *	the key/data pair found.
313  *
314  *	Parameters:
315  *		tree -- btree in which to do lookup
316  *		key -- key to find
317  *		data -- pointer to DBT in which to return data
318  *		flag -- ignored
319  *
320  *	Returns:
321  *		RET_SUCCESS, RET_ERROR, or RET_SPECIAL if the key is not
322  *		found.  If key is not found, nothing is stored in the
323  *		return DBT 'data'.
324  *
325  *	Side Effects:
326  *		None.
327  *
328  *	Warnings:
329  *		Return data is statically allocated, and will be overwritten
330  *		at the next call.
331  */
332 
333 int
334 bt_get(tree, key, data, flag)
335 	BTREE tree;
336 	DBT *key;
337 	DBT *data;
338 	u_long flag;
339 {
340 	BTREE_P t = (BTREE_P) tree;
341 	BTHEADER *h;
342 	DATUM *d;
343 	BTITEM *item;
344 
345 #ifdef lint
346 	flag = flag;
347 #endif /* lint */
348 
349 	/* lookup */
350 	item = _bt_search(t, key);
351 
352 	/* clear parent pointer stack */
353 	while (_bt_pop(t) != P_NONE)
354 		continue;
355 
356 	if (item == (BTITEM *) NULL)
357 		return (RET_ERROR);
358 
359 	h = (BTHEADER *) t->bt_curpage;
360 	data->size = 0;
361 	data->data = (u_char *) NULL;
362 
363 	/* match? */
364 	if (VALIDITEM(t, item)
365 	    && _bt_cmp(t, key->data, item->bti_index) == 0) {
366 		d = (DATUM *) GETDATUM(h, item->bti_index);
367 		return (_bt_buildret(t, d, data, key));
368 	}
369 
370 	/* nope */
371 	return (RET_SPECIAL);
372 }
373 
374 /*
375  *  BT_PUT -- Add an entry to a btree.
376  *
377  *	The specified (key, data) pair is added to the tree.  If the tree
378  *	was created for unique keys only, then duplicates will not be
379  *	entered.  If the requested key exists in the tree, it will be over-
380  *	written unless the flags parameter is R_NOOVERWRITE, in which case
381  *	the update will not be done.  If duplicate keys are permitted in the
382  *	tree, duplicates will be inserted and will not overwrite existing
383  *	keys.  Nodes are split as required.
384  *
385  *	Parameters:
386  *		tree -- btree in which to put the new entry
387  *		key -- key component to add
388  *		data -- data corresponding to key
389  *		flag -- R_NOOVERWRITE or zero.
390  *
391  *	Returns:
392  *		RET_SUCCESS, RET_ERROR, or RET_SPECIAL if the
393  *		NOOVERWRITE flag was set and the specified key exists
394  *		in the database.
395  *
396  *	Side Effects:
397  *		None.
398  */
399 
400 int
401 bt_put(tree, key, data, flag)
402 	BTREE tree;
403 	DBT *key;
404 	DBT *data;
405 	u_long flag;
406 {
407 	BTREE_P t = (BTREE_P) tree;
408 	BTITEM *item;
409 
410 	/* look for this key in the tree */
411 	item = _bt_search(t, key);
412 
413 	/*
414 	 *  If this tree was originally created without R_DUP, then duplicate
415 	 *  keys are not allowed.  We need to check this at insertion time.
416 	 */
417 
418 	if (VALIDITEM(t, item) && _bt_cmp(t, key->data, item->bti_index) == 0) {
419 		if ((t->bt_flags & BTF_NODUPS) && flag == R_NOOVERWRITE) {
420 			if (_bt_delone(t, item->bti_index) == RET_ERROR) {
421 				while (_bt_pop(t) != P_NONE)
422 					continue;
423 				return (RET_ERROR);
424 			}
425 		}
426 	}
427 
428 	return (_bt_insert(t, item, key, data, flag));
429 }
430 
431 /*
432  *  BT_DELETE -- delete a key from the tree.
433  *
434  *	Deletes all keys (and their associated data items) matching the
435  *	supplied key from the tree.  If the flags entry is R_CURSOR, then
436  *	the current item in the active scan is deleted.
437  *
438  *	Parameters:
439  *		tree -- btree from which to delete key
440  *		key -- key to delete
441  *		flags -- R_CURSOR or zero
442  *
443  *	Returns:
444  *		RET_SUCCESS, RET_ERROR, or RET_SPECIAL if the specified
445  *		key was not in the tree.
446  *
447  *	Side Effects:
448  *		None.
449  */
450 
451 int
452 bt_delete(tree, key, flags)
453 	BTREE tree;
454 	DBT *key;
455 	u_long flags;
456 {
457 	BTREE_P t = (BTREE_P) tree;
458 	BTHEADER *h;
459 	BTITEM *item;
460 	int ndel = 0;
461 
462 	if (flags == R_CURSOR)
463 		return (_bt_crsrdel(t));
464 
465 	/* find the first matching key in the tree */
466 	item = _bt_first(t, key);
467 	h = t->bt_curpage;
468 
469 	/* don't need the descent stack for deletes */
470 	while (_bt_pop(t) != P_NONE)
471 		continue;
472 
473 	/* delete all matching keys */
474 	for (;;) {
475 		while (VALIDITEM(t, item)
476 		       && (_bt_cmp(t, key->data, item->bti_index) == 0)) {
477 			if (_bt_delone(t, item->bti_index) == RET_ERROR)
478 				return (RET_ERROR);
479 			ndel++;
480 		}
481 
482 		if (VALIDITEM(t, item) || h->h_nextpg == P_NONE)
483 			break;
484 
485 		/* next page, if necessary */
486 		do {
487 			if (_bt_getpage(t, h->h_nextpg) == RET_ERROR)
488 				return (RET_ERROR);
489 			h = t->bt_curpage;
490 		} while (NEXTINDEX(h) == 0 && h->h_nextpg != P_NONE);
491 
492 		item->bti_pgno = h->h_pgno;
493 		item->bti_index = 0;
494 
495 		if (!VALIDITEM(t, item)
496 		    || _bt_cmp(t, key->data, item->bti_index) != 0)
497 			break;
498 	}
499 
500 	/* flush changes to disk */
501 	if (ISDISK(t)) {
502 		if (h->h_flags & F_DIRTY) {
503 			if (_bt_write(t, t->bt_curpage, NORELEASE) == RET_ERROR)
504 				return (RET_ERROR);
505 		}
506 	}
507 
508 	if (ndel == 0)
509 		return (RET_SPECIAL);
510 
511 	return (RET_SUCCESS);
512 }
513 
514 /*
515  *  BT_SYNC -- sync the btree to disk.
516  *
517  *	Parameters:
518  *		tree -- btree to sync.
519  *
520  *	Returns:
521  *		RET_SUCCESS, RET_ERROR.
522  */
523 
524 bt_sync(tree)
525 	BTREE tree;
526 {
527 	BTREE_P t = (BTREE_P) tree;
528 	BTHEADER *h;
529 	pgno_t pgno;
530 
531 	/* if this is an in-memory btree, syncing is a no-op */
532 	if (!ISDISK(t))
533 		return (RET_SUCCESS);
534 
535 	h = (BTHEADER *) t->bt_curpage;
536 	h->h_flags &= ~F_DIRTY;
537 
538 	if (ISCACHE(t)) {
539 		pgno = t->bt_curpage->h_pgno;
540 		if (_bt_write(t, h, RELEASE) == RET_ERROR)
541 			return(RET_ERROR);
542 		if (lrusync(t->bt_s.bt_d.d_cache) < RET_ERROR)
543 			return (RET_ERROR);
544 		if (_bt_getpage(t, pgno) == RET_ERROR)
545 			return (RET_ERROR);
546 	} else {
547 		if (_bt_write(t, h, NORELEASE) == RET_ERROR)
548 			return (RET_ERROR);
549 	}
550 
551 	return (fsync(t->bt_s.bt_d.d_fd));
552 }
553 
554 /*
555  *  BT_SEQ -- Sequential scan interface.
556  *
557  *	This routine supports sequential scans on the btree.  If called with
558  *	flags set to R_CURSOR, or if no seq scan has been initialized in the
559  *	current tree, we initialize the scan.  Otherwise, we advance the scan
560  *	and return the next item.
561  *
562  *	Scans can be either keyed or non-keyed.  Keyed scans basically have
563  *	a starting point somewhere in the middle of the tree.  Non-keyed
564  *	scans start at an endpoint.  Also, scans can be backward (descending
565  *	order), forward (ascending order), or no movement (keep returning
566  *	the same item).
567  *
568  *	Flags is checked every time we enter the routine, so the user can
569  *	change directions on an active scan if desired.  The key argument
570  *	is examined only when we initialize the scan, in order to position
571  *	it properly.
572  *
573  *	Items are returned via the key and data arguments passed in.
574  *
575  *	Parameters:
576  *		tree -- btree in which to do scan
577  *		key -- key, used to position scan on initialization, and
578  *		       used to return key components to the user.
579  *		data -- used to return data components to the user.
580  *		flags -- specify R_CURSOR, R_FIRST, R_LAST, R_NEXT, or
581  *			 R_PREV.
582  *
583  *	Returns:
584  *		RET_SUCCESS, RET_ERROR, or RET_SPECIAL if no more data
585  *		exists in the tree in the specified direction.
586  *
587  *	Side Effects:
588  *		Changes the btree's notion of the current position in the
589  *		scan.
590  *
591  *	Warnings:
592  *		The key and data items returned are static and will be
593  *		overwritten by the next bt_get or bt_seq.
594  */
595 
596 int
597 bt_seq(tree, key, data, flags)
598 	BTREE tree;
599 	DBT *key;
600 	DBT *data;
601 	u_long flags;
602 {
603 	BTREE_P t = (BTREE_P) tree;
604 	BTHEADER *h;
605 	DATUM *d;
606 	int status;
607 
608 	/* do we need to initialize the scan? */
609 	if (flags == R_CURSOR || flags == R_LAST || flags == R_FIRST
610 	    || !(t->bt_flags & BTF_SEQINIT)) {
611 
612 		/* initialize it */
613 		status = _bt_seqinit(t, key, flags);
614 	} else {
615 		/* just advance the current scan pointer */
616 		status = _bt_seqadvance(t, flags);
617 	}
618 
619 	key->size = data->size = 0;
620 	key->data = data->data = (u_char *) NULL;
621 
622 	h = t->bt_curpage;
623 
624 	/* is there a valid item at the current scan location? */
625 	if (status == RET_SPECIAL) {
626 		if (flags == R_NEXT) {
627 			if (t->bt_cursor.c_index >= NEXTINDEX(h)) {
628 				if (NEXTINDEX(h) > 0)
629 					t->bt_cursor.c_index = NEXTINDEX(h) - 1;
630 				else
631 					t->bt_cursor.c_index = 0;
632 			}
633 		} else {
634 			t->bt_cursor.c_index = 0;
635 		}
636 		return (RET_SPECIAL);
637 	} else if (status == RET_ERROR)
638 		return (RET_ERROR);
639 
640 	/* okay, return the data */
641 	d = (DATUM *) GETDATUM(h, t->bt_cursor.c_index);
642 
643 	return (_bt_buildret(t, d, data, key));
644 }
645 
646 /*
647  *  BT_CLOSE -- Close a btree
648  *
649  *	Parameters:
650  *		tree -- tree to close
651  *
652  *	Returns:
653  *		RET_SUCCESS, RET_ERROR.
654  *
655  *	Side Effects:
656  *		Frees space occupied by the tree.
657  */
658 
659 int
660 bt_close(tree)
661 	BTREE tree;
662 {
663 	BTREE_P t = (BTREE_P) tree;
664 	int i;
665 	BTHEADER *h;
666 	char *cache;
667 	struct HTBUCKET *b, *sb;
668 	HTABLE ht;
669 	int fd;
670 
671 	if (t->bt_cursor.c_key != (char *) NULL)
672 		(void) free(t->bt_cursor.c_key);
673 
674 	if (!ISDISK(t)) {
675 		/* in-memory tree, release hash table memory */
676 		ht = t->bt_s.bt_ht;
677 		for (i = 0; i < HTSIZE; i++) {
678 			if ((b = ht[i]) == (struct HTBUCKET *) NULL)
679 				break;
680 			do {
681 				sb = b;
682 				(void) free((char *) b->ht_page);
683 				b = b->ht_next;
684 				(void) free((char *) sb);
685 			} while (b != (struct HTBUCKET *) NULL);
686 		}
687 		(void) free ((char *) ht);
688 		(void) free ((char *) t);
689 		return (RET_SUCCESS);
690 	}
691 
692 	if ((t->bt_flags & BTF_ISWRITE) && !(t->bt_flags & BTF_METAOK)) {
693 		if (_bt_wrtmeta(t) == RET_ERROR)
694 			return (RET_ERROR);
695 	}
696 
697 	if (t->bt_curpage != (BTHEADER *) NULL) {
698 		h = t->bt_curpage;
699 		if (h->h_flags & F_DIRTY) {
700 			if (_bt_write(t, h, RELEASE) == RET_ERROR)
701 				return (RET_ERROR);
702 		} else {
703 			if (_bt_release(t, h) == RET_ERROR)
704 				return (RET_ERROR);
705 		}
706 
707 		/* flush and free the cache, if there is one */
708 		if (ISCACHE(t)) {
709 			cache = t->bt_s.bt_d.d_cache;
710 			if (lrusync(cache) == RET_ERROR)
711 				return (RET_ERROR);
712 			lrufree(cache);
713 		}
714 		(void) free ((char *) h);
715 	}
716 
717 	fd = t->bt_s.bt_d.d_fd;
718 	(void) free ((char *) t);
719 	return (close(fd));
720 }
721