xref: /original-bsd/lib/libc/db/btree/bt_open.c (revision 95a66346)
1 /*-
2  * Copyright (c) 1990 The Regents of the University of California.
3  * All rights reserved.
4  *
5  * This code is derived from software contributed to Berkeley by
6  * Mike Olson.
7  *
8  * %sccs.include.redist.c%
9  */
10 
11 #if defined(LIBC_SCCS) && !defined(lint)
12 static char sccsid[] = "@(#)bt_open.c	5.7 (Berkeley) 03/03/91";
13 #endif /* LIBC_SCCS and not lint */
14 
15 /*
16  *  btree.c -- implementation of btree access method for 4.4BSD.
17  *
18  *	The design here is based on that of the btree access method used
19  *	in the Postgres database system at UC Berkeley.  The implementation
20  *	is wholly independent of the Postgres code.
21  *
22  *	This implementation supports btrees on disk (supply a filename) or
23  *	in memory (don't).  Public interfaces defined here are:
24  *
25  *		btree_open()	-- wrapper; returns a filled DB struct for
26  *				   a btree.
27  *
28  *		bt_open()	-- open a new or existing btree.
29  *		bt_get()	-- fetch data from a tree by key.
30  *		bt_seq()	-- do a sequential scan on a tree.
31  *		bt_put()	-- add data to a tree by key.
32  *		bt_delete()	-- remove data from a tree by key.
33  *		bt_close()	-- close a btree.
34  *		bt_sync()	-- force btree pages to disk (disk trees only).
35  */
36 
37 #include <sys/param.h>
38 #include <sys/stat.h>
39 #include <signal.h>
40 #include <errno.h>
41 #include <fcntl.h>
42 #include <db.h>
43 #include <stdlib.h>
44 #include <string.h>
45 #include <unistd.h>
46 #include "btree.h"
47 
48 BTREEINFO _DefaultBTInfo = {
49 	0,	/* flags */
50 	0,	/* cachesize */
51 	0,	/* psize */
52 	strcmp,	/* compare */
53 	0
54 };
55 
56 /*
57  *  BTREE_OPEN -- Wrapper routine to open a btree.
58  *
59  *	Creates and fills a DB struct, and calls the routine that actually
60  *	opens the btree.
61  *
62  *	Parameters:
63  *		f:  filename to open
64  *		flags:  flag bits passed to open
65  *		mode:  permission bits, used if O_CREAT specified
66  *		b:  BTREEINFO pointer
67  *
68  *	Returns:
69  *		Filled-in DBT on success; NULL on failure, with errno
70  *		set as appropriate.
71  *
72  *	Side Effects:
73  *		Allocates memory for the DB struct.
74  */
75 
76 DB *
77 btree_open(f, flags, mode, b)
78 	const char *f;
79 	int flags;
80 	int mode;
81 	const BTREEINFO *b;
82 {
83 	DB *db;
84 	BTREE t;
85 
86 	if ((db = (DB *) malloc((unsigned) sizeof(DB))) == (DB *) NULL)
87 		return ((DB *) NULL);
88 
89 	if ((t = bt_open(f, flags, mode, b)) == (BTREE) NULL) {
90 		(void) free ((char *) db);
91 		return ((DB *) NULL);
92 	}
93 
94 	db->internal = (char *) t;
95 	db->close = bt_close;
96 	db->del = bt_delete;
97 	db->get = bt_get;
98 	db->put = bt_put;
99 	db->seq = bt_seq;
100 	db->sync = bt_sync;
101 
102 	return (db);
103 }
104 
105 /*
106  *  BT_OPEN -- Open a btree.
107  *
108  *	This routine creates the correct kind (disk or in-memory) of
109  *	btree and initializes it as required.
110  *
111  *	Parameters:
112  *		f -- name of btree (NULL for in-memory btrees)
113  *		flags -- flags passed to open()
114  *		mode -- mode passed to open ()
115  *		b -- BTREEINFO structure, describing btree
116  *
117  *	Returns:
118  *		(Opaque) pointer to the btree.  On failure, returns NULL
119  *		with errno set as appropriate.
120  *
121  *	Side Effects:
122  *		Allocates memory, opens files.
123  */
124 
125 BTREE
126 bt_open(f, flags, mode, b)
127 	char *f;
128 	int flags;
129 	int mode;
130 	BTREEINFO *b;
131 {
132 	BTREE_P t;
133 	HTABLE ht;
134 	int nbytes;
135 	int fd;
136 	CURSOR *c;
137 	BTMETA m;
138 	struct stat buf;
139 
140 	/* use the default info if none was provided */
141 	if (b == (BTREEINFO *) NULL)
142 		b = &_DefaultBTInfo;
143 
144 	if ((t = (BTREE_P) malloc((unsigned) sizeof *t)) == (BTREE_P) NULL)
145 		return ((BTREE) NULL);
146 
147 	if (b->compare)
148 		t->bt_compare = b->compare;
149 	else
150 		t->bt_compare = strcmp;
151 
152 	t->bt_fname = f;
153 	t->bt_curpage = (BTHEADER *) NULL;
154 	t->bt_free = P_NONE;
155 	c = &(t->bt_cursor);
156 	c->c_pgno = P_NONE;
157 	c->c_index = 0;
158 	c->c_flags = (u_char) NULL;
159 	c->c_key = (char *) NULL;
160 	t->bt_stack = (BTSTACK *) NULL;
161 	t->bt_flags = 0;
162 
163 	/*
164 	 *  If no file name was supplied, this is an in-memory btree.
165 	 *  Otherwise, it's a disk-based btree.
166 	 */
167 	if (f == (char *) NULL) {
168 		/* in memory */
169 		if ((t->bt_psize = b->psize) < MINPSIZE) {
170 			if (t->bt_psize != 0) {
171 				(void) free ((char *) t);
172 				errno = EINVAL;
173 				return ((BTREE) NULL);
174 			}
175 			t->bt_psize = getpagesize();
176 		}
177 
178 		nbytes = HTSIZE * sizeof(HTBUCKET *);
179 		if ((ht = (HTABLE) malloc((unsigned) nbytes))
180 		    == (HTABLE) NULL) {
181 			(void) free((char *) t);
182 			return ((BTREE) NULL);
183 		}
184 		(void) bzero((char *) ht, nbytes);
185 		t->bt_s.bt_ht = ht;
186 		t->bt_npages = 0;
187 		t->bt_lorder = BYTE_ORDER;
188 		if (!(b->flags & R_DUP))
189 			t->bt_flags |= BTF_NODUPS;
190 	} else {
191 		/* on disk */
192 		if ((fd = open(f, O_RDONLY, 0)) < 0) {
193 			/* doesn't exist yet, be sure page is big enough */
194 			if ((t->bt_psize = b->psize) < sizeof(BTHEADER)
195 			    && b->psize != 0) {
196 				(void) free((char *) t);
197 				errno = EINVAL;
198 				return ((BTREE) NULL);
199 			}
200 			if (b->lorder == 0)
201 				b->lorder = BYTE_ORDER;
202 
203 			if (b->lorder != BIG_ENDIAN
204 			    && b->lorder != LITTLE_ENDIAN) {
205 				(void) free((char *) t);
206 				errno = EINVAL;
207 				return ((BTREE) NULL);
208 			}
209 			t->bt_lorder = b->lorder;
210 			if (!(b->flags & R_DUP))
211 				t->bt_flags |= BTF_NODUPS;
212 		} else {
213 			/* exists, get page size from file */
214 			if (read(fd, &m, sizeof(m)) < sizeof(m)) {
215 				(void) close(fd);
216 				(void) free((char *) t);
217 				errno = EINVAL;
218 				return ((BTREE) NULL);
219 			}
220 
221 			/* lorder always stored in host-independent format */
222 			NTOHL(m.m_lorder);
223 			if (m.m_lorder != BIG_ENDIAN
224 			    && m.m_lorder != LITTLE_ENDIAN) {
225 				(void) free((char *) t);
226 				errno = EINVAL;
227 				return ((BTREE) NULL);
228 			}
229 			t->bt_lorder = m.m_lorder;
230 
231 			if (t->bt_lorder != BYTE_ORDER) {
232 				BLSWAP(m.m_magic);
233 				BLSWAP(m.m_version);
234 				BLSWAP(m.m_psize);
235 				BLSWAP(m.m_free);
236 				BLSWAP(m.m_flags);
237 			}
238 
239 			if (m.m_magic != BTREEMAGIC
240 			    || m.m_version != BTREEVERSION
241 			    || m.m_psize < MINPSIZE) {
242 				(void) close(fd);
243 				(void) free((char *) t);
244 #ifndef EFTYPE
245 #define EFTYPE	-100
246 #endif
247 				errno = EFTYPE;
248 				return ((BTREE) NULL);
249 			}
250 			t->bt_psize = m.m_psize;
251 			t->bt_free = m.m_free;
252 			t->bt_flags |= (m.m_flags & BTF_NODUPS) | BTF_METAOK;
253 			(void) close(fd);
254 		}
255 
256 		/* now open the file the way the user wanted it */
257 		if ((t->bt_s.bt_d.d_fd = open(f, flags, mode)) < 0) {
258 			(void) free ((char *) t);
259 			return ((BTREE) NULL);
260 		}
261 
262 		/* access method files are always close-on-exec */
263 		if (fcntl(t->bt_s.bt_d.d_fd, F_SETFL, 1) == -1) {
264 			(void) close(t->bt_s.bt_d.d_fd);
265 			(void) free ((char *) t);
266 			return ((BTREE) NULL);
267 		}
268 
269 		/* get number of pages, page size if necessary */
270 		(void) fstat(t->bt_s.bt_d.d_fd, &buf);
271 		if (t->bt_psize == 0)
272 			t->bt_psize = buf.st_blksize;
273 		t->bt_npages = (pgno_t) (buf.st_size / t->bt_psize);
274 
275 		/* page zero is metadata, doesn't count */
276 		if (t->bt_npages > 0)
277 			--(t->bt_npages);
278 
279 		if (b->cachesize == 0)
280 			b->cachesize = DEFCACHE;
281 
282 		/* get an lru buffer cache, if the user asked for one */
283 		if ((b->cachesize / t->bt_psize) > 0) {
284 			BTDISK *d = &(t->bt_s.bt_d);
285 
286 			d->d_cache = lruinit(d->d_fd,
287 					     (int) (b->cachesize / t->bt_psize),
288 					     (int) t->bt_psize,
289 					     (char *) t->bt_lorder,
290 					     _bt_pgin, _bt_pgout);
291 
292 			if (d->d_cache == (char *) NULL) {
293 				(void) free((char *) t);
294 				return ((BTREE) NULL);
295 			}
296 		}
297 	}
298 
299 	/* remember if tree was opened for write */
300 	if (((flags & O_WRONLY) == O_WRONLY)
301 	    || ((flags & O_RDWR) == O_RDWR))
302 		t->bt_flags |= BTF_ISWRITE;
303 
304 	return ((BTREE) t);
305 }
306 
307 /*
308  *  BT_GET -- Get an entry from a btree.
309  *
310  *	Does a key lookup in the tree to find the specified key, and returns
311  *	the key/data pair found.
312  *
313  *	Parameters:
314  *		tree -- btree in which to do lookup
315  *		key -- key to find
316  *		data -- pointer to DBT in which to return data
317  *		flag -- ignored
318  *
319  *	Returns:
320  *		RET_SUCCESS, RET_ERROR, or RET_SPECIAL if the key is not
321  *		found.  If key is not found, nothing is stored in the
322  *		return DBT 'data'.
323  *
324  *	Side Effects:
325  *		None.
326  *
327  *	Warnings:
328  *		Return data is statically allocated, and will be overwritten
329  *		at the next call.
330  */
331 
332 int
333 bt_get(tree, key, data, flag)
334 	BTREE tree;
335 	DBT *key;
336 	DBT *data;
337 	u_long flag;
338 {
339 	BTREE_P t = (BTREE_P) tree;
340 	BTHEADER *h;
341 	DATUM *d;
342 	BTITEM *item;
343 
344 #ifdef lint
345 	flag = flag;
346 #endif /* lint */
347 
348 	/* lookup */
349 	item = _bt_search(t, key);
350 
351 	/* clear parent pointer stack */
352 	while (_bt_pop(t) != P_NONE)
353 		continue;
354 
355 	if (item == (BTITEM *) NULL)
356 		return (RET_ERROR);
357 
358 	h = (BTHEADER *) t->bt_curpage;
359 	data->size = 0;
360 	data->data = (u_char *) NULL;
361 
362 	/* match? */
363 	if (VALIDITEM(t, item)
364 	    && _bt_cmp(t, key->data, item->bti_index) == 0) {
365 		d = (DATUM *) GETDATUM(h, item->bti_index);
366 		return (_bt_buildret(t, d, data, key));
367 	}
368 
369 	/* nope */
370 	return (RET_SPECIAL);
371 }
372 
373 /*
374  *  BT_PUT -- Add an entry to a btree.
375  *
376  *	The specified (key, data) pair is added to the tree.  If the tree
377  *	was created for unique keys only, then duplicates will not be
378  *	entered.  If the requested key exists in the tree, it will be over-
379  *	written unless the flags parameter is R_NOOVERWRITE, in which case
380  *	the update will not be done.  If duplicate keys are permitted in the
381  *	tree, duplicates will be inserted and will not overwrite existing
382  *	keys.  Nodes are split as required.
383  *
384  *	Parameters:
385  *		tree -- btree in which to put the new entry
386  *		key -- key component to add
387  *		data -- data corresponding to key
388  *		flag -- R_NOOVERWRITE or zero.
389  *
390  *	Returns:
391  *		RET_SUCCESS, RET_ERROR, or RET_SPECIAL if the
392  *		NOOVERWRITE flag was set and the specified key exists
393  *		in the database.
394  *
395  *	Side Effects:
396  *		None.
397  */
398 
399 int
400 bt_put(tree, key, data, flag)
401 	BTREE tree;
402 	DBT *key;
403 	DBT *data;
404 	u_long flag;
405 {
406 	BTREE_P t = (BTREE_P) tree;
407 	BTITEM *item;
408 
409 	/* look for this key in the tree */
410 	item = _bt_search(t, key);
411 
412 	/*
413 	 *  If this tree was originally created without R_DUP, then duplicate
414 	 *  keys are not allowed.  We need to check this at insertion time.
415 	 */
416 
417 	if (VALIDITEM(t, item) && _bt_cmp(t, key->data, item->bti_index) == 0) {
418 		if ((t->bt_flags & BTF_NODUPS) && flag == R_NOOVERWRITE) {
419 			if (_bt_delone(t, item->bti_index) == RET_ERROR) {
420 				while (_bt_pop(t) != P_NONE)
421 					continue;
422 				return (RET_ERROR);
423 			}
424 		}
425 	}
426 
427 	return (_bt_insert(t, item, key, data, flag));
428 }
429 
430 /*
431  *  BT_DELETE -- delete a key from the tree.
432  *
433  *	Deletes all keys (and their associated data items) matching the
434  *	supplied key from the tree.  If the flags entry is R_CURSOR, then
435  *	the current item in the active scan is deleted.
436  *
437  *	Parameters:
438  *		tree -- btree from which to delete key
439  *		key -- key to delete
440  *		flags -- R_CURSOR or zero
441  *
442  *	Returns:
443  *		RET_SUCCESS, RET_ERROR, or RET_SPECIAL if the specified
444  *		key was not in the tree.
445  *
446  *	Side Effects:
447  *		None.
448  */
449 
450 int
451 bt_delete(tree, key, flags)
452 	BTREE tree;
453 	DBT *key;
454 	u_long flags;
455 {
456 	BTREE_P t = (BTREE_P) tree;
457 	BTHEADER *h;
458 	BTITEM *item;
459 	int ndel = 0;
460 
461 	if (flags == R_CURSOR)
462 		return (_bt_crsrdel(t));
463 
464 	/* find the first matching key in the tree */
465 	item = _bt_first(t, key);
466 	h = t->bt_curpage;
467 
468 	/* don't need the descent stack for deletes */
469 	while (_bt_pop(t) != P_NONE)
470 		continue;
471 
472 	/* delete all matching keys */
473 	for (;;) {
474 		while (VALIDITEM(t, item)
475 		       && (_bt_cmp(t, key->data, item->bti_index) == 0)) {
476 			if (_bt_delone(t, item->bti_index) == RET_ERROR)
477 				return (RET_ERROR);
478 			ndel++;
479 		}
480 
481 		if (VALIDITEM(t, item) || h->h_nextpg == P_NONE)
482 			break;
483 
484 		/* next page, if necessary */
485 		do {
486 			if (_bt_getpage(t, h->h_nextpg) == RET_ERROR)
487 				return (RET_ERROR);
488 			h = t->bt_curpage;
489 		} while (NEXTINDEX(h) == 0 && h->h_nextpg != P_NONE);
490 
491 		item->bti_pgno = h->h_pgno;
492 		item->bti_index = 0;
493 
494 		if (!VALIDITEM(t, item)
495 		    || _bt_cmp(t, key->data, item->bti_index) != 0)
496 			break;
497 	}
498 
499 	/* flush changes to disk */
500 	if (ISDISK(t)) {
501 		if (h->h_flags & F_DIRTY) {
502 			if (_bt_write(t, t->bt_curpage, NORELEASE) == RET_ERROR)
503 				return (RET_ERROR);
504 		}
505 	}
506 
507 	if (ndel == 0)
508 		return (RET_SPECIAL);
509 
510 	return (RET_SUCCESS);
511 }
512 
513 /*
514  *  BT_SYNC -- sync the btree to disk.
515  *
516  *	Parameters:
517  *		tree -- btree to sync.
518  *
519  *	Returns:
520  *		RET_SUCCESS, RET_ERROR.
521  */
522 
523 bt_sync(tree)
524 	BTREE tree;
525 {
526 	BTREE_P t = (BTREE_P) tree;
527 	BTHEADER *h;
528 	pgno_t pgno;
529 
530 	/* if this is an in-memory btree, syncing is a no-op */
531 	if (!ISDISK(t))
532 		return (RET_SUCCESS);
533 
534 	h = (BTHEADER *) t->bt_curpage;
535 	h->h_flags &= ~F_DIRTY;
536 
537 	if (ISCACHE(t)) {
538 		pgno = t->bt_curpage->h_pgno;
539 		if (_bt_write(t, h, RELEASE) == RET_ERROR)
540 			return(RET_ERROR);
541 		if (lrusync(t->bt_s.bt_d.d_cache) < RET_ERROR)
542 			return (RET_ERROR);
543 		if (_bt_getpage(t, pgno) == RET_ERROR)
544 			return (RET_ERROR);
545 	} else {
546 		if (_bt_write(t, h, NORELEASE) == RET_ERROR)
547 			return (RET_ERROR);
548 	}
549 
550 	return (fsync(t->bt_s.bt_d.d_fd));
551 }
552 
553 /*
554  *  BT_SEQ -- Sequential scan interface.
555  *
556  *	This routine supports sequential scans on the btree.  If called with
557  *	flags set to R_CURSOR, or if no seq scan has been initialized in the
558  *	current tree, we initialize the scan.  Otherwise, we advance the scan
559  *	and return the next item.
560  *
561  *	Scans can be either keyed or non-keyed.  Keyed scans basically have
562  *	a starting point somewhere in the middle of the tree.  Non-keyed
563  *	scans start at an endpoint.  Also, scans can be backward (descending
564  *	order), forward (ascending order), or no movement (keep returning
565  *	the same item).
566  *
567  *	Flags is checked every time we enter the routine, so the user can
568  *	change directions on an active scan if desired.  The key argument
569  *	is examined only when we initialize the scan, in order to position
570  *	it properly.
571  *
572  *	Items are returned via the key and data arguments passed in.
573  *
574  *	Parameters:
575  *		tree -- btree in which to do scan
576  *		key -- key, used to position scan on initialization, and
577  *		       used to return key components to the user.
578  *		data -- used to return data components to the user.
579  *		flags -- specify R_CURSOR, R_FIRST, R_LAST, R_NEXT, or
580  *			 R_PREV.
581  *
582  *	Returns:
583  *		RET_SUCCESS, RET_ERROR, or RET_SPECIAL if no more data
584  *		exists in the tree in the specified direction.
585  *
586  *	Side Effects:
587  *		Changes the btree's notion of the current position in the
588  *		scan.
589  *
590  *	Warnings:
591  *		The key and data items returned are static and will be
592  *		overwritten by the next bt_get or bt_seq.
593  */
594 
595 int
596 bt_seq(tree, key, data, flags)
597 	BTREE tree;
598 	DBT *key;
599 	DBT *data;
600 	u_long flags;
601 {
602 	BTREE_P t = (BTREE_P) tree;
603 	BTHEADER *h;
604 	DATUM *d;
605 	int status;
606 
607 	/* do we need to initialize the scan? */
608 	if (flags == R_CURSOR || flags == R_LAST || flags == R_FIRST
609 	    || !(t->bt_flags & BTF_SEQINIT)) {
610 
611 		/* initialize it */
612 		status = _bt_seqinit(t, key, flags);
613 	} else {
614 		/* just advance the current scan pointer */
615 		status = _bt_seqadvance(t, flags);
616 	}
617 
618 	key->size = data->size = 0;
619 	key->data = data->data = (u_char *) NULL;
620 
621 	h = t->bt_curpage;
622 
623 	/* is there a valid item at the current scan location? */
624 	if (status == RET_SPECIAL) {
625 		if (flags == R_NEXT) {
626 			if (t->bt_cursor.c_index >= NEXTINDEX(h)) {
627 				if (NEXTINDEX(h) > 0)
628 					t->bt_cursor.c_index = NEXTINDEX(h) - 1;
629 				else
630 					t->bt_cursor.c_index = 0;
631 			}
632 		} else {
633 			t->bt_cursor.c_index = 0;
634 		}
635 		return (RET_SPECIAL);
636 	} else if (status == RET_ERROR)
637 		return (RET_ERROR);
638 
639 	/* okay, return the data */
640 	d = (DATUM *) GETDATUM(h, t->bt_cursor.c_index);
641 
642 	return (_bt_buildret(t, d, data, key));
643 }
644 
645 /*
646  *  BT_CLOSE -- Close a btree
647  *
648  *	Parameters:
649  *		tree -- tree to close
650  *
651  *	Returns:
652  *		RET_SUCCESS, RET_ERROR.
653  *
654  *	Side Effects:
655  *		Frees space occupied by the tree.
656  */
657 
658 int
659 bt_close(tree)
660 	BTREE tree;
661 {
662 	BTREE_P t = (BTREE_P) tree;
663 	int i;
664 	BTHEADER *h;
665 	char *cache;
666 	struct HTBUCKET *b, *sb;
667 	HTABLE ht;
668 	int fd;
669 
670 	if (t->bt_cursor.c_key != (char *) NULL)
671 		(void) free(t->bt_cursor.c_key);
672 
673 	if (!ISDISK(t)) {
674 		/* in-memory tree, release hash table memory */
675 		ht = t->bt_s.bt_ht;
676 		for (i = 0; i < HTSIZE; i++) {
677 			if ((b = ht[i]) == (struct HTBUCKET *) NULL)
678 				break;
679 			do {
680 				sb = b;
681 				(void) free((char *) b->ht_page);
682 				b = b->ht_next;
683 				(void) free((char *) sb);
684 			} while (b != (struct HTBUCKET *) NULL);
685 		}
686 		(void) free ((char *) ht);
687 		(void) free ((char *) t);
688 		return (RET_SUCCESS);
689 	}
690 
691 	if ((t->bt_flags & BTF_ISWRITE) && !(t->bt_flags & BTF_METAOK)) {
692 		if (_bt_wrtmeta(t) == RET_ERROR)
693 			return (RET_ERROR);
694 	}
695 
696 	if (t->bt_curpage != (BTHEADER *) NULL) {
697 		h = t->bt_curpage;
698 		if (h->h_flags & F_DIRTY) {
699 			if (_bt_write(t, h, RELEASE) == RET_ERROR)
700 				return (RET_ERROR);
701 		} else {
702 			if (_bt_release(t, h) == RET_ERROR)
703 				return (RET_ERROR);
704 		}
705 
706 		/* flush and free the cache, if there is one */
707 		if (ISCACHE(t)) {
708 			cache = t->bt_s.bt_d.d_cache;
709 			if (lrusync(cache) == RET_ERROR)
710 				return (RET_ERROR);
711 			lrufree(cache);
712 		}
713 		(void) free ((char *) h);
714 	}
715 
716 	fd = t->bt_s.bt_d.d_fd;
717 	(void) free ((char *) t);
718 	return (close(fd));
719 }
720