xref: /original-bsd/lib/libc/db/btree/bt_open.c (revision 703f6d5d)
1 /*-
2  * Copyright (c) 1990 The Regents of the University of California.
3  * All rights reserved.
4  *
5  * This code is derived from software contributed to Berkeley by
6  * Mike Olson.
7  *
8  * %sccs.include.redist.c%
9  */
10 
11 #if defined(LIBC_SCCS) && !defined(lint)
12 static char sccsid[] = "@(#)bt_open.c	5.9 (Berkeley) 05/07/91";
13 #endif /* LIBC_SCCS and not lint */
14 
15 /*
16  *  btree.c -- implementation of btree access method for 4.4BSD.
17  *
18  *	The design here is based on that of the btree access method used
19  *	in the Postgres database system at UC Berkeley.  The implementation
20  *	is wholly independent of the Postgres code.
21  *
22  *	This implementation supports btrees on disk (supply a filename) or
23  *	in memory (don't).  Public interfaces defined here are:
24  *
25  *		btree_open()	-- wrapper; returns a filled DB struct for
26  *				   a btree.
27  *
28  *		bt_open()	-- open a new or existing btree.
29  *		bt_get()	-- fetch data from a tree by key.
30  *		bt_seq()	-- do a sequential scan on a tree.
31  *		bt_put()	-- add data to a tree by key.
32  *		bt_delete()	-- remove data from a tree by key.
33  *		bt_close()	-- close a btree.
34  *		bt_sync()	-- force btree pages to disk (disk trees only).
35  */
36 
37 #include <sys/param.h>
38 #include <sys/stat.h>
39 #include <signal.h>
40 #include <errno.h>
41 #include <fcntl.h>
42 #include <db.h>
43 #include <stdlib.h>
44 #include <string.h>
45 #include <unistd.h>
46 #include "btree.h"
47 
48 BTREEINFO _DefaultBTInfo = {
49 	0,	/* flags */
50 	0,	/* cachesize */
51 	0,	/* psize */
52 	strcmp,	/* compare */
53 	0
54 };
55 
56 /*
57  *  BTREE_OPEN -- Wrapper routine to open a btree.
58  *
59  *	Creates and fills a DB struct, and calls the routine that actually
60  *	opens the btree.
61  *
62  *	Parameters:
63  *		f:  filename to open
64  *		flags:  flag bits passed to open
65  *		mode:  permission bits, used if O_CREAT specified
66  *		b:  BTREEINFO pointer
67  *
68  *	Returns:
69  *		Filled-in DBT on success; NULL on failure, with errno
70  *		set as appropriate.
71  *
72  *	Side Effects:
73  *		Allocates memory for the DB struct.
74  */
75 
76 DB *
77 btree_open(f, flags, mode, b)
78 	const char *f;
79 	int flags;
80 	int mode;
81 	const BTREEINFO *b;
82 {
83 	DB *db;
84 	BTREE t;
85 
86 	if ((db = (DB *) malloc((unsigned) sizeof(DB))) == (DB *) NULL)
87 		return ((DB *) NULL);
88 
89 	if ((t = bt_open(f, flags, mode, b)) == (BTREE) NULL) {
90 		(void) free ((char *) db);
91 		return ((DB *) NULL);
92 	}
93 
94 	db->internal = (char *) t;
95 	db->close = bt_close;
96 	db->del = bt_delete;
97 	db->get = bt_get;
98 	db->put = bt_put;
99 	db->seq = bt_seq;
100 	db->sync = bt_sync;
101 	db->type = DB_BTREE;
102 
103 	return (db);
104 }
105 
106 /*
107  *  BT_OPEN -- Open a btree.
108  *
109  *	This routine creates the correct kind (disk or in-memory) of
110  *	btree and initializes it as required.
111  *
112  *	Parameters:
113  *		f -- name of btree (NULL for in-memory btrees)
114  *		flags -- flags passed to open()
115  *		mode -- mode passed to open ()
116  *		b -- BTREEINFO structure, describing btree
117  *
118  *	Returns:
119  *		(Opaque) pointer to the btree.  On failure, returns NULL
120  *		with errno set as appropriate.
121  *
122  *	Side Effects:
123  *		Allocates memory, opens files.
124  */
125 
126 BTREE
127 bt_open(f, flags, mode, b)
128 	char *f;
129 	int flags;
130 	int mode;
131 	BTREEINFO *b;
132 {
133 	BTREE_P t;
134 	HTABLE ht;
135 	int nbytes;
136 	int fd;
137 	CURSOR *c;
138 	BTMETA m;
139 	struct stat buf;
140 
141 	/* use the default info if none was provided */
142 	if (b == (BTREEINFO *) NULL)
143 		b = &_DefaultBTInfo;
144 
145 	if ((t = (BTREE_P) malloc((unsigned) sizeof *t)) == (BTREE_P) NULL)
146 		return ((BTREE) NULL);
147 
148 	if (b->compare)
149 		t->bt_compare = b->compare;
150 	else
151 		t->bt_compare = strcmp;
152 
153 	t->bt_fname = f;
154 	t->bt_curpage = (BTHEADER *) NULL;
155 	t->bt_free = P_NONE;
156 	c = &(t->bt_cursor);
157 	c->c_pgno = P_NONE;
158 	c->c_index = 0;
159 	c->c_flags = (u_char) NULL;
160 	c->c_key = (char *) NULL;
161 	t->bt_stack = (BTSTACK *) NULL;
162 	t->bt_flags = 0;
163 
164 	/*
165 	 *  If no file name was supplied, this is an in-memory btree.
166 	 *  Otherwise, it's a disk-based btree.
167 	 */
168 	if (f == (char *) NULL) {
169 		/* in memory */
170 		if ((t->bt_psize = b->psize) < MINPSIZE) {
171 			if (t->bt_psize != 0) {
172 				(void) free ((char *) t);
173 				errno = EINVAL;
174 				return ((BTREE) NULL);
175 			}
176 			t->bt_psize = getpagesize();
177 		}
178 
179 		nbytes = HTSIZE * sizeof(HTBUCKET *);
180 		if ((ht = (HTABLE) malloc((unsigned) nbytes))
181 		    == (HTABLE) NULL) {
182 			(void) free((char *) t);
183 			return ((BTREE) NULL);
184 		}
185 		(void) bzero((char *) ht, nbytes);
186 		t->bt_s.bt_ht = ht;
187 		t->bt_npages = 0;
188 		t->bt_lorder = BYTE_ORDER;
189 		if (!(b->flags & R_DUP))
190 			t->bt_flags |= BTF_NODUPS;
191 	} else {
192 		/* on disk */
193 		if ((fd = open(f, O_RDONLY, 0)) < 0) {
194 			/* doesn't exist yet, be sure page is big enough */
195 			if ((t->bt_psize = b->psize) < sizeof(BTHEADER)
196 			    && b->psize != 0) {
197 				(void) free((char *) t);
198 				errno = EINVAL;
199 				return ((BTREE) NULL);
200 			}
201 			if (b->lorder == 0)
202 				b->lorder = BYTE_ORDER;
203 
204 			if (b->lorder != BIG_ENDIAN
205 			    && b->lorder != LITTLE_ENDIAN) {
206 				(void) free((char *) t);
207 				errno = EINVAL;
208 				return ((BTREE) NULL);
209 			}
210 			t->bt_lorder = b->lorder;
211 			if (!(b->flags & R_DUP))
212 				t->bt_flags |= BTF_NODUPS;
213 		} else {
214 			/* exists, get page size from file */
215 			if (read(fd, &m, sizeof(m)) < sizeof(m)) {
216 				(void) close(fd);
217 				(void) free((char *) t);
218 				errno = EINVAL;
219 				return ((BTREE) NULL);
220 			}
221 
222 			/* lorder always stored in host-independent format */
223 			NTOHL(m.m_lorder);
224 			if (m.m_lorder != BIG_ENDIAN
225 			    && m.m_lorder != LITTLE_ENDIAN) {
226 				(void) free((char *) t);
227 				errno = EINVAL;
228 				return ((BTREE) NULL);
229 			}
230 			t->bt_lorder = m.m_lorder;
231 
232 			if (t->bt_lorder != BYTE_ORDER) {
233 				BLSWAP(m.m_magic);
234 				BLSWAP(m.m_version);
235 				BLSWAP(m.m_psize);
236 				BLSWAP(m.m_free);
237 				BLSWAP(m.m_flags);
238 			}
239 
240 			if (m.m_magic != BTREEMAGIC
241 			    || m.m_version != BTREEVERSION
242 			    || m.m_psize < MINPSIZE) {
243 				(void) close(fd);
244 				(void) free((char *) t);
245 #ifndef EFTYPE
246 #define EFTYPE	-100
247 #endif
248 				errno = EFTYPE;
249 				return ((BTREE) NULL);
250 			}
251 			t->bt_psize = m.m_psize;
252 			t->bt_free = m.m_free;
253 			t->bt_flags |= (m.m_flags & BTF_NODUPS) | BTF_METAOK;
254 			(void) close(fd);
255 		}
256 
257 		/* now open the file the way the user wanted it */
258 		if ((t->bt_s.bt_d.d_fd = open(f, flags, mode)) < 0) {
259 			(void) free ((char *) t);
260 			return ((BTREE) NULL);
261 		}
262 
263 		/* access method files are always close-on-exec */
264 		if (fcntl(t->bt_s.bt_d.d_fd, F_SETFL, 1) == -1) {
265 			(void) close(t->bt_s.bt_d.d_fd);
266 			(void) free ((char *) t);
267 			return ((BTREE) NULL);
268 		}
269 
270 		/* get number of pages, page size if necessary */
271 		(void) fstat(t->bt_s.bt_d.d_fd, &buf);
272 		if (t->bt_psize == 0)
273 			t->bt_psize = buf.st_blksize;
274 		t->bt_npages = (pgno_t) (buf.st_size / t->bt_psize);
275 
276 		/* page zero is metadata, doesn't count */
277 		if (t->bt_npages > 0)
278 			--(t->bt_npages);
279 
280 		if (b->cachesize == 0)
281 			b->cachesize = DEFCACHE;
282 
283 		/* get an lru buffer cache, if the user asked for one */
284 		if ((b->cachesize / t->bt_psize) > 0) {
285 			BTDISK *d = &(t->bt_s.bt_d);
286 
287 			d->d_cache = lruinit(d->d_fd,
288 					     (int) (b->cachesize / t->bt_psize),
289 					     (int) t->bt_psize,
290 					     (char *) t->bt_lorder,
291 					     _bt_pgin, _bt_pgout);
292 
293 			if (d->d_cache == (char *) NULL) {
294 				(void) free((char *) t);
295 				return ((BTREE) NULL);
296 			}
297 		}
298 	}
299 
300 	/* remember if tree was opened for write */
301 	if (((flags & O_WRONLY) == O_WRONLY)
302 	    || ((flags & O_RDWR) == O_RDWR))
303 		t->bt_flags |= BTF_ISWRITE;
304 
305 	return ((BTREE) t);
306 }
307 
308 /*
309  *  BT_GET -- Get an entry from a btree.
310  *
311  *	Does a key lookup in the tree to find the specified key, and returns
312  *	the key/data pair found.
313  *
314  *	Parameters:
315  *		tree -- btree in which to do lookup
316  *		key -- key to find
317  *		data -- pointer to DBT in which to return data
318  *		flag -- ignored
319  *
320  *	Returns:
321  *		RET_SUCCESS, RET_ERROR, or RET_SPECIAL if the key is not
322  *		found.  If key is not found, nothing is stored in the
323  *		return DBT 'data'.
324  *
325  *	Side Effects:
326  *		None.
327  *
328  *	Warnings:
329  *		Return data is statically allocated, and will be overwritten
330  *		at the next call.
331  */
332 
333 int
334 bt_get(dbp, key, data, flag)
335 	DB *dbp;
336 	DBT *key, *data;
337 	u_long flag;
338 {
339 	BTREE_P t = (BTREE_P) (dbp->internal);
340 	BTHEADER *h;
341 	DATUM *d;
342 	BTITEM *item;
343 
344 #ifdef lint
345 	flag = flag;
346 #endif /* lint */
347 
348 	/* lookup */
349 	item = _bt_search(t, key);
350 
351 	/* clear parent pointer stack */
352 	while (_bt_pop(t) != P_NONE)
353 		continue;
354 
355 	if (item == (BTITEM *) NULL)
356 		return (RET_ERROR);
357 
358 	h = (BTHEADER *) t->bt_curpage;
359 	data->size = 0;
360 	data->data = (u_char *) NULL;
361 
362 	/* match? */
363 	if (VALIDITEM(t, item)
364 	    && _bt_cmp(t, key->data, item->bti_index) == 0) {
365 		d = (DATUM *) GETDATUM(h, item->bti_index);
366 		return (_bt_buildret(t, d, data, key));
367 	}
368 
369 	/* nope */
370 	return (RET_SPECIAL);
371 }
372 
373 /*
374  *  BT_PUT -- Add an entry to a btree.
375  *
376  *	The specified (key, data) pair is added to the tree.  If the tree
377  *	was created for unique keys only, then duplicates will not be
378  *	entered.  If the requested key exists in the tree, it will be over-
379  *	written unless the flags parameter is R_NOOVERWRITE, in which case
380  *	the update will not be done.  If duplicate keys are permitted in the
381  *	tree, duplicates will be inserted and will not overwrite existing
382  *	keys.  Nodes are split as required.
383  *
384  *	Parameters:
385  *		tree -- btree in which to put the new entry
386  *		key -- key component to add
387  *		data -- data corresponding to key
388  *		flag -- R_NOOVERWRITE or zero.
389  *
390  *	Returns:
391  *		RET_SUCCESS, RET_ERROR, or RET_SPECIAL if the
392  *		NOOVERWRITE flag was set and the specified key exists
393  *		in the database.
394  *
395  *	Side Effects:
396  *		None.
397  */
398 
399 int
400 bt_put(dbp, key, data, flag)
401 	DB *dbp;
402 	DBT *key, *data;
403 	u_long flag;
404 {
405 	BTREE_P t;
406 	BTITEM *item;
407 
408 	t = (BTREE_P)dbp->internal;
409 
410 	/* look for this key in the tree */
411 	item = _bt_search(t, key);
412 
413 	/*
414 	 *  If this tree was originally created without R_DUP, then duplicate
415 	 *  keys are not allowed.  We need to check this at insertion time.
416 	 */
417 
418 	if (VALIDITEM(t, item) && _bt_cmp(t, key->data, item->bti_index) == 0) {
419 		if ((t->bt_flags & BTF_NODUPS) && flag == R_NOOVERWRITE) {
420 			if (_bt_delone(t, item->bti_index) == RET_ERROR) {
421 				while (_bt_pop(t) != P_NONE)
422 					continue;
423 				return (RET_ERROR);
424 			}
425 		}
426 	}
427 
428 	return (_bt_insert(t, item, key, data, flag));
429 }
430 
431 /*
432  *  BT_DELETE -- delete a key from the tree.
433  *
434  *	Deletes all keys (and their associated data items) matching the
435  *	supplied key from the tree.  If the flags entry is R_CURSOR, then
436  *	the current item in the active scan is deleted.
437  *
438  *	Parameters:
439  *		tree -- btree from which to delete key
440  *		key -- key to delete
441  *		flags -- R_CURSOR or zero
442  *
443  *	Returns:
444  *		RET_SUCCESS, RET_ERROR, or RET_SPECIAL if the specified
445  *		key was not in the tree.
446  *
447  *	Side Effects:
448  *		None.
449  */
450 
451 int
452 bt_delete(dbp, key, flags)
453 	DB *dbp;
454 	DBT *key;
455 	u_long flags;
456 {
457 	BTREE_P t;
458 	BTHEADER *h;
459 	BTITEM *item;
460 	int ndel = 0;
461 
462 	t = (BTREE_P)dbp->internal;
463 
464 	if (flags == R_CURSOR)
465 		return (_bt_crsrdel(t));
466 
467 	/* find the first matching key in the tree */
468 	item = _bt_first(t, key);
469 	h = t->bt_curpage;
470 
471 	/* don't need the descent stack for deletes */
472 	while (_bt_pop(t) != P_NONE)
473 		continue;
474 
475 	/* delete all matching keys */
476 	for (;;) {
477 		while (VALIDITEM(t, item)
478 		       && (_bt_cmp(t, key->data, item->bti_index) == 0)) {
479 			if (_bt_delone(t, item->bti_index) == RET_ERROR)
480 				return (RET_ERROR);
481 			ndel++;
482 		}
483 
484 		if (VALIDITEM(t, item) || h->h_nextpg == P_NONE)
485 			break;
486 
487 		/* next page, if necessary */
488 		do {
489 			if (_bt_getpage(t, h->h_nextpg) == RET_ERROR)
490 				return (RET_ERROR);
491 			h = t->bt_curpage;
492 		} while (NEXTINDEX(h) == 0 && h->h_nextpg != P_NONE);
493 
494 		item->bti_pgno = h->h_pgno;
495 		item->bti_index = 0;
496 
497 		if (!VALIDITEM(t, item)
498 		    || _bt_cmp(t, key->data, item->bti_index) != 0)
499 			break;
500 	}
501 
502 	/* flush changes to disk */
503 	if (ISDISK(t)) {
504 		if (h->h_flags & F_DIRTY) {
505 			if (_bt_write(t, t->bt_curpage, NORELEASE) == RET_ERROR)
506 				return (RET_ERROR);
507 		}
508 	}
509 
510 	if (ndel == 0)
511 		return (RET_SPECIAL);
512 
513 	return (RET_SUCCESS);
514 }
515 
516 /*
517  *  BT_SYNC -- sync the btree to disk.
518  *
519  *	Parameters:
520  *		tree -- btree to sync.
521  *
522  *	Returns:
523  *		RET_SUCCESS, RET_ERROR.
524  */
525 
526 bt_sync(dbp)
527 	DB *dbp;
528 {
529 	BTREE_P t;
530 	BTHEADER *h;
531 	pgno_t pgno;
532 
533 	t = (BTREE_P)dbp->internal;
534 
535 	/* if this is an in-memory btree, syncing is a no-op */
536 	if (!ISDISK(t))
537 		return (RET_SUCCESS);
538 
539 	h = (BTHEADER *) t->bt_curpage;
540 	h->h_flags &= ~F_DIRTY;
541 
542 	if (ISCACHE(t)) {
543 		pgno = t->bt_curpage->h_pgno;
544 		if (_bt_write(t, h, RELEASE) == RET_ERROR)
545 			return(RET_ERROR);
546 		if (lrusync(t->bt_s.bt_d.d_cache) < RET_ERROR)
547 			return (RET_ERROR);
548 		if (_bt_getpage(t, pgno) == RET_ERROR)
549 			return (RET_ERROR);
550 	} else {
551 		if (_bt_write(t, h, NORELEASE) == RET_ERROR)
552 			return (RET_ERROR);
553 	}
554 
555 	return (fsync(t->bt_s.bt_d.d_fd));
556 }
557 
558 /*
559  *  BT_SEQ -- Sequential scan interface.
560  *
561  *	This routine supports sequential scans on the btree.  If called with
562  *	flags set to R_CURSOR, or if no seq scan has been initialized in the
563  *	current tree, we initialize the scan.  Otherwise, we advance the scan
564  *	and return the next item.
565  *
566  *	Scans can be either keyed or non-keyed.  Keyed scans basically have
567  *	a starting point somewhere in the middle of the tree.  Non-keyed
568  *	scans start at an endpoint.  Also, scans can be backward (descending
569  *	order), forward (ascending order), or no movement (keep returning
570  *	the same item).
571  *
572  *	Flags is checked every time we enter the routine, so the user can
573  *	change directions on an active scan if desired.  The key argument
574  *	is examined only when we initialize the scan, in order to position
575  *	it properly.
576  *
577  *	Items are returned via the key and data arguments passed in.
578  *
579  *	Parameters:
580  *		tree -- btree in which to do scan
581  *		key -- key, used to position scan on initialization, and
582  *		       used to return key components to the user.
583  *		data -- used to return data components to the user.
584  *		flags -- specify R_CURSOR, R_FIRST, R_LAST, R_NEXT, or
585  *			 R_PREV.
586  *
587  *	Returns:
588  *		RET_SUCCESS, RET_ERROR, or RET_SPECIAL if no more data
589  *		exists in the tree in the specified direction.
590  *
591  *	Side Effects:
592  *		Changes the btree's notion of the current position in the
593  *		scan.
594  *
595  *	Warnings:
596  *		The key and data items returned are static and will be
597  *		overwritten by the next bt_get or bt_seq.
598  */
599 
600 int
601 bt_seq(dbp, key, data, flags)
602 	DB *dbp;
603 	DBT *key, *data;
604 	u_long flags;
605 {
606 	BTREE_P t;
607 	BTHEADER *h;
608 	DATUM *d;
609 	int status;
610 
611 	t = (BTREE_P)dbp->internal;
612 
613 	/* do we need to initialize the scan? */
614 	if (flags == R_CURSOR || flags == R_LAST || flags == R_FIRST
615 	    || !(t->bt_flags & BTF_SEQINIT)) {
616 
617 		/* initialize it */
618 		status = _bt_seqinit(t, key, flags);
619 	} else {
620 		/* just advance the current scan pointer */
621 		status = _bt_seqadvance(t, flags);
622 	}
623 
624 	key->size = data->size = 0;
625 	key->data = data->data = (u_char *) NULL;
626 
627 	h = t->bt_curpage;
628 
629 	/* is there a valid item at the current scan location? */
630 	if (status == RET_SPECIAL) {
631 		if (flags == R_NEXT) {
632 			if (t->bt_cursor.c_index >= NEXTINDEX(h)) {
633 				if (NEXTINDEX(h) > 0)
634 					t->bt_cursor.c_index = NEXTINDEX(h) - 1;
635 				else
636 					t->bt_cursor.c_index = 0;
637 			}
638 		} else {
639 			t->bt_cursor.c_index = 0;
640 		}
641 		return (RET_SPECIAL);
642 	} else if (status == RET_ERROR)
643 		return (RET_ERROR);
644 
645 	/* okay, return the data */
646 	d = (DATUM *) GETDATUM(h, t->bt_cursor.c_index);
647 
648 	return (_bt_buildret(t, d, data, key));
649 }
650 
651 /*
652  *  BT_CLOSE -- Close a btree
653  *
654  *	Parameters:
655  *		tree -- tree to close
656  *
657  *	Returns:
658  *		RET_SUCCESS, RET_ERROR.
659  *
660  *	Side Effects:
661  *		Frees space occupied by the tree.
662  */
663 
664 int
665 bt_close(dbp)
666 	DB *dbp;
667 {
668 	struct HTBUCKET *b, *sb;
669 	BTREE_P t;
670 	BTHEADER *h;
671 	HTABLE ht;
672 	int fd, i;
673 	char *cache;
674 
675 	t = (BTREE_P)dbp->internal;
676 
677 	if (t->bt_cursor.c_key != (char *) NULL)
678 		(void) free(t->bt_cursor.c_key);
679 
680 	if (!ISDISK(t)) {
681 		/* in-memory tree, release hash table memory */
682 		ht = t->bt_s.bt_ht;
683 		for (i = 0; i < HTSIZE; i++) {
684 			if ((b = ht[i]) == (struct HTBUCKET *) NULL)
685 				break;
686 			do {
687 				sb = b;
688 				(void) free((char *) b->ht_page);
689 				b = b->ht_next;
690 				(void) free((char *) sb);
691 			} while (b != (struct HTBUCKET *) NULL);
692 		}
693 		(void) free ((char *) ht);
694 		(void) free ((char *) t);
695 		return (RET_SUCCESS);
696 	}
697 
698 	if ((t->bt_flags & BTF_ISWRITE) && !(t->bt_flags & BTF_METAOK)) {
699 		if (_bt_wrtmeta(t) == RET_ERROR)
700 			return (RET_ERROR);
701 	}
702 
703 	if (t->bt_curpage != (BTHEADER *) NULL) {
704 		h = t->bt_curpage;
705 		if (h->h_flags & F_DIRTY) {
706 			if (_bt_write(t, h, RELEASE) == RET_ERROR)
707 				return (RET_ERROR);
708 		} else {
709 			if (_bt_release(t, h) == RET_ERROR)
710 				return (RET_ERROR);
711 		}
712 
713 		/* flush and free the cache, if there is one */
714 		if (ISCACHE(t)) {
715 			cache = t->bt_s.bt_d.d_cache;
716 			if (lrusync(cache) == RET_ERROR)
717 				return (RET_ERROR);
718 			lrufree(cache);
719 		}
720 		(void) free ((char *) h);
721 	}
722 
723 	fd = t->bt_s.bt_d.d_fd;
724 	(void) free ((char *) t);
725 	return (close(fd));
726 }
727