xref: /original-bsd/lib/libc/db/btree/btree.h (revision 1afa398d)
1 /*-
2  * Copyright (c) 1990 The Regents of the University of California.
3  * All rights reserved.
4  *
5  * This code is derived from software contributed to Berkeley by
6  * Mike Olson.
7  *
8  * %sccs.include.redist.c%
9  */
10 
11 /*
12  *  @(#)btree.h	5.2 (Berkeley) 02/22/91
13  */
14 
15 typedef char	*BTREE;		/* should really be (void *) */
16 
17 /* #define	DEBUG */
18 
19 #define RET_ERROR	-1
20 #define RET_SUCCESS	 0
21 #define RET_SPECIAL	 1
22 
23 #ifndef TRUE
24 #define TRUE	1
25 #define FALSE	0
26 #endif /* ndef TRUE */
27 
28 #ifndef NULL
29 #define NULL	0
30 #endif /* ndef NULL */
31 
32 /* these are defined in lrucache.c */
33 extern char	*lruinit();
34 extern char	*lruget();
35 extern char	*lrugetnew();
36 extern int	lrusync();
37 extern int	lruwrite();
38 extern int	lrurelease();
39 extern void	lrufree();
40 
41 /* these are defined here */
42 extern BTREE	bt_open();
43 extern int	bt_close();
44 extern int	bt_delete();
45 extern int	bt_get();
46 extern int	bt_put();
47 extern int	bt_seq();
48 extern int	bt_sync();
49 
50 /*
51  *  Private types.  What you choose for these depends on how big you
52  *  want to let files get, and how big you want to let pages get.
53  */
54 
55 typedef u_long	index_t;	/* so # bytes on a page fits in a long */
56 typedef u_long	pgno_t;		/* so # of pages in a btree fits in a long */
57 
58 /*
59  *  When we do searches, we push the parent page numbers onto a stack
60  *  as we descend the tree.  This is so that for insertions, we can
61  *  find our way back up to do internal page insertions and splits.
62  */
63 
64 typedef struct BTSTACK {
65 	pgno_t		bts_pgno;
66 	struct BTSTACK	*bts_next;
67 } BTSTACK;
68 
69 /*
70  *  Every btree page has a header that looks like this.  Flags are given
71  *  in the #define's for the F_ flags (see below).
72  */
73 
74 typedef struct BTHEADER {
75 	pgno_t h_pgno;		/* page number of this page */
76 	pgno_t h_prevpg;	/* left sibling */
77 	pgno_t h_nextpg;	/* right sibling */
78 
79 #define F_LEAF		0x01	/* leaf page, contains user data */
80 #define F_CONT		0x02	/* continuation page (large items) */
81 #define F_DIRTY		0x04	/* need to write to disk */
82 #define F_PRESERVE	0x08	/* never delete this chain of pages */
83 
84 	u_long h_flags;		/* page state */
85 	index_t h_lower;	/* lower bound of free space on page */
86 	index_t h_upper;	/* upper bound of free space on page */
87 	index_t h_linp[1];	/* VARIABLE LENGTH DATA AT END OF STRUCT */
88 } BTHEADER;
89 
90 /*
91  *  HTBUCKETs are hash table buckets for looking up pages of in-memory
92  *  btrees by page number.  We use this indirection, rather than direct
93  *  pointers, so that the code for manipulating in-memory trees is the
94  *  same as that for manipulating on-disk trees.
95  */
96 
97 typedef struct HTBUCKET {
98 	pgno_t		ht_pgno;
99 	BTHEADER	*ht_page;
100 	struct HTBUCKET	*ht_next;
101 } HTBUCKET;
102 
103 typedef HTBUCKET	**HTABLE;
104 
105 /* minimum size we'll let a page be */
106 #define MINPSIZE	512
107 
108 /* default cache size, in bytes */
109 #define DEFCACHE	(20 * 1024)
110 
111 /* hash table size for in-memory trees */
112 #define	HTSIZE		128
113 
114 /* generate a hash key from a page number */
115 #define HASHKEY(pgno)	((pgno - 1) % HTSIZE)
116 
117 /*
118  *  Disk btrees have a file descriptor, and may also have an lru buffer
119  *  cache, if the user asked for one.
120  */
121 
122 typedef struct BTDISK {
123 	int	d_fd;
124 	char	*d_cache;
125 } BTDISK;
126 
127 /*
128  *  Cursors keep track of the current location in a sequential scan of
129  *  the database.  Since btrees impose a total ordering on keys, we can
130  *  walk forward or backward through the database from any point.  Cursors
131  *  survive updates to the tree, and can be used to delete a particular
132  *  record.
133  */
134 
135 typedef struct CURSOR {
136 	pgno_t		c_pgno;		/* pgno of current item in scan */
137 	index_t		c_index;	/* index of current item in scan */
138 	char		*c_key;		/* current key, used for updates */
139 
140 #define CRSR_BEFORE	0x01
141 
142 	u_char		c_flags;	/* to handle updates properly */
143 } CURSOR;
144 
145 /*
146  *  The private btree data structure.  The user passes a pointer to one of
147  *  these when we are to manipulate a tree, but the BTREE type is opaque
148  *  to him.
149  */
150 
151 typedef struct BTREEDATA_P {
152 	char		*bt_fname;		/* NULL for in-memory trees */
153 	union {
154 		BTDISK	bt_d;			/* for on-disk btrees */
155 		HTABLE	bt_ht;			/* hash table for mem trees */
156 	} bt_s;
157 	size_t		bt_psize;		/* page size for btree pages */
158 	int		(*bt_compare)();	/* key comparison function */
159 	pgno_t		bt_npages;		/* number of pages in tree */
160 	BTHEADER	*bt_curpage;		/* current page contents */
161 	pgno_t		bt_free;		/* free pg list for big data */
162 	CURSOR		bt_cursor;		/* cursor for scans */
163 	BTSTACK		*bt_stack;		/* parent stack for inserts */
164 	u_long		bt_lorder;		/* byte order (endian.h) */
165 
166 #define BTF_METAOK	0x01	/* meta-data written to start of file */
167 #define BTF_SEQINIT	0x02	/* we have called bt_seq */
168 #define BTF_ISWRITE	0x04	/* tree was opened for write */
169 #define BTF_NODUPS	0x08	/* tree created for unique keys */
170 
171 	u_long		bt_flags;		/* btree state */
172 } BTREEDATA_P;
173 
174 typedef BTREEDATA_P	*BTREE_P;
175 
176 /*
177  *  The first thing in a btree file is a BTMETA structure.  The rest of
178  *  the first page is empty, so that all disk operations are page-aligned.
179  */
180 
181 typedef struct BTMETA {
182 	u_long	m_magic;
183 	u_long	m_version;
184 	size_t	m_psize;
185 	pgno_t	m_free;
186 	u_long	m_flags;
187 	u_long	m_lorder;
188 } BTMETA;
189 
190 #define P_NONE		0		/* invalid page number in tree */
191 #define P_ROOT		1		/* page number of root pg in btree */
192 
193 #define NORELEASE	0		/* don't release a page during write */
194 #define RELEASE		1		/* release a page during write */
195 
196 #define INSERT		0		/* doing an insert operation */
197 #define DELETE		1		/* doing a delete operation */
198 
199 /* get the next free index on a btree page */
200 #define NEXTINDEX(p)	((((int)(p)->h_lower) - ((int)((((char *)(&(p)->h_linp[0]))) - ((char *) (p)))))/(sizeof(index_t)))
201 
202 /* is a BTITEM actually on the btree page? */
203 #define VALIDITEM(t, i)	((i)->bti_index < NEXTINDEX((t)->bt_curpage))
204 
205 /* guarantee longword alignment so structure refs work */
206 #define LONGALIGN(p) (((long)(p) + 3) & ~ 0x03)
207 
208 /* get a particular datum (or idatum) off a page */
209 #define GETDATUM(h,i)	 (((char *) h) + h->h_linp[i])
210 
211 /* is a {key,datum} too big to put on a single page? */
212 #define TOOBIG(t, sz)	(sz >= t->bt_psize / 5)
213 
214 /* is this a disk tree or a memory tree? */
215 #define ISDISK(t)	(t->bt_fname != (char *) NULL)
216 
217 /* does the disk tree use a cache? */
218 #define ISCACHE(t)	(t->bt_s.bt_d.d_cache != (char *) NULL)
219 
220 /*
221  *  DATUMs are for user data -- one appears on leaf pages for every
222  *  tree entry.  The d_bytes[] array contains the key first, then the data.
223  *
224  *  If either the key or the datum is too big to store on a single page,
225  *  a bit is set in the flags entry, and the d_bytes[] array contains a
226  *  pgno pointing to the page at which the data is actually stored.
227  *
228  *  Note on alignment:  every DATUM is guaranteed to be longword aligned
229  *  on the disk page.  In order to force longword alignment of user key
230  *  and data values, we must guarantee that the d_bytes[] array starts
231  *  on a longword boundary.  This is the reason that d_flags is a u_long,
232  *  rather than a u_char (it really only needs to be two bits big).  This
233  *  is necessary because we call the user's comparison function with a
234  *  pointer to the start of the d_bytes array.  We don't need to force
235  *  longword alignment of the data following the key, since that is copied
236  *  to a longword-aligned buffer before being returned to the user.
237  */
238 
239 typedef struct DATUM {
240 	size_t d_ksize;		/* size of key */
241 	size_t d_dsize;		/* size of data */
242 
243 #define D_BIGDATA	0x01	/* indirect datum ptr flag */
244 #define D_BIGKEY	0x02	/* indirect key ptr flag */
245 
246 	u_long d_flags;		/* flags (indirect bit) */
247 	char d_bytes[1];	/* VARIABLE LENGTH DATA AT END OF STRUCT */
248 } DATUM;
249 
250 /* BTITEMs are used to return (page, index, datum) tuples from searches */
251 typedef struct BTITEM {
252 	pgno_t bti_pgno;
253 	index_t bti_index;
254 	DATUM *bti_datum;
255 } BTITEM;
256 
257 /*
258  *  IDATUMs are for data stored on internal pages.  This is the (key, pgno)
259  *  pair, such that key 'key' is the first entry on page 'pgno'.  If our
260  *  internal page contains keys (a) and (b) next to each other, then all
261  *  items >= to (a) and < (b) go on the same page as (a).  There are some
262  *  gotchas with duplicate keys, however.  See the split code for details.
263  *
264  *  If a key is too big to fit on a single page, then the i_bytes[] array
265  *  contains a pgno pointing to the start of a chain that actually stores
266  *  the bytes.  Since items on internal pages are never deleted from the
267  *  tree, these indirect chains are marked as special, so that they won't
268  *  be deleted if the corresponding leaf item is deleted.
269  *
270  *  As for DATUMs, IDATUMs have a u_long flag entry (rather than u_char)
271  *  in order to guarantee that user keys are longword aligned on the disk
272  *  page.
273  */
274 
275 typedef struct IDATUM {
276 	size_t i_size;
277 	pgno_t i_pgno;
278 	u_long i_flags;		/* see DATUM.d_flags, above */
279 	char i_bytes[1];	/* VARIABLE LENGTH DATA AT END OF STRUCT */
280 } IDATUM;
281 
282 /* all private interfaces have a leading _ in their names */
283 extern BTITEM	*_bt_search();
284 extern BTITEM	*_bt_searchr();
285 extern BTHEADER	*_bt_allocpg();
286 extern index_t	_bt_binsrch();
287 extern int	_bt_isonpage();
288 extern BTITEM	*_bt_first();
289 extern int	_bt_release();
290 extern int	_bt_wrtmeta();
291 extern int	_bt_delindir();
292 extern int	_bt_pgout();
293 extern int	_bt_pgin();
294 extern int	_bt_fixscan();
295 extern int	_bt_indirect();
296 extern int	_bt_crsrdel();
297 extern int	_bt_push();
298 extern pgno_t	_bt_pop();
299 extern int	strcmp();
300 
301