1 /*- 2 * Copyright (c) 1990 The Regents of the University of California. 3 * All rights reserved. 4 * 5 * This code is derived from software contributed to Berkeley by 6 * Mike Olson. 7 * 8 * %sccs.include.redist.c% 9 */ 10 11 /* 12 * @(#)btree.h 5.2 (Berkeley) 02/22/91 13 */ 14 15 typedef char *BTREE; /* should really be (void *) */ 16 17 /* #define DEBUG */ 18 19 #define RET_ERROR -1 20 #define RET_SUCCESS 0 21 #define RET_SPECIAL 1 22 23 #ifndef TRUE 24 #define TRUE 1 25 #define FALSE 0 26 #endif /* ndef TRUE */ 27 28 #ifndef NULL 29 #define NULL 0 30 #endif /* ndef NULL */ 31 32 /* these are defined in lrucache.c */ 33 extern char *lruinit(); 34 extern char *lruget(); 35 extern char *lrugetnew(); 36 extern int lrusync(); 37 extern int lruwrite(); 38 extern int lrurelease(); 39 extern void lrufree(); 40 41 /* these are defined here */ 42 extern BTREE bt_open(); 43 extern int bt_close(); 44 extern int bt_delete(); 45 extern int bt_get(); 46 extern int bt_put(); 47 extern int bt_seq(); 48 extern int bt_sync(); 49 50 /* 51 * Private types. What you choose for these depends on how big you 52 * want to let files get, and how big you want to let pages get. 53 */ 54 55 typedef u_long index_t; /* so # bytes on a page fits in a long */ 56 typedef u_long pgno_t; /* so # of pages in a btree fits in a long */ 57 58 /* 59 * When we do searches, we push the parent page numbers onto a stack 60 * as we descend the tree. This is so that for insertions, we can 61 * find our way back up to do internal page insertions and splits. 62 */ 63 64 typedef struct BTSTACK { 65 pgno_t bts_pgno; 66 struct BTSTACK *bts_next; 67 } BTSTACK; 68 69 /* 70 * Every btree page has a header that looks like this. Flags are given 71 * in the #define's for the F_ flags (see below). 72 */ 73 74 typedef struct BTHEADER { 75 pgno_t h_pgno; /* page number of this page */ 76 pgno_t h_prevpg; /* left sibling */ 77 pgno_t h_nextpg; /* right sibling */ 78 79 #define F_LEAF 0x01 /* leaf page, contains user data */ 80 #define F_CONT 0x02 /* continuation page (large items) */ 81 #define F_DIRTY 0x04 /* need to write to disk */ 82 #define F_PRESERVE 0x08 /* never delete this chain of pages */ 83 84 u_long h_flags; /* page state */ 85 index_t h_lower; /* lower bound of free space on page */ 86 index_t h_upper; /* upper bound of free space on page */ 87 index_t h_linp[1]; /* VARIABLE LENGTH DATA AT END OF STRUCT */ 88 } BTHEADER; 89 90 /* 91 * HTBUCKETs are hash table buckets for looking up pages of in-memory 92 * btrees by page number. We use this indirection, rather than direct 93 * pointers, so that the code for manipulating in-memory trees is the 94 * same as that for manipulating on-disk trees. 95 */ 96 97 typedef struct HTBUCKET { 98 pgno_t ht_pgno; 99 BTHEADER *ht_page; 100 struct HTBUCKET *ht_next; 101 } HTBUCKET; 102 103 typedef HTBUCKET **HTABLE; 104 105 /* minimum size we'll let a page be */ 106 #define MINPSIZE 512 107 108 /* default cache size, in bytes */ 109 #define DEFCACHE (20 * 1024) 110 111 /* hash table size for in-memory trees */ 112 #define HTSIZE 128 113 114 /* generate a hash key from a page number */ 115 #define HASHKEY(pgno) ((pgno - 1) % HTSIZE) 116 117 /* 118 * Disk btrees have a file descriptor, and may also have an lru buffer 119 * cache, if the user asked for one. 120 */ 121 122 typedef struct BTDISK { 123 int d_fd; 124 char *d_cache; 125 } BTDISK; 126 127 /* 128 * Cursors keep track of the current location in a sequential scan of 129 * the database. Since btrees impose a total ordering on keys, we can 130 * walk forward or backward through the database from any point. Cursors 131 * survive updates to the tree, and can be used to delete a particular 132 * record. 133 */ 134 135 typedef struct CURSOR { 136 pgno_t c_pgno; /* pgno of current item in scan */ 137 index_t c_index; /* index of current item in scan */ 138 char *c_key; /* current key, used for updates */ 139 140 #define CRSR_BEFORE 0x01 141 142 u_char c_flags; /* to handle updates properly */ 143 } CURSOR; 144 145 /* 146 * The private btree data structure. The user passes a pointer to one of 147 * these when we are to manipulate a tree, but the BTREE type is opaque 148 * to him. 149 */ 150 151 typedef struct BTREEDATA_P { 152 char *bt_fname; /* NULL for in-memory trees */ 153 union { 154 BTDISK bt_d; /* for on-disk btrees */ 155 HTABLE bt_ht; /* hash table for mem trees */ 156 } bt_s; 157 size_t bt_psize; /* page size for btree pages */ 158 int (*bt_compare)(); /* key comparison function */ 159 pgno_t bt_npages; /* number of pages in tree */ 160 BTHEADER *bt_curpage; /* current page contents */ 161 pgno_t bt_free; /* free pg list for big data */ 162 CURSOR bt_cursor; /* cursor for scans */ 163 BTSTACK *bt_stack; /* parent stack for inserts */ 164 u_long bt_lorder; /* byte order (endian.h) */ 165 166 #define BTF_METAOK 0x01 /* meta-data written to start of file */ 167 #define BTF_SEQINIT 0x02 /* we have called bt_seq */ 168 #define BTF_ISWRITE 0x04 /* tree was opened for write */ 169 #define BTF_NODUPS 0x08 /* tree created for unique keys */ 170 171 u_long bt_flags; /* btree state */ 172 } BTREEDATA_P; 173 174 typedef BTREEDATA_P *BTREE_P; 175 176 /* 177 * The first thing in a btree file is a BTMETA structure. The rest of 178 * the first page is empty, so that all disk operations are page-aligned. 179 */ 180 181 typedef struct BTMETA { 182 u_long m_magic; 183 u_long m_version; 184 size_t m_psize; 185 pgno_t m_free; 186 u_long m_flags; 187 u_long m_lorder; 188 } BTMETA; 189 190 #define P_NONE 0 /* invalid page number in tree */ 191 #define P_ROOT 1 /* page number of root pg in btree */ 192 193 #define NORELEASE 0 /* don't release a page during write */ 194 #define RELEASE 1 /* release a page during write */ 195 196 #define INSERT 0 /* doing an insert operation */ 197 #define DELETE 1 /* doing a delete operation */ 198 199 /* get the next free index on a btree page */ 200 #define NEXTINDEX(p) ((((int)(p)->h_lower) - ((int)((((char *)(&(p)->h_linp[0]))) - ((char *) (p)))))/(sizeof(index_t))) 201 202 /* is a BTITEM actually on the btree page? */ 203 #define VALIDITEM(t, i) ((i)->bti_index < NEXTINDEX((t)->bt_curpage)) 204 205 /* guarantee longword alignment so structure refs work */ 206 #define LONGALIGN(p) (((long)(p) + 3) & ~ 0x03) 207 208 /* get a particular datum (or idatum) off a page */ 209 #define GETDATUM(h,i) (((char *) h) + h->h_linp[i]) 210 211 /* is a {key,datum} too big to put on a single page? */ 212 #define TOOBIG(t, sz) (sz >= t->bt_psize / 5) 213 214 /* is this a disk tree or a memory tree? */ 215 #define ISDISK(t) (t->bt_fname != (char *) NULL) 216 217 /* does the disk tree use a cache? */ 218 #define ISCACHE(t) (t->bt_s.bt_d.d_cache != (char *) NULL) 219 220 /* 221 * DATUMs are for user data -- one appears on leaf pages for every 222 * tree entry. The d_bytes[] array contains the key first, then the data. 223 * 224 * If either the key or the datum is too big to store on a single page, 225 * a bit is set in the flags entry, and the d_bytes[] array contains a 226 * pgno pointing to the page at which the data is actually stored. 227 * 228 * Note on alignment: every DATUM is guaranteed to be longword aligned 229 * on the disk page. In order to force longword alignment of user key 230 * and data values, we must guarantee that the d_bytes[] array starts 231 * on a longword boundary. This is the reason that d_flags is a u_long, 232 * rather than a u_char (it really only needs to be two bits big). This 233 * is necessary because we call the user's comparison function with a 234 * pointer to the start of the d_bytes array. We don't need to force 235 * longword alignment of the data following the key, since that is copied 236 * to a longword-aligned buffer before being returned to the user. 237 */ 238 239 typedef struct DATUM { 240 size_t d_ksize; /* size of key */ 241 size_t d_dsize; /* size of data */ 242 243 #define D_BIGDATA 0x01 /* indirect datum ptr flag */ 244 #define D_BIGKEY 0x02 /* indirect key ptr flag */ 245 246 u_long d_flags; /* flags (indirect bit) */ 247 char d_bytes[1]; /* VARIABLE LENGTH DATA AT END OF STRUCT */ 248 } DATUM; 249 250 /* BTITEMs are used to return (page, index, datum) tuples from searches */ 251 typedef struct BTITEM { 252 pgno_t bti_pgno; 253 index_t bti_index; 254 DATUM *bti_datum; 255 } BTITEM; 256 257 /* 258 * IDATUMs are for data stored on internal pages. This is the (key, pgno) 259 * pair, such that key 'key' is the first entry on page 'pgno'. If our 260 * internal page contains keys (a) and (b) next to each other, then all 261 * items >= to (a) and < (b) go on the same page as (a). There are some 262 * gotchas with duplicate keys, however. See the split code for details. 263 * 264 * If a key is too big to fit on a single page, then the i_bytes[] array 265 * contains a pgno pointing to the start of a chain that actually stores 266 * the bytes. Since items on internal pages are never deleted from the 267 * tree, these indirect chains are marked as special, so that they won't 268 * be deleted if the corresponding leaf item is deleted. 269 * 270 * As for DATUMs, IDATUMs have a u_long flag entry (rather than u_char) 271 * in order to guarantee that user keys are longword aligned on the disk 272 * page. 273 */ 274 275 typedef struct IDATUM { 276 size_t i_size; 277 pgno_t i_pgno; 278 u_long i_flags; /* see DATUM.d_flags, above */ 279 char i_bytes[1]; /* VARIABLE LENGTH DATA AT END OF STRUCT */ 280 } IDATUM; 281 282 /* all private interfaces have a leading _ in their names */ 283 extern BTITEM *_bt_search(); 284 extern BTITEM *_bt_searchr(); 285 extern BTHEADER *_bt_allocpg(); 286 extern index_t _bt_binsrch(); 287 extern int _bt_isonpage(); 288 extern BTITEM *_bt_first(); 289 extern int _bt_release(); 290 extern int _bt_wrtmeta(); 291 extern int _bt_delindir(); 292 extern int _bt_pgout(); 293 extern int _bt_pgin(); 294 extern int _bt_fixscan(); 295 extern int _bt_indirect(); 296 extern int _bt_crsrdel(); 297 extern int _bt_push(); 298 extern pgno_t _bt_pop(); 299 extern int strcmp(); 300 301