xref: /dragonfly/sys/vfs/hammer/hammer_btree.h (revision 030b3383)
1 /*
2  * Copyright (c) 2007 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  * $DragonFly: src/sys/vfs/hammer/hammer_btree.h,v 1.24 2008/06/26 04:06:22 dillon Exp $
35  */
36 
37 #ifndef VFS_HAMMER_BTREE_H_
38 #define VFS_HAMMER_BTREE_H_
39 
40 /*
41  * HAMMER B-Tree index
42  *
43  * HAMMER implements a modified B+Tree.   B+Trees store records only
44  * at their leaves and HAMMER's modification is to adjust the internal
45  * elements so there is a boundary element on each side instead of sub-tree
46  * pointers.
47  *
48  * We just call our modified B+Tree a 'B-Tree' in HAMMER documentation to
49  * reduce confusion.
50  *
51  * A B-Tree internal node looks like this:
52  *
53  *	B N N N N N N B   <-- boundary and internal elements
54  *       S S S S S S S    <-- subtree pointers
55  *
56  * A B-Tree leaf node looks like this:
57  *
58  *	L L L L L L L L   <-- leaf elemenets
59  *			      (there is also a previous and next-leaf pointer)
60  *
61  * The recursion radix of an internal node is reduced by 1 relative to
62  * a normal B-Tree in order to accomodate the right-hand boundary.
63  * The left-hand boundary (B in the left) is integrated into the first
64  * element so it doesn't require 2 elements to accomodate boundaries.
65  *
66  * The big benefit to using a B-Tree with built-in bounds information is
67  * that it makes it possible to cache pointers into the middle of the tree
68  * and not have to start searches, insertions, OR deletions at the root node.
69  * The boundary elements allow searches to progress in a definitive direction
70  * from any point in the tree without revisting nodes.  It is also possible
71  * to terminate searches early and make minor adjustments to the boundaries
72  * (within the confines of the parent's boundaries) on the fly.  This greatly
73  * improves the efficiency of many operations.
74  *
75  * All of the structures below are on-disk structures.
76  */
77 
78 /*
79  * Common base for all B-Tree element types (40 bytes)
80  *
81  * btype field represents a type of B-Tree ondisk structure that this
82  * B-Tree element points to, but not a type of B-Tree node that this
83  * B-Tree element is a part of.  btype could be HAMMER_BTREE_TYPE_RECORD
84  * as well as HAMMER_BTREE_TYPE_INTERNAL and HAMMER_BTREE_TYPE_LEAF,
85  * while B-Tree node type is never HAMMER_BTREE_TYPE_RECORD.
86  *
87  * The following fields are keys used by hammer_btree_cmp() to compare
88  * B-Tree elements listed from higher to lower priority on comparison.
89  * B-Tree elements are first grouped by localization value, and then
90  * obj_id within a subtree of the same localization value, and so on.
91  *
92  * 1. localization
93  * 2. obj_id
94  * 3. rec_type
95  * 4. key
96  * 5. create_id
97  */
98 struct hammer_base_elm {
99 	int64_t	obj_id;		/* 00 object record is associated with */
100 	int64_t key;		/* 08 indexing key (offset or namekey) */
101 
102 	hammer_tid_t create_tid; /* 10 transaction id for record creation */
103 	hammer_tid_t delete_tid; /* 18 transaction id for record update/del */
104 
105 	uint16_t rec_type;	/* 20 HAMMER_RECTYPE_ */
106 	uint8_t obj_type;	/* 22 HAMMER_OBJTYPE_ */
107 	uint8_t btype;		/* 23 B-Tree element type */
108 	uint32_t localization;	/* 24 B-Tree localization parameter */
109 				/* 28 */
110 };
111 
112 typedef struct hammer_base_elm *hammer_base_elm_t;
113 
114 /*
115  * Localization has sorting priority over the obj_id,rec_type,key,tid
116  * and is used to localize inodes for very fast directory scans.
117  *
118  * Localization can also be used to create pseudo-filesystems within
119  * a HAMMER filesystem.  Pseudo-filesystems would be suitable
120  * replication targets.
121  *
122  * Upper 16 bits of the localization field in struct hammer_base_elm
123  * represents pseudo-filesystem id ranging from default 0 to 65535,
124  * and lower 16 bits represents its localization type in bitfield
125  * where 0x1 means the element is for inode and 0x2 means the element
126  * is for anything other than inode.  Note that 0x3 (0x1|0x2) is not
127  * a valid type, while 0 and 0xFFFF are valid types for some cases.
128  *
129  * The root inode (not the PFS root inode but the real root) uses
130  * HAMMER_DEF_LOCALIZATION for its incore ip->obj_localization.
131  * HAMMER_DEF_LOCALIZATION implies PFS 0 and no localization type.
132  */
133 #define HAMMER_LOCALIZE_INODE		0x00000001
134 #define HAMMER_LOCALIZE_MISC		0x00000002  /* not inode */
135 #define HAMMER_LOCALIZE_MASK		0x0000FFFF
136 #define HAMMER_LOCALIZE_PSEUDOFS_MASK	0xFFFF0000
137 
138 #define HAMMER_MIN_LOCALIZATION		0x00000000U
139 #define HAMMER_MAX_LOCALIZATION		0x0000FFFFU
140 #define HAMMER_DEF_LOCALIZATION		0x00000000U
141 
142 #define lo_to_pfs(lo)					\
143 	((int)(((lo) & HAMMER_LOCALIZE_PSEUDOFS_MASK) >> 16))
144 #define pfs_to_lo(pfs)					\
145 	((((uint32_t)(pfs)) << 16) & HAMMER_LOCALIZE_PSEUDOFS_MASK)
146 
147 /*
148  * Internal element (40 + 24 = 64 bytes).
149  *
150  * An internal element contains a recursion to another B-Tree node.
151  */
152 struct hammer_btree_internal_elm {
153 	struct hammer_base_elm base;
154 	hammer_tid_t	mirror_tid;		/* mirroring support */
155 	hammer_off_t	subtree_offset;
156 	int32_t		unused02;
157 	int32_t		unused03;
158 };
159 
160 typedef struct hammer_btree_internal_elm *hammer_btree_internal_elm_t;
161 
162 /*
163  * Leaf B-Tree element (40 + 24 = 64 bytes).
164  *
165  * NOTE: create_ts/delete_ts are not used by any core algorithms, they are
166  *       used only by userland to present nominal real-time date strings
167  *	 to the user.
168  */
169 struct hammer_btree_leaf_elm {
170 	struct hammer_base_elm base;
171 	uint32_t	create_ts;
172 	uint32_t	delete_ts;
173 	hammer_off_t	data_offset;
174 	int32_t		data_len;
175 	hammer_crc_t	data_crc;
176 };
177 
178 typedef struct hammer_btree_leaf_elm *hammer_btree_leaf_elm_t;
179 
180 /*
181  * Rollup btree leaf element types - 64 byte structure
182  */
183 union hammer_btree_elm {
184 	struct hammer_base_elm		base;
185 	struct hammer_btree_leaf_elm	leaf;
186 	struct hammer_btree_internal_elm internal;
187 };
188 
189 typedef union hammer_btree_elm *hammer_btree_elm_t;
190 
191 /*
192  * B-Tree node (64x64 = 4K structure)
193  *
194  * Each node contains 63 elements.  The last element for an internal node
195  * is the right-boundary so internal nodes have one fewer logical elements
196  * then leaf nodes.
197  *
198  * 'count' always refers to the number of elements and is non-inclusive of
199  * the right-hand boundary for an internal node. For a leaf node, 'count'
200  * refers to the number of elements and there is no idea of boundaries.
201  *
202  * The use of a fairly large radix is designed to reduce the number of
203  * discrete disk accesses required to locate something.  Keep in mind
204  * that nodes are allocated out of 16K hammer buffers so supported values
205  * are (256-1), (128-1), (64-1), (32-1), or (16-1). HAMMER uses 63-way
206  * so the node size is (64x(1+(64-1))) = 4KB.
207  *
208  * NOTE: FUTURE EXPANSION: The reserved fields in hammer_node_ondisk are
209  * reserved for left/right leaf linkage fields, flags, and other future
210  * features.
211  */
212 #define HAMMER_BTREE_TYPE_INTERNAL	((uint8_t)'I')
213 #define HAMMER_BTREE_TYPE_LEAF		((uint8_t)'L')
214 #define HAMMER_BTREE_TYPE_RECORD	((uint8_t)'R')
215 #define HAMMER_BTREE_TYPE_DELETED	((uint8_t)'D')
216 #define HAMMER_BTREE_TYPE_NONE		((uint8_t)0)
217 
218 #define HAMMER_BTREE_LEAF_ELMS	63
219 #define HAMMER_BTREE_INT_ELMS	(HAMMER_BTREE_LEAF_ELMS - 1)
220 
221 struct hammer_node_ondisk {
222 	/*
223 	 * B-Tree node header (64 bytes)
224 	 */
225 	hammer_crc_t	crc;		/* MUST BE FIRST FIELD OF STRUCTURE */
226 	uint32_t	reserved00;
227 	hammer_off_t	parent;		/* 0 if at root of B-Tree */
228 	int32_t		count;		/* maximum 62 for INTERNAL, 63 for LEAF */
229 	uint8_t		type;		/* B-Tree node type (INTERNAL or LEAF) */
230 	uint8_t		reserved01;
231 	uint16_t	reserved02;
232 	hammer_off_t	reserved03;
233 	hammer_off_t	reserved04;
234 	hammer_off_t	reserved05;
235 	hammer_off_t	reserved06;
236 	hammer_tid_t	mirror_tid;	/* mirroring support (aggregator) */
237 
238 	/*
239 	 * B-Tree node element array (64x63 bytes)
240 	 *
241 	 * Internal nodes have one less logical element
242 	 * (meaning: the same number of physical elements) in order to
243 	 * accomodate the right-hand boundary.  The left-hand boundary
244 	 * is integrated into the first element.  Leaf nodes have no
245 	 * boundary elements.
246 	 */
247 	union hammer_btree_elm elms[HAMMER_BTREE_LEAF_ELMS];
248 };
249 
250 typedef struct hammer_node_ondisk *hammer_node_ondisk_t;
251 
252 #define HAMMER_BTREE_CRCSIZE	\
253 	(sizeof(struct hammer_node_ondisk) - sizeof(hammer_crc_t))
254 
255 /*
256  * Return 1 if elm is a node element of an internal node,
257  * otherwise return 0.
258  */
259 static __inline
260 int
261 hammer_is_internal_node_elm(hammer_btree_elm_t elm)
262 {
263 	switch (elm->base.btype) {
264 	case HAMMER_BTREE_TYPE_INTERNAL:
265 	case HAMMER_BTREE_TYPE_LEAF:
266 		return(1);
267 	}
268 	return(0);
269 }
270 
271 /*
272  * Return 1 if elm is a node element of a leaf node,
273  * otherwise return 0.
274  */
275 static __inline
276 int
277 hammer_is_leaf_node_elm(hammer_btree_elm_t elm)
278 {
279 	switch (elm->base.btype) {
280 	case HAMMER_BTREE_TYPE_RECORD:
281 		return(1);
282 	}
283 	return(0);
284 }
285 
286 static __inline
287 int
288 hammer_node_max_elements(uint8_t type)
289 {
290 	switch (type) {
291 	case HAMMER_BTREE_TYPE_LEAF:
292 		return(HAMMER_BTREE_LEAF_ELMS);
293 	case HAMMER_BTREE_TYPE_INTERNAL:
294 		return(HAMMER_BTREE_INT_ELMS);
295 	}
296 	return(-1);  /* invalid type */
297 }
298 
299 static __inline
300 char
301 hammer_elm_btype(hammer_btree_elm_t elm)
302 {
303 	switch(elm->base.btype) {
304 	case HAMMER_BTREE_TYPE_INTERNAL:
305 	case HAMMER_BTREE_TYPE_LEAF:
306 	case HAMMER_BTREE_TYPE_RECORD:
307 	case HAMMER_BTREE_TYPE_DELETED:
308 		return(elm->base.btype);  /* ascii */
309 	case HAMMER_BTREE_TYPE_NONE:
310 		return('*');
311 	default:
312 		return('?');
313 	}
314 }
315 
316 #endif /* !VFS_HAMMER_BTREE_H_ */
317