1 /* 2 * Copyright (c) 2007 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * $DragonFly: src/sys/vfs/hammer/hammer_btree.h,v 1.24 2008/06/26 04:06:22 dillon Exp $ 35 */ 36 37 #ifndef VFS_HAMMER_BTREE_H_ 38 #define VFS_HAMMER_BTREE_H_ 39 40 /* 41 * HAMMER B-Tree index 42 * 43 * HAMMER implements a modified B+Tree. B+Trees store records only 44 * at their leaves and HAMMER's modification is to adjust the internal 45 * elements so there is a boundary element on each side instead of sub-tree 46 * pointers. 47 * 48 * We just call our modified B+Tree a 'B-Tree' in HAMMER documentation to 49 * reduce confusion. 50 * 51 * A B-Tree internal node looks like this: 52 * 53 * B N N N N N N B <-- boundary and internal elements 54 * S S S S S S S <-- subtree pointers 55 * 56 * A B-Tree leaf node looks like this: 57 * 58 * L L L L L L L L <-- leaf elemenets 59 * (there is also a previous and next-leaf pointer) 60 * 61 * The recursion radix of an internal node is reduced by 1 relative to 62 * a normal B-Tree in order to accomodate the right-hand boundary. 63 * The left-hand boundary (B in the left) is integrated into the first 64 * element so it doesn't require 2 elements to accomodate boundaries. 65 * 66 * The big benefit to using a B-Tree with built-in bounds information is 67 * that it makes it possible to cache pointers into the middle of the tree 68 * and not have to start searches, insertions, OR deletions at the root node. 69 * The boundary elements allow searches to progress in a definitive direction 70 * from any point in the tree without revisting nodes. It is also possible 71 * to terminate searches early and make minor adjustments to the boundaries 72 * (within the confines of the parent's boundaries) on the fly. This greatly 73 * improves the efficiency of many operations. 74 * 75 * All of the structures below are on-disk structures. 76 */ 77 78 /* 79 * Common base for all B-Tree element types (40 bytes) 80 * 81 * btype field represents a type of B-Tree ondisk structure that this 82 * B-Tree element points to, but not a type of B-Tree node that this 83 * B-Tree element is a part of. btype could be HAMMER_BTREE_TYPE_RECORD 84 * as well as HAMMER_BTREE_TYPE_INTERNAL and HAMMER_BTREE_TYPE_LEAF, 85 * while B-Tree node type is never HAMMER_BTREE_TYPE_RECORD. 86 * 87 * The following fields are keys used by hammer_btree_cmp() to compare 88 * B-Tree elements listed from higher to lower priority on comparison. 89 * B-Tree elements are first grouped by localization value, and then 90 * obj_id within a subtree of the same localization value, and so on. 91 * 92 * 1. localization 93 * 2. obj_id 94 * 3. rec_type 95 * 4. key 96 * 5. create_tid 97 */ 98 typedef struct hammer_base_elm { 99 int64_t obj_id; /* 00 object record is associated with */ 100 int64_t key; /* 08 indexing key (offset or namekey) */ 101 102 hammer_tid_t create_tid; /* 10 transaction id for record creation */ 103 hammer_tid_t delete_tid; /* 18 transaction id for record update/del */ 104 105 uint16_t rec_type; /* 20 HAMMER_RECTYPE_ */ 106 uint8_t obj_type; /* 22 HAMMER_OBJTYPE_ */ 107 uint8_t btype; /* 23 B-Tree element type */ 108 uint32_t localization; /* 24 B-Tree localization parameter */ 109 /* 28 */ 110 } *hammer_base_elm_t; 111 112 /* 113 * Localization has sorting priority over the obj_id,rec_type,key,tid 114 * and is used to localize inodes for very fast directory scans. 115 * 116 * Localization can also be used to create pseudo-filesystems within 117 * a HAMMER filesystem. Pseudo-filesystems would be suitable 118 * replication targets. 119 * 120 * Upper 16 bits of the localization field in struct hammer_base_elm 121 * represents pseudo-filesystem id ranging from default 0 to 65535, 122 * and lower 16 bits represents its localization type in bitfield 123 * where 0x1 means the element is for inode and 0x2 means the element 124 * is for anything other than inode. Note that 0x3 (0x1|0x2) is not 125 * a valid type, while 0 and 0xFFFF are valid types for some cases. 126 * 127 * The root inode (not the PFS root inode but the real root) uses 128 * HAMMER_DEF_LOCALIZATION for its incore ip->obj_localization. 129 * HAMMER_DEF_LOCALIZATION implies PFS#0 and no localization type. 130 */ 131 #define HAMMER_LOCALIZE_INODE 0x00000001 132 #define HAMMER_LOCALIZE_MISC 0x00000002 /* not inode */ 133 #define HAMMER_LOCALIZE_MASK 0x0000FFFF 134 #define HAMMER_LOCALIZE_PSEUDOFS_MASK 0xFFFF0000 135 136 #define HAMMER_MIN_LOCALIZATION 0x00000000U 137 #define HAMMER_MAX_LOCALIZATION 0x0000FFFFU 138 #define HAMMER_DEF_LOCALIZATION 0x00000000U 139 140 #define HAMMER_MIN_ONDISK_LOCALIZATION \ 141 HAMMER_MIN_LOCALIZATION 142 #define HAMMER_MAX_ONDISK_LOCALIZATION \ 143 (HAMMER_MAX_LOCALIZATION | HAMMER_LOCALIZE_PSEUDOFS_MASK) 144 145 #define lo_to_pfs(lo) \ 146 ((int)(((lo) & HAMMER_LOCALIZE_PSEUDOFS_MASK) >> 16)) 147 #define pfs_to_lo(pfs) \ 148 ((((uint32_t)(pfs)) << 16) & HAMMER_LOCALIZE_PSEUDOFS_MASK) 149 150 /* 151 * Internal element (40 + 24 = 64 bytes). 152 * 153 * An internal element contains a recursion to another B-Tree node. 154 */ 155 typedef struct hammer_btree_internal_elm { 156 struct hammer_base_elm base; 157 hammer_tid_t mirror_tid; /* mirroring support */ 158 hammer_off_t subtree_offset; 159 int32_t reserved01; 160 int32_t reserved02; 161 } *hammer_btree_internal_elm_t; 162 163 /* 164 * Leaf B-Tree element (40 + 24 = 64 bytes). 165 * 166 * NOTE: create_ts/delete_ts are not used by any core algorithms, they are 167 * used only by userland to present nominal real-time date strings 168 * to the user. 169 */ 170 typedef struct hammer_btree_leaf_elm { 171 struct hammer_base_elm base; 172 uint32_t create_ts; 173 uint32_t delete_ts; 174 hammer_off_t data_offset; 175 int32_t data_len; 176 hammer_crc_t data_crc; 177 } *hammer_btree_leaf_elm_t; 178 179 /* 180 * Rollup btree leaf element types - 64 byte structure 181 */ 182 typedef union hammer_btree_elm { 183 struct hammer_base_elm base; 184 struct hammer_btree_leaf_elm leaf; 185 struct hammer_btree_internal_elm internal; 186 } *hammer_btree_elm_t; 187 188 /* 189 * B-Tree node (64x64 = 4K structure) 190 * 191 * Each node contains 63 elements. The last element for an internal node 192 * is the right-boundary so internal nodes have one fewer logical elements 193 * then leaf nodes. 194 * 195 * 'count' always refers to the number of elements and is non-inclusive of 196 * the right-hand boundary for an internal node. For a leaf node, 'count' 197 * refers to the number of elements and there is no idea of boundaries. 198 * 199 * The use of a fairly large radix is designed to reduce the number of 200 * discrete disk accesses required to locate something. Keep in mind 201 * that nodes are allocated out of 16K hammer buffers so supported values 202 * are (256-1), (128-1), (64-1), (32-1), or (16-1). HAMMER uses 63-way 203 * so the node size is (64x(1+(64-1))) = 4KB. 204 * 205 * NOTE: FUTURE EXPANSION: The reserved fields in hammer_node_ondisk are 206 * reserved for left/right leaf linkage fields, flags, and other future 207 * features. 208 */ 209 #define HAMMER_BTREE_TYPE_INTERNAL ((uint8_t)'I') 210 #define HAMMER_BTREE_TYPE_LEAF ((uint8_t)'L') 211 #define HAMMER_BTREE_TYPE_RECORD ((uint8_t)'R') 212 #define HAMMER_BTREE_TYPE_DELETED ((uint8_t)'D') 213 #define HAMMER_BTREE_TYPE_NONE ((uint8_t)0) 214 215 #define HAMMER_BTREE_LEAF_ELMS 63 216 #define HAMMER_BTREE_INT_ELMS (HAMMER_BTREE_LEAF_ELMS - 1) 217 218 typedef struct hammer_node_ondisk { 219 /* 220 * B-Tree node header (64 bytes) 221 */ 222 hammer_crc_t crc; /* MUST BE FIRST FIELD OF STRUCTURE */ 223 uint32_t reserved01; 224 hammer_off_t parent; /* 0 if at root of B-Tree */ 225 int32_t count; /* maximum 62 for INTERNAL, 63 for LEAF */ 226 uint8_t type; /* B-Tree node type (INTERNAL or LEAF) */ 227 uint8_t reserved02; 228 uint16_t reserved03; 229 hammer_off_t reserved04; 230 hammer_off_t reserved05; 231 hammer_off_t reserved06; 232 hammer_off_t reserved07; 233 hammer_tid_t mirror_tid; /* mirroring support (aggregator) */ 234 235 /* 236 * B-Tree node element array (64x63 bytes) 237 * 238 * Internal nodes have one less logical element 239 * (meaning: the same number of physical elements) in order to 240 * accomodate the right-hand boundary. The left-hand boundary 241 * is integrated into the first element. Leaf nodes have no 242 * boundary elements. 243 */ 244 union hammer_btree_elm elms[HAMMER_BTREE_LEAF_ELMS]; 245 } *hammer_node_ondisk_t; 246 247 #define HAMMER_BTREE_CRCSIZE \ 248 (sizeof(struct hammer_node_ondisk) - sizeof(hammer_crc_t)) 249 250 /* 251 * Return 1 if elm is a node element of an internal node, 252 * otherwise return 0. 253 */ 254 static __inline 255 int 256 hammer_is_internal_node_elm(hammer_btree_elm_t elm) 257 { 258 switch (elm->base.btype) { 259 case HAMMER_BTREE_TYPE_INTERNAL: 260 case HAMMER_BTREE_TYPE_LEAF: 261 return(1); 262 } 263 return(0); 264 } 265 266 /* 267 * Return 1 if elm is a node element of a leaf node, 268 * otherwise return 0. 269 */ 270 static __inline 271 int 272 hammer_is_leaf_node_elm(hammer_btree_elm_t elm) 273 { 274 switch (elm->base.btype) { 275 case HAMMER_BTREE_TYPE_RECORD: 276 return(1); 277 } 278 return(0); 279 } 280 281 static __inline 282 int 283 hammer_node_max_elements(uint8_t type) 284 { 285 switch (type) { 286 case HAMMER_BTREE_TYPE_LEAF: 287 return(HAMMER_BTREE_LEAF_ELMS); 288 case HAMMER_BTREE_TYPE_INTERNAL: 289 return(HAMMER_BTREE_INT_ELMS); 290 } 291 return(-1); /* invalid type */ 292 } 293 294 static __inline 295 char 296 hammer_elm_btype(hammer_btree_elm_t elm) 297 { 298 switch(elm->base.btype) { 299 case HAMMER_BTREE_TYPE_INTERNAL: 300 case HAMMER_BTREE_TYPE_LEAF: 301 case HAMMER_BTREE_TYPE_RECORD: 302 case HAMMER_BTREE_TYPE_DELETED: 303 return(elm->base.btype); /* ascii */ 304 case HAMMER_BTREE_TYPE_NONE: 305 return('*'); 306 default: 307 return('?'); 308 } 309 } 310 311 #endif /* !VFS_HAMMER_BTREE_H_ */ 312