1 /* 2 * Copyright (c) 2007 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * $DragonFly: src/sys/vfs/hammer/hammer_btree.h,v 1.24 2008/06/26 04:06:22 dillon Exp $ 35 */ 36 37 /* 38 * HAMMER B-Tree index 39 * 40 * HAMMER implements a modified B+Tree. B+Trees store records only 41 * at their leaves and HAMMER's modification is to adjust the internal 42 * elements so there is a boundary element on each side instead of sub-tree 43 * pointers. 44 * 45 * We just call our modified B+Tree a 'B-Tree' in HAMMER documentation to 46 * reduce confusion. 47 * 48 * A B-Tree internal node looks like this: 49 * 50 * B N N N N N N B <-- boundary and internal elements 51 * S S S S S S S <-- subtree pointers 52 * 53 * A B-Tree leaf node looks like this: 54 * 55 * L L L L L L L L <-- leaf elemenets 56 * (there is also a previous and next-leaf pointer) 57 * 58 * The recursion radix of an internal node is reduced by 1 relative to 59 * a normal B-Tree in order to accomodate the right-hand boundary. 60 * 61 * The big benefit to using a B-Tree with built-in bounds information is 62 * that it makes it possible to cache pointers into the middle of the tree 63 * and not have to start searches, insertions, OR deletions at the root node. 64 * The boundary elements allow searches to progress in a definitive direction 65 * from any point in the tree without revisting nodes. It is also possible 66 * to terminate searches early and make minor adjustments to the boundaries 67 * (within the confines of the parent's boundaries) on the fly. This greatly 68 * improves the efficiency of many operations. 69 * 70 * HAMMER B-Trees are per-cluster. The global multi-cluster B-Tree is 71 * constructed by allowing internal nodes to link to the roots of other 72 * clusters. Fields in the cluster header then reference back to its 73 * parent and use the cluster generation number to detect stale linkages. 74 * 75 * The B-Tree balancing code can operate within a cluster or across the 76 * filesystem's ENTIRE B-Tree super-structure. A cluster's B-Tree root 77 * can be a leaf node in the worse case. A cluster is guarenteed to have 78 * sufficient free space to hold a single completely full leaf in the 79 * degenerate case. 80 * 81 * All of the structures below are on-disk structures. 82 */ 83 84 /* 85 * Common base for all B-Tree element types (40 bytes) 86 * 87 * obj_type is set to the object type the record represents if an inode, 88 * directory entry, or an inter-cluster reference. A cluster range is 89 * special in that the B-Tree nodes represent a range within the B-Tree 90 * inclusive of rec_type field, so obj_type must be used to detect the 91 * cluster range entries. 92 * 93 * btype is only used by the elements making up an internal or leaf B-Tree 94 * node and applies to the node rather then to the key. This means that 95 * btype must be assigned/reassigned after any update to the base_elm making 96 * up a B-Tree element. 97 */ 98 struct hammer_base_elm { 99 int64_t obj_id; /* 00 object record is associated with */ 100 int64_t key; /* 08 indexing key (offset or namekey) */ 101 102 hammer_tid_t create_tid; /* 10 transaction id for record creation */ 103 hammer_tid_t delete_tid; /* 18 transaction id for record update/del */ 104 105 u_int16_t rec_type; /* 20 _RECTYPE_ */ 106 u_int8_t obj_type; /* 22 _OBJTYPE_ (restricted) */ 107 u_int8_t btype; /* 23 B-Tree element type */ 108 u_int32_t localization; /* 24 B-Tree localization parameter */ 109 /* 28 */ 110 }; 111 112 typedef struct hammer_base_elm *hammer_base_elm_t; 113 114 /* 115 * Localization has sorting priority over the obj_id and is used to 116 * localize inodes for very fast directory scans. 117 * 118 * Localization can also be used to create pseudo-filesystems within 119 * a HAMMER filesystem. Pseudo-filesystems would be suitable 120 * replication targets. 121 */ 122 #define HAMMER_LOCALIZE_RESERVED00 0x00000000 123 #define HAMMER_LOCALIZE_INODE 0x00000001 124 #define HAMMER_LOCALIZE_MISC 0x00000002 125 #define HAMMER_LOCALIZE_RESERVED03 0x00000003 126 #define HAMMER_LOCALIZE_MASK 0x0000FFFF 127 #define HAMMER_LOCALIZE_PSEUDOFS_MASK 0xFFFF0000 128 #define HAMMER_LOCALIZE_PSEUDOFS_INC 0x00010000 129 130 #define HAMMER_MIN_LOCALIZATION 0x00000000U 131 #define HAMMER_MAX_LOCALIZATION 0x0000FFFFU 132 #define HAMMER_DEF_LOCALIZATION 0x00000000U 133 134 /* 135 * Internal element (40 + 24 = 64 bytes). 136 * 137 * An internal element contains the left-hand boundary, right-hand boundary, 138 * and a recursion to another B-Tree node. 139 */ 140 struct hammer_btree_internal_elm { 141 struct hammer_base_elm base; 142 hammer_tid_t mirror_tid; /* mirroring support */ 143 hammer_off_t subtree_offset; 144 int32_t unused02; 145 int32_t unused03; 146 }; 147 148 typedef struct hammer_btree_internal_elm *hammer_btree_internal_elm_t; 149 150 /* 151 * Leaf B-Tree element (40 + 24 = 64 bytes). 152 * 153 * NOTE: create_ts/delete_ts are not used by any core algorithms, they are 154 * used only by userland to present nominal real-time date strings 155 * to the user. 156 */ 157 struct hammer_btree_leaf_elm { 158 struct hammer_base_elm base; 159 u_int32_t create_ts; 160 u_int32_t delete_ts; 161 hammer_off_t data_offset; 162 int32_t data_len; 163 hammer_crc_t data_crc; 164 }; 165 166 typedef struct hammer_btree_leaf_elm *hammer_btree_leaf_elm_t; 167 168 /* 169 * Rollup btree leaf element types - 64 byte structure 170 */ 171 union hammer_btree_elm { 172 struct hammer_base_elm base; 173 struct hammer_btree_leaf_elm leaf; 174 struct hammer_btree_internal_elm internal; 175 }; 176 177 typedef union hammer_btree_elm *hammer_btree_elm_t; 178 179 /* 180 * B-Tree node (normal or meta) (64x64 = 4K structure) 181 * 182 * Each node contains 63 elements. The last element for an internal node 183 * is the right-boundary so internal nodes have one fewer logical elements 184 * then leaf nodes. 185 * 186 * 'count' always refers to the number of elements and is non-inclusive of 187 * the right-hand boundary for an internal node. 188 * 189 * The use of a fairly large radix is designed to reduce the number of 190 * discrete disk accesses required to locate something. Keep in mind 191 * that nodes are allocated out of 16K hammer buffers so supported values 192 * are (256-1), (128-1), (64-1), (32-1), or (16-1). 193 * 194 * NOTE: The node head for an internal does not contain the subtype 195 * (The B-Tree node type for the nodes referenced by its elements). 196 * Instead, each element specifies the subtype (elm->base.subtype). 197 * This allows us to maintain an unbalanced B-Tree and to easily identify 198 * special inter-cluster link elements. 199 * 200 * NOTE: FUTURE EXPANSION: The reserved fields in hammer_node_ondisk are 201 * reserved for left/right leaf linkage fields, flags, and other future 202 * features. 203 */ 204 #define HAMMER_BTREE_LEAF_ELMS 63 205 #define HAMMER_BTREE_INT_ELMS (HAMMER_BTREE_LEAF_ELMS - 1) 206 207 /* 208 * It is safe to combine two adjacent nodes if the total number of elements 209 * is less then or equal to the *_FILL constant. 210 */ 211 #define HAMMER_BTREE_LEAF_FILL (HAMMER_BTREE_LEAF_ELMS - 3) 212 #define HAMMER_BTREE_INT_FILL (HAMMER_BTREE_INT_ELMS - 3) 213 214 #define HAMMER_BTREE_TYPE_INTERNAL ((u_int8_t)'I') 215 #define HAMMER_BTREE_TYPE_LEAF ((u_int8_t)'L') 216 #define HAMMER_BTREE_TYPE_RECORD ((u_int8_t)'R') 217 #define HAMMER_BTREE_TYPE_DELETED ((u_int8_t)'D') 218 219 struct hammer_node_ondisk { 220 /* 221 * B-Tree node header (64 bytes) 222 */ 223 hammer_crc_t crc; /* MUST BE FIRST FIELD OF STRUCTURE */ 224 u_int32_t signature; 225 hammer_off_t parent; /* 0 if at root of cluster */ 226 int32_t count; 227 u_int8_t type; 228 u_int8_t reserved01; 229 u_int16_t reserved02; 230 hammer_off_t reserved03; /* future link_left */ 231 hammer_off_t reserved04; /* future link_right */ 232 hammer_off_t reserved05; 233 hammer_off_t reserved06; 234 hammer_tid_t mirror_tid; /* mirroring support (aggregator) */ 235 236 /* 237 * Element array. Internal nodes have one less logical element 238 * (meaning: the same number of physical elements) in order to 239 * accomodate the right-hand boundary. The left-hand boundary 240 * is integrated into the first element. Leaf nodes have no 241 * boundary elements. 242 */ 243 union hammer_btree_elm elms[HAMMER_BTREE_LEAF_ELMS]; 244 }; 245 246 #define HAMMER_BTREE_SIGNATURE_GOOD 0xB3A49586 247 #define HAMMER_BTREE_SIGNATURE_DESTROYED 0x4A3B2C1D 248 #define HAMMER_BTREE_CRCSIZE \ 249 (sizeof(struct hammer_node_ondisk) - sizeof(hammer_crc_t)) 250 251 typedef struct hammer_node_ondisk *hammer_node_ondisk_t; 252 253