1 /* 2 * Copyright (c) 2007 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * $DragonFly: src/sys/vfs/hammer/hammer_btree.h,v 1.5 2007/11/19 00:53:40 dillon Exp $ 35 */ 36 37 /* 38 * HAMMER B-Tree index 39 * 40 * HAMMER implements a modified B+Tree. B+Trees store records only 41 * at their leaves and HAMMER's modification is to adjust the internal 42 * elements so there is a boundary element on each side instead of sub-tree 43 * pointers. 44 * 45 * We just call our modified B+Tree a 'B-Tree' in HAMMER documentation to 46 * reduce confusion. 47 * 48 * A B-Tree internal node looks like this: 49 * 50 * B N N N N N N B <-- boundary and internal elements 51 * S S S S S S S <-- subtree pointers 52 * 53 * A B-Tree leaf node looks like this: 54 * 55 * L L L L L L L L <-- leaf elemenets 56 * (there is also a previous and next-leaf pointer) 57 * 58 * The recursion radix of an internal node is reduced by 1 relative to 59 * a normal B-Tree in order to accomodate the right-hand boundary. 60 * 61 * The big benefit to using a B-Tree with built-in bounds information is 62 * that it makes it possible to cache pointers into the middle of the tree 63 * and not have to start searches, insertions, OR deletions at the root node. 64 * The boundary elements allow searches to progress in a definitive direction 65 * from any point in the tree without revisting nodes. This greatly improves 66 * the efficiency of many operations, most especially record appends. 67 * 68 * HAMMER B-Trees are per-cluster. The global multi-cluster B-Tree is 69 * constructed by allowing internal nodes to link to the roots of other 70 * clusters. Fields in the cluster header then reference back to its 71 * parent and use the cluster generation number to detect stale linkages. 72 * 73 * The B-Tree balancing code can operate within a cluster or across the 74 * filesystem's ENTIRE B-Tree super-structure. A cluster's B-Tree root 75 * can be a leaf node in the worse case. A cluster is guarenteed to have 76 * sufficient free space to hold a single completely full leaf in the 77 * degenerate case. 78 * 79 * All of the structures below are on-disk structures. 80 */ 81 82 /* 83 * Common base for all B-Tree element types (40 bytes) 84 * 85 * obj_type is set to the object type the record represents if an inode, 86 * directory entry, or an inter-cluster reference. A cluster range is 87 * special in that the B-Tree nodes represent a range within the B-Tree 88 * inclusive of rec_type field, so obj_type must be used to detect the 89 * cluster range entries. 90 * 91 * subtree_type is only used by internal B-Tree elements and indicates 92 * whether we are referencing a cluster, a leaf, or an internal node. 93 */ 94 struct hammer_base_elm { 95 int64_t obj_id; /* 00 object record is associated with */ 96 int64_t key; /* 08 indexing key (offset or namekey) */ 97 98 hammer_tid_t create_tid; /* 10 transaction id for record creation */ 99 hammer_tid_t delete_tid; /* 18 transaction id for record update/del */ 100 101 u_int16_t rec_type; /* 20 _RECTYPE_ */ 102 u_int8_t obj_type; /* 22 _OBJTYPE_ (restricted) */ 103 u_int8_t subtree_type; /* 23 B-Tree element type */ 104 int32_t reserved07; /* 24 (future) */ 105 /* 28 */ 106 }; 107 108 typedef struct hammer_base_elm *hammer_base_elm_t; 109 110 /* 111 * Internal element (40 + 16 = 56 bytes). 112 * 113 * An internal element contains the left-hand boundary and a recursion to 114 * another B-Tree node. If rec_offset is 0 it points to another node in the 115 * current cluster at subtree_offset. Otherwise it points to the root 116 * of another cluster at volno/cluid. 117 * 118 * sub-cluster references have an associated record while intra-cluster 119 * references do not. 120 * 121 * subtree_count is the number of elements in an intra-cluster reference. 122 * For inter-cluster references subtree_count is chaining depth heuristic 123 * used to help balance the B-Tree globally. 124 */ 125 struct hammer_btree_internal_elm { 126 struct hammer_base_elm base; 127 int32_t rec_offset; /* 0 indicates internal reference */ 128 int32_t subtree_offset; /* offset or cluster id */ 129 int32_t subtree_volno; /* unused or volume number */ 130 int32_t subtree_count; /* hint: can be too small, but not too big */ 131 }; 132 133 #define subtree_cluid subtree_offset 134 #define subtree_type base.subtree_type 135 136 /* 137 * Leaf B-Tree element (40 + 16 = 56 bytes). 138 * 139 * A leaf 140 */ 141 struct hammer_btree_leaf_elm { 142 struct hammer_base_elm base; 143 int32_t rec_offset; 144 int32_t data_offset; 145 int32_t data_len; 146 u_int32_t data_crc; 147 }; 148 149 /* 150 * Rollup btree leaf element types - 56 byte structure 151 */ 152 union hammer_btree_elm { 153 struct hammer_base_elm base; 154 struct hammer_btree_leaf_elm leaf; 155 struct hammer_btree_internal_elm internal; 156 }; 157 158 typedef union hammer_btree_elm *hammer_btree_elm_t; 159 160 /* 161 * B-Tree node (normal or meta) - 24 + 56 * 14 = 808 bytes (8-byte aligned) 162 * 163 * 20 B-Tree nodes fit in a 16K filesystem buffer, leaving us room for 164 * the 128 byte filesystem buffer header and another 96 bytes of filler. 165 * 166 * Each node contains 14 elements. The last element for an internal node 167 * is the right-boundary so internal nodes have one fewer logical elements 168 * then leaf nodes. 169 * 170 * 'count' always refers to the number of elements and is non-inclusive of 171 * the right-hand boundary for an internal node. 172 * 173 * NOTE: The node head for an internal does not contain the subtype 174 * (The B-Tree node type for the nodes referenced by its elements). 175 * Instead, each element specifies the subtype (elm->base.subtype). 176 * This allows us to maintain an unbalanced B-Tree and to easily identify 177 * special inter-cluster link elements. 178 */ 179 #define HAMMER_BTREE_LEAF_ELMS 14 180 #define HAMMER_BTREE_INT_ELMS (HAMMER_BTREE_LEAF_ELMS - 1) 181 #define HAMMER_BTREE_NODES 20 182 183 /* 184 * It is safe to combine two adjacent nodes if the total number of elements 185 * is less then or equal to the *_FILL constant. 186 */ 187 #define HAMMER_BTREE_LEAF_FILL (HAMMER_BTREE_LEAF_ELMS - 3) 188 #define HAMMER_BTREE_INT_FILL (HAMMER_BTREE_INT_ELMS - 3) 189 190 #define HAMMER_BTREE_TYPE_INTERNAL ((u_int8_t)'I') 191 #define HAMMER_BTREE_TYPE_LEAF ((u_int8_t)'L') 192 #define HAMMER_BTREE_TYPE_CLUSTER ((u_int8_t)'C') 193 194 struct hammer_node_ondisk { 195 /* 196 * B-Tree node header (24 bytes) 197 */ 198 int32_t count; 199 int32_t parent; /* 0 if at root of cluster */ 200 u_int8_t type; 201 u_int8_t reserved01; 202 u_int16_t reserved02; 203 int32_t reserved03; /* future heuristic */ 204 int32_t reserved04; /* future link_left */ 205 int32_t reserved05; /* future link_right */ 206 207 /* 208 * Element array. Internal nodes have one less logical element 209 * (meaning: the same number of physical elements) in order to 210 * accomodate the right-hand boundary. The left-hand boundary 211 * is integrated into the first element. Leaf nodes have no 212 * boundary elements. 213 */ 214 union hammer_btree_elm elms[HAMMER_BTREE_LEAF_ELMS]; 215 }; 216 217 typedef struct hammer_node_ondisk *hammer_node_ondisk_t; 218 219 /* 220 * B-Tree filesystem buffer (16K exactly) 221 */ 222 struct hammer_fsbuf_btree { 223 struct hammer_fsbuf_head head; /* 128 */ 224 char filler[96]; 225 struct hammer_node_ondisk nodes[HAMMER_BTREE_NODES]; 226 }; 227 228 #if HAMMER_BTREE_NODES * (HAMMER_BTREE_LEAF_ELMS * 56 + 24) + 128 + 96 != 16384 229 #error "Sanity check hammer_fsbuf_btree" 230 #endif 231