1 /* 2 * Copyright (c) 2007 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 
33 * 34 * $DragonFly: src/sys/vfs/hammer/hammer_disk.h,v 1.55 2008/11/13 02:18:43 dillon Exp $ 35 */ 36 37 #ifndef VFS_HAMMER_DISK_H_ 38 #define VFS_HAMMER_DISK_H_ 39 40 #ifndef _SYS_UUID_H_ 41 #include <sys/uuid.h> 42 #endif 43 44 /* 45 * The structures below represent the on-disk format for a HAMMER 46 * filesystem. Note that all fields for on-disk structures are naturally 47 * aligned. The host endian format is used - compatibility is possible 48 * if the implementation detects reversed endian and adjusts data accordingly. 49 * 50 * Most of HAMMER revolves around the concept of an object identifier. An 51 * obj_id is a 64 bit quantity which uniquely identifies a filesystem object 52 * FOR THE ENTIRE LIFE OF THE FILESYSTEM. This uniqueness allows backups 53 * and mirrors to retain varying amounts of filesystem history by removing 54 * any possibility of conflict through identifier reuse. 55 * 56 * A HAMMER filesystem may span multiple volumes. 57 * 58 * A HAMMER filesystem uses a 16K filesystem buffer size. All filesystem 59 * I/O is done in multiples of 16K. 60 * 61 * 64K X-bufs are used for blocks >= a file's 1MB mark. 
62 * 63 * Per-volume storage limit: 52 bits 4096 TB 64 * Per-Zone storage limit: 60 bits 1 MTB 65 * Per-filesystem storage limit: 60 bits 1 MTB 66 */ 67 #define HAMMER_BUFSIZE 16384 68 #define HAMMER_XBUFSIZE 65536 69 #define HAMMER_HBUFSIZE (HAMMER_BUFSIZE / 2) 70 #define HAMMER_XDEMARC (1024 * 1024) 71 #define HAMMER_BUFMASK (HAMMER_BUFSIZE - 1) 72 #define HAMMER_XBUFMASK (HAMMER_XBUFSIZE - 1) 73 74 #define HAMMER_BUFSIZE64 ((uint64_t)HAMMER_BUFSIZE) 75 #define HAMMER_BUFMASK64 ((uint64_t)HAMMER_BUFMASK) 76 77 #define HAMMER_XBUFSIZE64 ((uint64_t)HAMMER_XBUFSIZE) 78 #define HAMMER_XBUFMASK64 ((uint64_t)HAMMER_XBUFMASK) 79 80 #define HAMMER_OFF_ZONE_MASK 0xF000000000000000ULL /* zone portion */ 81 #define HAMMER_OFF_VOL_MASK 0x0FF0000000000000ULL /* volume portion */ 82 #define HAMMER_OFF_SHORT_MASK 0x000FFFFFFFFFFFFFULL /* offset portion */ 83 #define HAMMER_OFF_LONG_MASK 0x0FFFFFFFFFFFFFFFULL /* offset portion */ 84 85 #define HAMMER_OFF_BAD ((hammer_off_t)-1) 86 87 /* 88 * The current limit of volumes that can make up a HAMMER FS 89 */ 90 #define HAMMER_MAX_VOLUMES 256 91 92 /* 93 * Hammer transaction ids are 64 bit unsigned integers and are usually 94 * synchronized with the time of day in nanoseconds. 95 * 96 * Hammer offsets are used for FIFO indexing and embed a cycle counter 97 * and volume number in addition to the offset. Most offsets are required 98 * to be 16 KB aligned. 
99 */ 100 typedef uint64_t hammer_tid_t; 101 typedef uint64_t hammer_off_t; 102 typedef uint32_t hammer_crc_t; 103 104 #define HAMMER_MIN_TID 0ULL /* unsigned */ 105 #define HAMMER_MAX_TID 0xFFFFFFFFFFFFFFFFULL /* unsigned */ 106 #define HAMMER_MIN_KEY -0x8000000000000000LL /* signed */ 107 #define HAMMER_MAX_KEY 0x7FFFFFFFFFFFFFFFLL /* signed */ 108 #define HAMMER_MIN_OBJID HAMMER_MIN_KEY /* signed */ 109 #define HAMMER_MAX_OBJID HAMMER_MAX_KEY /* signed */ 110 #define HAMMER_MIN_RECTYPE 0x0U /* unsigned */ 111 #define HAMMER_MAX_RECTYPE 0xFFFFU /* unsigned */ 112 #define HAMMER_MIN_OFFSET 0ULL /* unsigned */ 113 #define HAMMER_MAX_OFFSET 0xFFFFFFFFFFFFFFFFULL /* unsigned */ 114 115 /* 116 * hammer_off_t has several different encodings. Note that not all zones 117 * encode a vol_no. Zone bits are not a part of filesystem capacity. 118 * 119 * zone 0: available, a big-block that contains the offset is unused 120 * zone 1 (z,v,o): raw volume relative (offset 0 is the volume header) 121 * zone 2 (z,v,o): raw buffer relative (offset 0 is the first buffer) 122 * zone 3 (z,o): undo fifo - actually zone-2 address, fixed phys array in vol hdr 123 * zone 4 (z,v,o): freemap - only real blockmap 124 * zone 8 (z,v,o): B-Tree - actually zone-2 address 125 * zone 9 (z,v,o): meta - actually zone-2 address 126 * zone 10 (z,v,o): large-data - actually zone-2 address 127 * zone 11 (z,v,o): small-data - actually zone-2 address 128 * zone 15: unavailable, usually the offset is beyond volume size 129 * 130 * layer1/layer2 direct map: 131 * Maximum HAMMER filesystem capacity from volume aspect 132 * 2^8(max volumes) * 2^52(max volume size) = 2^60 = 1EB 133 * <-------------------------------------------------------------> 134 * 8bits 52bits 135 * <------><-----------------------------------------------------> 136 * zzzzvvvvvvvvoooo oooooooooooooooo oooooooooooooooo oooooooooooooooo 137 * ----111111111111 1111112222222222 222222222ooooooo oooooooooooooooo 138 * 
<-----------------><------------------><----------------------> 139 * 18bits 19bits 23bits 140 * <-------------------------------------------------------------> 141 * 2^18(layer1) * 2^19(layer2) * 2^23(big-block) = 2^60 = 1EB 142 * Maximum HAMMER filesystem capacity from blockmap aspect 143 */ 144 145 #define HAMMER_ZONE_RAW_VOLUME 0x1000000000000000ULL 146 #define HAMMER_ZONE_RAW_BUFFER 0x2000000000000000ULL 147 #define HAMMER_ZONE_UNDO 0x3000000000000000ULL 148 #define HAMMER_ZONE_FREEMAP 0x4000000000000000ULL 149 #define HAMMER_ZONE_RESERVED05 0x5000000000000000ULL /* not used */ 150 #define HAMMER_ZONE_RESERVED06 0x6000000000000000ULL /* not used */ 151 #define HAMMER_ZONE_RESERVED07 0x7000000000000000ULL /* not used */ 152 #define HAMMER_ZONE_BTREE 0x8000000000000000ULL 153 #define HAMMER_ZONE_META 0x9000000000000000ULL 154 #define HAMMER_ZONE_LARGE_DATA 0xA000000000000000ULL 155 #define HAMMER_ZONE_SMALL_DATA 0xB000000000000000ULL 156 #define HAMMER_ZONE_RESERVED0C 0xC000000000000000ULL /* not used */ 157 #define HAMMER_ZONE_RESERVED0D 0xD000000000000000ULL /* not used */ 158 #define HAMMER_ZONE_RESERVED0E 0xE000000000000000ULL /* not used */ 159 #define HAMMER_ZONE_UNAVAIL 0xF000000000000000ULL 160 161 #define HAMMER_ZONE_RAW_VOLUME_INDEX 1 162 #define HAMMER_ZONE_RAW_BUFFER_INDEX 2 163 #define HAMMER_ZONE_UNDO_INDEX 3 164 #define HAMMER_ZONE_FREEMAP_INDEX 4 165 #define HAMMER_ZONE_BTREE_INDEX 8 166 #define HAMMER_ZONE_META_INDEX 9 167 #define HAMMER_ZONE_LARGE_DATA_INDEX 10 168 #define HAMMER_ZONE_SMALL_DATA_INDEX 11 169 #define HAMMER_ZONE_UNAVAIL_INDEX 15 170 171 #define HAMMER_MAX_ZONES 16 172 173 /* 174 * Test if the zone is directly mapped to zone-2 offset via freemap. 175 */ 176 #define hammer_is_zone2_mapped_index(zone) \ 177 ((zone) >= HAMMER_ZONE_BTREE_INDEX && \ 178 (zone) < HAMMER_MAX_ZONES) 179 /* 180 * Test if the zone is directly mapped to zone-2 offset. The word 181 * directly here means the zone is neither RAW_VOLUME nor UNDO zone. 
182 */ 183 #define hammer_is_direct_mapped_index(zone) \ 184 (((zone) == HAMMER_ZONE_RAW_BUFFER_INDEX) || \ 185 ((zone) == HAMMER_ZONE_FREEMAP_INDEX) || \ 186 hammer_is_zone2_mapped_index(zone)) 187 188 #define HAMMER_ZONE_ENCODE(zone, ham_off) \ 189 (((hammer_off_t)(zone) << 60) | (ham_off)) 190 #define HAMMER_ZONE_DECODE(ham_off) \ 191 ((int)(((hammer_off_t)(ham_off) >> 60))) 192 193 #define HAMMER_VOL_ENCODE(vol_no) \ 194 ((hammer_off_t)((vol_no) & 255) << 52) 195 #define HAMMER_VOL_DECODE(ham_off) \ 196 ((int)(((hammer_off_t)(ham_off) >> 52) & 255)) 197 198 #define HAMMER_OFF_SHORT_ENCODE(offset) \ 199 ((hammer_off_t)(offset) & HAMMER_OFF_SHORT_MASK) 200 #define HAMMER_OFF_LONG_ENCODE(offset) \ 201 ((hammer_off_t)(offset) & HAMMER_OFF_LONG_MASK) 202 203 #define HAMMER_ENCODE(zone, vol_no, offset) \ 204 (((hammer_off_t)(zone) << 60) | \ 205 HAMMER_VOL_ENCODE(vol_no) | \ 206 HAMMER_OFF_SHORT_ENCODE(offset)) 207 #define HAMMER_ENCODE_RAW_VOLUME(vol_no, offset) \ 208 HAMMER_ENCODE(HAMMER_ZONE_RAW_VOLUME_INDEX, vol_no, offset) 209 #define HAMMER_ENCODE_RAW_BUFFER(vol_no, offset) \ 210 HAMMER_ENCODE(HAMMER_ZONE_RAW_BUFFER_INDEX, vol_no, offset) 211 #define HAMMER_ENCODE_FREEMAP(vol_no, offset) \ 212 HAMMER_ENCODE(HAMMER_ZONE_FREEMAP_INDEX, vol_no, offset) 213 214 /* 215 * Translate a zone address to zone-X address. 216 */ 217 #define hammer_xlate_to_zoneX(zone, offset) \ 218 HAMMER_ZONE_ENCODE((zone), (offset) & ~HAMMER_OFF_ZONE_MASK) 219 #define hammer_xlate_to_zone2(offset) \ 220 hammer_xlate_to_zoneX(HAMMER_ZONE_RAW_BUFFER_INDEX, (offset)) 221 222 #define hammer_data_zone(data_len) \ 223 (((data_len) >= HAMMER_BUFSIZE) ? \ 224 HAMMER_ZONE_LARGE_DATA : \ 225 HAMMER_ZONE_SMALL_DATA) 226 #define hammer_data_zone_index(data_len) \ 227 (((data_len) >= HAMMER_BUFSIZE) ? 
\ 228 HAMMER_ZONE_LARGE_DATA_INDEX : \ 229 HAMMER_ZONE_SMALL_DATA_INDEX) 230 231 /* 232 * Big-Block backing store 233 * 234 * A blockmap is a two-level map which translates a blockmap-backed zone 235 * offset into a raw zone 2 offset. The layer 1 handles 18 bits and the 236 * layer 2 handles 19 bits. The 8M big-block size is 23 bits so two 237 * layers gives us 18+19+23 = 60 bits of address space. 238 * 239 * When using hinting for a blockmap lookup, the hint is lost when the 240 * scan leaves the HINTBLOCK, which is typically several BIGBLOCK's. 241 * HINTBLOCK is a heuristic. 242 */ 243 #define HAMMER_HINTBLOCK_SIZE (HAMMER_BIGBLOCK_SIZE * 4) 244 #define HAMMER_HINTBLOCK_MASK64 ((uint64_t)HAMMER_HINTBLOCK_SIZE - 1) 245 #define HAMMER_BIGBLOCK_SIZE (8192 * 1024) 246 #define HAMMER_BIGBLOCK_SIZE64 ((uint64_t)HAMMER_BIGBLOCK_SIZE) 247 #define HAMMER_BIGBLOCK_MASK (HAMMER_BIGBLOCK_SIZE - 1) 248 #define HAMMER_BIGBLOCK_MASK64 ((uint64_t)HAMMER_BIGBLOCK_SIZE - 1) 249 #define HAMMER_BIGBLOCK_BITS 23 250 #if 0 251 #define HAMMER_BIGBLOCK_OVERFILL (6144 * 1024) 252 #endif 253 #if (1 << HAMMER_BIGBLOCK_BITS) != HAMMER_BIGBLOCK_SIZE 254 #error "HAMMER_BIGBLOCK_BITS BROKEN" 255 #endif 256 257 #define HAMMER_BUFFERS_PER_BIGBLOCK \ 258 (HAMMER_BIGBLOCK_SIZE / HAMMER_BUFSIZE) 259 #define HAMMER_BUFFERS_PER_BIGBLOCK_MASK \ 260 (HAMMER_BUFFERS_PER_BIGBLOCK - 1) 261 #define HAMMER_BUFFERS_PER_BIGBLOCK_MASK64 \ 262 ((hammer_off_t)HAMMER_BUFFERS_PER_BIGBLOCK_MASK) 263 264 /* 265 * Maximum number of mirrors operating in master mode (multi-master 266 * clustering and mirroring). Note that HAMMER1 does not support 267 * multi-master clustering as of 2015. 268 */ 269 #define HAMMER_MAX_MASTERS 16 270 271 /* 272 * The blockmap is somewhat of a degenerate structure. HAMMER only actually 273 * uses it in its original incarnation to implement the freemap. 
274 * 275 * zone:1 raw volume (no blockmap) 276 * zone:2 raw buffer (no blockmap) 277 * zone:3 undomap (direct layer2 array in volume header) 278 * zone:4 freemap (the only real blockmap) 279 * zone:8-15 zone id used to classify big-block only, address is actually 280 * a zone-2 address. 281 */ 282 struct hammer_blockmap { 283 hammer_off_t phys_offset; /* zone-2 physical offset */ 284 hammer_off_t first_offset; /* zone-X logical offset (zone 3) */ 285 hammer_off_t next_offset; /* zone-X logical offset */ 286 hammer_off_t alloc_offset; /* zone-X logical offset */ 287 uint32_t reserved01; 288 hammer_crc_t entry_crc; 289 }; 290 291 typedef struct hammer_blockmap *hammer_blockmap_t; 292 293 #define HAMMER_BLOCKMAP_CRCSIZE \ 294 offsetof(struct hammer_blockmap, entry_crc) 295 296 /* 297 * The blockmap is a 2-layer entity made up of big-blocks. The first layer 298 * contains 262144 32-byte entries (18 bits), the second layer contains 299 * 524288 16-byte entries (19 bits), representing 8MB (23 bit) blockmaps. 300 * 18+19+23 = 60 bits. The top four bits are the zone id. 301 * 302 * Currently only the freemap utilizes both layers in all their glory. 303 * All primary data/meta-data zones actually encode a zone-2 address 304 * requiring no real blockmap translation. 305 * 306 * The freemap uses the upper 8 bits of layer-1 to identify the volume, 307 * thus any space allocated via the freemap can be directly translated 308 * to a zone:2 (or zone:8-15) address. 309 * 310 * zone-X blockmap offset: [zone:4][layer1:18][layer2:19][big-block:23] 311 */ 312 313 /* 314 * 32 bytes layer1 entry for 8MB big-block. 315 * A big-block can hold 2^23 / 2^5 = 2^18 layer1 entries, 316 * which equals bits assigned for layer1 in zone-2 address. 
317 */ 318 struct hammer_blockmap_layer1 { 319 hammer_off_t blocks_free; /* big-blocks free */ 320 hammer_off_t phys_offset; /* UNAVAIL or zone-2 */ 321 hammer_off_t reserved01; 322 hammer_crc_t layer2_crc; /* xor'd crc's of HAMMER_BLOCKSIZE */ 323 /* (not yet used) */ 324 hammer_crc_t layer1_crc; /* MUST BE LAST FIELD OF STRUCTURE*/ 325 }; 326 327 typedef struct hammer_blockmap_layer1 *hammer_blockmap_layer1_t; 328 329 #define HAMMER_LAYER1_CRCSIZE \ 330 offsetof(struct hammer_blockmap_layer1, layer1_crc) 331 332 /* 333 * 16 bytes layer2 entry for 8MB big-blocks. 334 * A big-block can hold 2^23 / 2^4 = 2^19 layer2 entries, 335 * which equals bits assigned for layer2 in zone-2 address. 336 * 337 * NOTE: bytes_free is signed and can legally go negative if/when data 338 * de-dup occurs. This field will never go higher than 339 * HAMMER_BIGBLOCK_SIZE. If exactly HAMMER_BIGBLOCK_SIZE 340 * the big-block is completely free. 341 */ 342 struct hammer_blockmap_layer2 { 343 uint8_t zone; /* typed allocation zone */ 344 uint8_t unused01; 345 uint16_t unused02; 346 uint32_t append_off; /* allocatable space index */ 347 int32_t bytes_free; /* bytes free within this big-block */ 348 hammer_crc_t entry_crc; 349 }; 350 351 typedef struct hammer_blockmap_layer2 *hammer_blockmap_layer2_t; 352 353 #define HAMMER_LAYER2_CRCSIZE \ 354 offsetof(struct hammer_blockmap_layer2, entry_crc) 355 356 #define HAMMER_BLOCKMAP_UNAVAIL ((hammer_off_t)-1LL) 357 358 #define HAMMER_BLOCKMAP_RADIX1 /* 2^18 = 262144 */ \ 359 ((int)(HAMMER_BIGBLOCK_SIZE / sizeof(struct hammer_blockmap_layer1))) 360 #define HAMMER_BLOCKMAP_RADIX2 /* 2^19 = 524288 */ \ 361 ((int)(HAMMER_BIGBLOCK_SIZE / sizeof(struct hammer_blockmap_layer2))) 362 363 #define HAMMER_BLOCKMAP_LAYER1 /* 2^(18+19+23) = 1EB */ \ 364 (HAMMER_BLOCKMAP_RADIX1 * HAMMER_BLOCKMAP_LAYER2) 365 #define HAMMER_BLOCKMAP_LAYER2 /* 2^(19+23) = 4TB */ \ 366 (HAMMER_BLOCKMAP_RADIX2 * HAMMER_BIGBLOCK_SIZE64) 367 368 #define HAMMER_BLOCKMAP_LAYER1_MASK 
(HAMMER_BLOCKMAP_LAYER1 - 1) 369 #define HAMMER_BLOCKMAP_LAYER2_MASK (HAMMER_BLOCKMAP_LAYER2 - 1) 370 371 /* 372 * Index within layer1 or layer2 big-block for the entry representing 373 * a zone-2 physical offset. 374 */ 375 #define HAMMER_BLOCKMAP_LAYER1_INDEX(zone2_offset) \ 376 ((int)(((zone2_offset) & HAMMER_BLOCKMAP_LAYER1_MASK) / \ 377 HAMMER_BLOCKMAP_LAYER2)) 378 379 #define HAMMER_BLOCKMAP_LAYER2_INDEX(zone2_offset) \ 380 ((int)(((zone2_offset) & HAMMER_BLOCKMAP_LAYER2_MASK) / \ 381 HAMMER_BIGBLOCK_SIZE64)) 382 383 /* 384 * Byte offset within layer1 or layer2 big-block for the entry representing 385 * a zone-2 physical offset. Multiply the index by sizeof(blockmap_layer). 386 */ 387 #define HAMMER_BLOCKMAP_LAYER1_OFFSET(zone2_offset) \ 388 (HAMMER_BLOCKMAP_LAYER1_INDEX(zone2_offset) * \ 389 sizeof(struct hammer_blockmap_layer1)) 390 391 #define HAMMER_BLOCKMAP_LAYER2_OFFSET(zone2_offset) \ 392 (HAMMER_BLOCKMAP_LAYER2_INDEX(zone2_offset) * \ 393 sizeof(struct hammer_blockmap_layer2)) 394 395 /* 396 * HAMMER UNDO parameters. The UNDO fifo is mapped directly in the volume 397 * header with an array of layer2 structures. A maximum of (128x8MB) = 1GB 398 * may be reserved. The size of the undo fifo is usually set a newfs time 399 * but can be adjusted if the filesystem is taken offline. 400 */ 401 #define HAMMER_UNDO_LAYER2 128 /* max layer2 undo mapping entries */ 402 403 /* 404 * All on-disk HAMMER structures which make up elements of the UNDO FIFO 405 * contain a hammer_fifo_head and hammer_fifo_tail structure. This structure 406 * contains all the information required to validate the fifo element 407 * and to scan the fifo in either direction. The head is typically embedded 408 * in higher level hammer on-disk structures while the tail is typically 409 * out-of-band. hdr_size is the size of the whole mess, including the tail. 410 * 411 * All undo structures are guaranteed to not cross a 16K filesystem 412 * buffer boundary. 
Most undo structures are fairly small. Data spaces 413 * are not immediately reused by HAMMER so file data is not usually recorded 414 * as part of an UNDO. 415 * 416 * PAD elements are allowed to take up only 8 bytes of space as a special 417 * case, containing only hdr_signature, hdr_type, and hdr_size fields, 418 * and with the tail overloaded onto the head structure for 8 bytes total. 419 * 420 * Every undo record has a sequence number. This number is unrelated to 421 * transaction ids and instead collects the undo transactions associated 422 * with a single atomic operation. A larger transactional operation, such 423 * as a remove(), may consist of several smaller atomic operations 424 * representing raw meta-data operations. 425 * 426 * HAMMER VERSION 4 CHANGES 427 * 428 * In HAMMER version 4 the undo structure alignment is reduced from 16384 429 * to 512 bytes in order to ensure that each 512 byte sector begins with 430 * a header. The reserved01 field in the header is now a 32 bit sequence 431 * number. This allows the recovery code to detect missing sectors 432 * without relying on the 32-bit crc and to definitively identify the current 433 * undo sequence space without having to rely on information from the volume 434 * header. In addition, new REDO entries in the undo space are used to 435 * record write, write/extend, and transaction id updates. 436 * 437 * The grand result is: 438 * 439 * (1) The volume header no longer needs to be synchronized for most 440 * flush and fsync operations. 441 * 442 * (2) Most fsync operations need only lay down REDO records 443 * 444 * (3) Data overwrite for nohistory operations covered by REDO records 445 * can be supported (instead of rolling a new block allocation), 446 * by rolling UNDO for the prior contents of the data. 447 * 448 * HAMMER VERSION 5 CHANGES 449 * 450 * Hammer version 5 contains a minor adjustment making layer2's bytes_free 451 * field signed, allowing dedup to push it into the negative domain. 
452 */ 453 #define HAMMER_HEAD_ALIGN 8 454 #define HAMMER_HEAD_ALIGN_MASK (HAMMER_HEAD_ALIGN - 1) 455 #define HAMMER_HEAD_DOALIGN(bytes) \ 456 (((bytes) + HAMMER_HEAD_ALIGN_MASK) & ~HAMMER_HEAD_ALIGN_MASK) 457 458 #define HAMMER_UNDO_ALIGN 512 459 #define HAMMER_UNDO_ALIGN64 ((uint64_t)512) 460 #define HAMMER_UNDO_MASK (HAMMER_UNDO_ALIGN - 1) 461 #define HAMMER_UNDO_MASK64 (HAMMER_UNDO_ALIGN64 - 1) 462 463 struct hammer_fifo_head { 464 uint16_t hdr_signature; 465 uint16_t hdr_type; 466 uint32_t hdr_size; /* Aligned size of the whole mess */ 467 uint32_t hdr_seq; /* Sequence number */ 468 hammer_crc_t hdr_crc; /* XOR crc up to field w/ crc after field */ 469 }; 470 471 #define HAMMER_FIFO_HEAD_CRCOFF offsetof(struct hammer_fifo_head, hdr_crc) 472 473 struct hammer_fifo_tail { 474 uint16_t tail_signature; 475 uint16_t tail_type; 476 uint32_t tail_size; /* aligned size of the whole mess */ 477 }; 478 479 typedef struct hammer_fifo_head *hammer_fifo_head_t; 480 typedef struct hammer_fifo_tail *hammer_fifo_tail_t; 481 482 /* 483 * Fifo header types. 484 * 485 * NOTE: 0x8000U part of HAMMER_HEAD_TYPE_PAD can be removed if the HAMMER 486 * version ever gets bumped again. It exists only to keep compatibility with 487 * older versions. 488 */ 489 #define HAMMER_HEAD_TYPE_PAD (0x0040U | 0x8000U) 490 #define HAMMER_HEAD_TYPE_DUMMY 0x0041U /* dummy entry w/seqno */ 491 #define HAMMER_HEAD_TYPE_UNDO 0x0043U /* random UNDO information */ 492 #define HAMMER_HEAD_TYPE_REDO 0x0044U /* data REDO / fast fsync */ 493 494 #define HAMMER_HEAD_SIGNATURE 0xC84EU 495 #define HAMMER_TAIL_SIGNATURE 0xC74FU 496 497 /* 498 * Misc FIFO structures. 499 * 500 * UNDO - Raw meta-data media updates. 501 */ 502 struct hammer_fifo_undo { 503 struct hammer_fifo_head head; 504 hammer_off_t undo_offset; /* zone-1,2 offset */ 505 int32_t undo_data_bytes; 506 int32_t undo_reserved01; 507 /* followed by data */ 508 }; 509 510 /* 511 * REDO (HAMMER version 4+) - Logical file writes/truncates. 
512 * 513 * REDOs contain information which will be duplicated in a later meta-data 514 * update, allowing fast write()+fsync() operations. REDOs can be ignored 515 * without harming filesystem integrity but must be processed if fsync() 516 * semantics are desired. 517 * 518 * Unlike UNDOs which are processed backwards within the recovery span, 519 * REDOs must be processed forwards starting further back (starting outside 520 * the recovery span). 521 * 522 * WRITE - Write logical file (with payload). Executed both 523 * out-of-span and in-span. Out-of-span WRITEs may be 524 * filtered out by TERMs. 525 * 526 * TRUNC - Truncate logical file (no payload). Executed both 527 * out-of-span and in-span. Out-of-span WRITEs may be 528 * filtered out by TERMs. 529 * 530 * TERM_* - Indicates meta-data was committed (if out-of-span) or 531 * will be rolled-back (in-span). Any out-of-span TERMs 532 * matching earlier WRITEs remove those WRITEs from 533 * consideration as they might conflict with a later data 534 * commit (which is not being rolled-back). 535 * 536 * SYNC - The earliest in-span SYNC (the last one when scanning 537 * backwards) tells the recovery code how far out-of-span 538 * it must go to run REDOs. 539 * 540 * NOTE: WRITEs do not always have matching TERMs even under 541 * perfect conditions because truncations might remove the 542 * buffers from consideration. I/O problems can also remove 543 * buffers from consideration. 544 * 545 * TRUNCSs do not always have matching TERMs because several 546 * truncations may be aggregated together into a single TERM. 
547 */ 548 struct hammer_fifo_redo { 549 struct hammer_fifo_head head; 550 int64_t redo_objid; /* file being written */ 551 hammer_off_t redo_offset; /* logical offset in file */ 552 int32_t redo_data_bytes; 553 uint32_t redo_flags; 554 uint32_t redo_localization; 555 uint32_t redo_reserved; 556 uint64_t redo_mtime; /* set mtime */ 557 }; 558 559 #define HAMMER_REDO_WRITE 0x00000001 560 #define HAMMER_REDO_TRUNC 0x00000002 561 #define HAMMER_REDO_TERM_WRITE 0x00000004 562 #define HAMMER_REDO_TERM_TRUNC 0x00000008 563 #define HAMMER_REDO_SYNC 0x00000010 564 565 union hammer_fifo_any { 566 struct hammer_fifo_head head; 567 struct hammer_fifo_undo undo; 568 struct hammer_fifo_redo redo; 569 }; 570 571 typedef struct hammer_fifo_redo *hammer_fifo_redo_t; 572 typedef struct hammer_fifo_undo *hammer_fifo_undo_t; 573 typedef union hammer_fifo_any *hammer_fifo_any_t; 574 575 /* 576 * Volume header types 577 */ 578 #define HAMMER_FSBUF_VOLUME 0xC8414D4DC5523031ULL /* HAMMER01 */ 579 #define HAMMER_FSBUF_VOLUME_REV 0x313052C54D4D41C8ULL /* (reverse endian) */ 580 581 /* 582 * HAMMER Volume header 583 * 584 * A HAMMER filesystem can be built from 1-256 block devices, each block 585 * device contains a volume header followed by however many buffers fit 586 * into the volume. 587 * 588 * One of the volumes making up a HAMMER filesystem is the root volume. 589 * The root volume is always volume #0 which is the first block device path 590 * specified by newfs_hammer(8). All HAMMER volumes have a volume header, 591 * however the root volume may be the only volume that has valid values for 592 * some fields in the header. 
593 * 594 * Special field notes: 595 * 596 * vol_bot_beg - offset of boot area (mem_beg - bot_beg bytes) 597 * vol_mem_beg - offset of memory log (buf_beg - mem_beg bytes) 598 * vol_buf_beg - offset of the first buffer in volume 599 * vol_buf_end - offset of volume EOF (on buffer boundary) 600 * 601 * The memory log area allows a kernel to cache new records and data 602 * in memory without allocating space in the actual filesystem to hold 603 * the records and data. In the event that a filesystem becomes full, 604 * any records remaining in memory can be flushed to the memory log 605 * area. This allows the kernel to immediately return success. 606 * 607 * The buffer offset is a physical offset of zone-2 offset. The lower 608 * 52 bits of the zone-2 offset is added to the buffer offset of each 609 * volume to generate an actual I/O offset within the block device. 610 * 611 * NOTE: boot area and memory log are currently not used. 612 */ 613 614 /* 615 * These macros are only used by userspace when userspace commands either 616 * initialize or add a new HAMMER volume. 
617 */ 618 #define HAMMER_BOOT_MINBYTES (32*1024) 619 #define HAMMER_BOOT_NOMBYTES (64LL*1024*1024) 620 #define HAMMER_BOOT_MAXBYTES (256LL*1024*1024) 621 622 #define HAMMER_MEM_MINBYTES (256*1024) 623 #define HAMMER_MEM_NOMBYTES (1LL*1024*1024*1024) 624 #define HAMMER_MEM_MAXBYTES (64LL*1024*1024*1024) 625 626 struct hammer_volume_ondisk { 627 uint64_t vol_signature; /* HAMMER_FSBUF_VOLUME for a valid header */ 628 629 int64_t vol_bot_beg; /* offset of boot area */ 630 int64_t vol_mem_beg; /* offset of memory log */ 631 int64_t vol_buf_beg; /* offset of the first buffer in volume */ 632 int64_t vol_buf_end; /* offset of volume EOF (on buffer boundary) */ 633 int64_t vol_locked; /* not used */ 634 635 uuid_t vol_fsid; /* identify filesystem */ 636 uuid_t vol_fstype; /* identify filesystem type */ 637 char vol_name[64]; /* filesystem label, not a block device path */ 638 639 int32_t vol_no; /* volume number within filesystem */ 640 int32_t vol_count; /* number of volumes making up FS */ 641 642 uint32_t vol_version; /* version control information */ 643 hammer_crc_t vol_crc; /* header crc */ 644 uint32_t vol_flags; /* volume flags */ 645 uint32_t vol_rootvol; /* which volume is the root volume? */ 646 647 int32_t vol_reserved04; 648 int32_t vol_reserved05; 649 uint32_t vol_reserved06; 650 uint32_t vol_reserved07; 651 652 int32_t vol_blocksize; /* for statfs only */ 653 int32_t vol_reserved08; 654 int64_t vol_nblocks; /* total allocatable hammer bufs */ 655 656 /* 657 * These fields are initialized and space is reserved in every 658 * volume making up a HAMMER filesytem, but only the root volume 659 * contains valid data. Note that vol0_stat_bigblocks does not 660 * include big-blocks for freemap and undomap initially allocated 661 * by newfs_hammer(8). 
662 */ 663 int64_t vol0_stat_bigblocks; /* total big-blocks when fs is empty */ 664 int64_t vol0_stat_freebigblocks;/* number of free big-blocks */ 665 int64_t vol0_stat_bytes; /* for statfs only */ 666 int64_t vol0_stat_inodes; /* for statfs only */ 667 int64_t vol0_stat_records; /* total records in filesystem */ 668 hammer_off_t vol0_btree_root; /* B-Tree root */ 669 hammer_tid_t vol0_next_tid; /* highest partially synchronized TID */ 670 hammer_off_t vol0_unused03; 671 672 /* 673 * Blockmaps for zones. Not all zones use a blockmap. Note that 674 * the entire root blockmap is cached in the hammer_mount structure. 675 */ 676 struct hammer_blockmap vol0_blockmap[HAMMER_MAX_ZONES]; 677 678 /* 679 * Array of zone-2 addresses for undo FIFO. 680 */ 681 hammer_off_t vol0_undo_array[HAMMER_UNDO_LAYER2]; 682 }; 683 684 typedef struct hammer_volume_ondisk *hammer_volume_ondisk_t; 685 686 #define HAMMER_VOLF_NEEDFLUSH 0x0004 /* volume needs flush */ 687 688 #define HAMMER_VOL_CRCSIZE1 \ 689 offsetof(struct hammer_volume_ondisk, vol_crc) 690 #define HAMMER_VOL_CRCSIZE2 \ 691 (sizeof(struct hammer_volume_ondisk) - HAMMER_VOL_CRCSIZE1 - \ 692 sizeof(hammer_crc_t)) 693 694 #define HAMMER_VOL_VERSION_MIN 1 /* minimum supported version */ 695 #define HAMMER_VOL_VERSION_DEFAULT 6 /* newfs default version */ 696 #define HAMMER_VOL_VERSION_WIP 7 /* version >= this is WIP */ 697 #define HAMMER_VOL_VERSION_MAX 6 /* maximum supported version */ 698 699 #define HAMMER_VOL_VERSION_ONE 1 700 #define HAMMER_VOL_VERSION_TWO 2 /* new dirent layout (2.3+) */ 701 #define HAMMER_VOL_VERSION_THREE 3 /* new snapshot layout (2.5+) */ 702 #define HAMMER_VOL_VERSION_FOUR 4 /* new undo/flush (2.5+) */ 703 #define HAMMER_VOL_VERSION_FIVE 5 /* dedup (2.9+) */ 704 #define HAMMER_VOL_VERSION_SIX 6 /* DIRHASH_ALG1 */ 705 706 /* 707 * Translate a zone-2 address to physical address 708 */ 709 #define hammer_xlate_to_phys(volume, zone2_offset) \ 710 ((volume)->vol_buf_beg + \ 711 ((zone2_offset) & 
HAMMER_OFF_SHORT_MASK)) 712 713 /* 714 * Record types are fairly straightforward. The B-Tree includes the record 715 * type in its index sort. 716 */ 717 #define HAMMER_RECTYPE_UNKNOWN 0x0000 718 #define HAMMER_RECTYPE_LOWEST 0x0001 /* lowest record type avail */ 719 #define HAMMER_RECTYPE_INODE 0x0001 /* inode in obj_id space */ 720 #define HAMMER_RECTYPE_DATA 0x0010 721 #define HAMMER_RECTYPE_DIRENTRY 0x0011 722 #define HAMMER_RECTYPE_DB 0x0012 723 #define HAMMER_RECTYPE_EXT 0x0013 /* ext attributes */ 724 #define HAMMER_RECTYPE_FIX 0x0014 /* fixed attribute */ 725 #define HAMMER_RECTYPE_PFS 0x0015 /* PFS management */ 726 #define HAMMER_RECTYPE_SNAPSHOT 0x0016 /* Snapshot management */ 727 #define HAMMER_RECTYPE_CONFIG 0x0017 /* hammer cleanup config */ 728 #define HAMMER_RECTYPE_MAX 0xFFFF 729 730 #define HAMMER_RECTYPE_ENTRY_START (HAMMER_RECTYPE_INODE + 1) 731 #define HAMMER_RECTYPE_CLEAN_START HAMMER_RECTYPE_EXT 732 733 #define HAMMER_FIXKEY_SYMLINK 1 734 735 #define HAMMER_OBJTYPE_UNKNOWN 0 /* never exists on-disk as unknown */ 736 #define HAMMER_OBJTYPE_DIRECTORY 1 737 #define HAMMER_OBJTYPE_REGFILE 2 738 #define HAMMER_OBJTYPE_DBFILE 3 739 #define HAMMER_OBJTYPE_FIFO 4 740 #define HAMMER_OBJTYPE_CDEV 5 741 #define HAMMER_OBJTYPE_BDEV 6 742 #define HAMMER_OBJTYPE_SOFTLINK 7 743 #define HAMMER_OBJTYPE_PSEUDOFS 8 /* pseudo filesystem obj */ 744 #define HAMMER_OBJTYPE_SOCKET 9 745 746 /* 747 * HAMMER inode attribute data 748 * 749 * The data reference for a HAMMER inode points to this structure. Any 750 * modifications to the contents of this structure will result in a 751 * replacement operation. 752 * 753 * parent_obj_id is only valid for directories (which cannot be hard-linked), 754 * and specifies the parent directory obj_id. This field will also be set 755 * for non-directory inodes as a recovery aid, but can wind up holding 756 * stale information. 
However, since object id's are not reused, the worse 757 * that happens is that the recovery code is unable to use it. 758 * 759 * NOTE: Future note on directory hardlinks. We can implement a record type 760 * which allows us to point to multiple parent directories. 761 */ 762 struct hammer_inode_data { 763 uint16_t version; /* inode data version */ 764 uint16_t mode; /* basic unix permissions */ 765 uint32_t uflags; /* chflags */ 766 uint32_t rmajor; /* used by device nodes */ 767 uint32_t rminor; /* used by device nodes */ 768 uint64_t ctime; 769 int64_t parent_obj_id; /* parent directory obj_id */ 770 uuid_t uid; 771 uuid_t gid; 772 773 uint8_t obj_type; 774 uint8_t cap_flags; /* capability support flags (extension) */ 775 uint16_t reserved01; 776 uint32_t reserved02; /* RESERVED FOR POSSIBLE FUTURE BIRTHTIME */ 777 uint64_t nlinks; /* hard links */ 778 uint64_t size; /* filesystem object size */ 779 union { 780 char symlink[24]; /* HAMMER_INODE_BASESYMLEN */ 781 } ext; 782 uint64_t mtime; /* mtime must be second-to-last */ 783 uint64_t atime; /* atime must be last */ 784 }; 785 786 /* 787 * Neither mtime nor atime upates are CRCd by the B-Tree element. 788 * mtime updates have UNDO, atime updates do not. 789 */ 790 #define HAMMER_INODE_CRCSIZE \ 791 offsetof(struct hammer_inode_data, mtime) 792 793 #define HAMMER_INODE_DATA_VERSION 1 794 #define HAMMER_OBJID_ROOT 1 /* root inodes # */ 795 #define HAMMER_INODE_BASESYMLEN 24 /* see ext.symlink */ 796 797 /* 798 * Capability & implementation flags. 799 * 800 * HAMMER_INODE_CAP_DIR_LOCAL_INO - Use inode B-Tree localization 801 * for directory entries. Also see HAMMER_DIR_INODE_LOCALIZATION(). 
*/
#define HAMMER_INODE_CAP_DIRHASH_MASK	0x03	/* directory: hash algorithm */
#define HAMMER_INODE_CAP_DIRHASH_ALG0	0x00
#define HAMMER_INODE_CAP_DIRHASH_ALG1	0x01
#define HAMMER_INODE_CAP_DIRHASH_ALG2	0x02
#define HAMMER_INODE_CAP_DIRHASH_ALG3	0x03
#define HAMMER_INODE_CAP_DIR_LOCAL_INO	0x04	/* use inode localization */

/*
 * A HAMMER directory entry associates a HAMMER filesystem object with a
 * namespace.  It is possible to hook into a pseudo-filesystem (with its
 * own inode numbering space) in the filesystem by setting the high
 * 16 bits of the localization field.  The low 16 bits must be 0 and
 * are reserved for future use.
 *
 * Directory entries are indexed with a 128 bit namekey rather than an
 * offset.  A portion of the namekey is an iterator/randomizer to deal
 * with collisions.
 *
 * NOTE: leaf.base.obj_type from the related B-Tree leaf entry holds
 *	 the filesystem object type of obj_id, e.g. a den_type equivalent.
 *	 It is not stored in hammer_entry_data.
 *
 * NOTE: name field / the filename data reference is NOT terminated with \0.
 *
 * NOTE(review): HAMMER_ENTRY_SIZE(nlen) indexes name[] with an arbitrary
 *	 nlen, so name[16] appears to be only the start of variable-length
 *	 name data -- confirm against the callers.
 */
struct hammer_entry_data {
	int64_t obj_id;			/* object being referenced */
	uint32_t localization;		/* identify pseudo-filesystem */
	uint32_t reserved02;
	char name[16];			/* name (extended) */
};

/* Byte offset of the name field within struct hammer_entry_data. */
#define HAMMER_ENTRY_NAME_OFF	offsetof(struct hammer_entry_data, name[0])
/* Byte offset of name[nlen]: the fixed fields plus nlen bytes of name. */
#define HAMMER_ENTRY_SIZE(nlen)	offsetof(struct hammer_entry_data, name[nlen])

/*
 * Symlink data which does not fit in the inode is stored in a separate
 * FIX type record.
*/
struct hammer_symlink_data {
	char name[16];			/* name (extended) */
};

/* Byte offset of the name field within struct hammer_symlink_data. */
#define HAMMER_SYMLINK_NAME_OFF	offsetof(struct hammer_symlink_data, name[0])

/*
 * The root inode for the primary filesystem and root inode for any
 * pseudo-fs may be tagged with an optional data structure using
 * HAMMER_RECTYPE_PFS and localization id.  This structure allows
 * the node to be used as a mirroring master or slave.
 *
 * When operating as a slave, CD's into the node automatically become
 * read-only and as-of sync_end_tid.
 *
 * When operating as a master, the read PFSD info sets sync_end_tid to
 * the most recently flushed TID.
 *
 * sync_low_tid is not yet used but will represent the highest pruning
 * end-point, after which full history is available.
 *
 * We need to pack this structure, making it equally sized on both 32-bit and
 * 64-bit machines, as it is part of struct hammer_ioc_mrecord_pfs which is
 * sent over the wire in hammer mirror operations.  Only on 64-bit machines
 * does the size of this struct differ between the packed and non-packed
 * forms.  This leads us to the situation where old 64-bit systems (using the
 * non-packed structure), which were never able to mirror to/from 32-bit
 * systems, are now no longer able to mirror to/from newer 64-bit systems
 * (using the packed structure).
*/
struct hammer_pseudofs_data {
	hammer_tid_t sync_low_tid;	/* full history beyond this point */
	hammer_tid_t sync_beg_tid;	/* earliest tid w/ full history avail */
	hammer_tid_t sync_end_tid;	/* current synchronization point */
	uint64_t sync_beg_ts;		/* real-time of last completed sync */
	uint64_t sync_end_ts;		/* initiation of current sync cycle */
	uuid_t shared_uuid;		/* shared uuid (match required) */
	uuid_t unique_uuid;		/* unique uuid of this master/slave */
	int32_t reserved01;		/* reserved for future master_id */
	int32_t mirror_flags;		/* misc flags */
	char label[64];			/* filesystem space label */
	char snapshots[64];		/* softlink dir for pruning */
	int32_t reserved02;		/* was prune_{time,freq} */
	int32_t reserved03;		/* was reblock_{time,freq} */
	int32_t reserved04;		/* was snapshot_freq */
	int32_t prune_min;		/* do not prune recent history */
	int32_t prune_max;		/* do not retain history beyond here */
	int32_t reserved[16];
} __packed;

typedef struct hammer_pseudofs_data *hammer_pseudofs_data_t;

/*
 * NOTE(review): presumably bit values for mirror_flags -- confirm.  Note that
 * mirror_flags is declared int32_t while HAMMER_PFSD_DELETED occupies bit 31.
 */
#define HAMMER_PFSD_SLAVE	0x00000001
#define HAMMER_PFSD_DELETED	0x80000000

#define HAMMER_MAX_PFS		65536
#define HAMMER_MAX_PFSID	(HAMMER_MAX_PFS - 1)	/* evaluates to 65535 */
#define HAMMER_ROOT_PFSID	0

/*
 * Snapshot meta-data { Objid = HAMMER_OBJID_ROOT, Key = tid, rectype = SNAPSHOT }.
 *
 * Snapshot records replace the old <fs>/snapshots/<softlink> methodology.  Snapshot
 * records are mirrored but may be independently managed once they are laid down on
 * a slave.
 *
 * NOTE: The b-tree key is signed, the tid is not, so callers must still sort the
 *	 results.
*
 * NOTE: Reserved fields must be zero (as usual)
 */
struct hammer_snapshot_data {
	hammer_tid_t tid;		/* the snapshot TID itself (== key) */
	uint64_t ts;			/* real-time when snapshot was made */
	uint64_t reserved01;
	uint64_t reserved02;
	char label[64];			/* user-supplied description */
	uint64_t reserved03[4];
};

/*
 * Config meta-data { ObjId = HAMMER_OBJID_ROOT, Key = 0, rectype = CONFIG }.
 *
 * Used to store the hammer cleanup config.  This data is not mirrored.
 */
struct hammer_config_data {
	char text[1024];
};

/*
 * Rollup of the various structures embedded as record data.
 */
union hammer_data_ondisk {
	struct hammer_entry_data entry;
	struct hammer_inode_data inode;
	struct hammer_symlink_data symlink;
	struct hammer_pseudofs_data pfsd;
	struct hammer_snapshot_data snap;
	struct hammer_config_data config;
};

typedef union hammer_data_ondisk *hammer_data_ondisk_t;

/*
 * Ondisk layout of B-Tree related structures
 */
#include "hammer_btree.h"

/*
 * Evaluates to HAMMER_LOCALIZE_INODE when the HAMMER_INODE_CAP_DIR_LOCAL_INO
 * bit is set in ino_data->cap_flags, otherwise to HAMMER_LOCALIZE_MISC.
 * ino_data is used with ->, so it must be a pointer to a structure that has
 * a cap_flags member.
 */
#define HAMMER_DIR_INODE_LOCALIZATION(ino_data)				\
	(((ino_data)->cap_flags & HAMMER_INODE_CAP_DIR_LOCAL_INO) ?	\
	 HAMMER_LOCALIZE_INODE :					\
	 HAMMER_LOCALIZE_MISC)

#endif /* !VFS_HAMMER_DISK_H_ */