/*
 * Copyright (c) 2004 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/sys/journal.h,v 1.1 2005/02/28 17:40:51 dillon Exp $
 */

#ifndef _SYS_JOURNAL_H_
#define _SYS_JOURNAL_H_

/*
 * NOTE: this header does not include anything itself.  It relies on the
 * includer having already made the fixed-width integer types (u_int16_t,
 * int16_t, int32_t, int64_t) and struct timespec visible.
 */

/*
 * Physical file format (binary)
 *
 * All raw records are 128-bit aligned, but all record sizes are actual.
 * This means that any scanning code must 16-byte-align the recsize field
 * when calculating skips.  The top level raw record has a header and a
 * trailer to allow both forwards and backwards scanning of the journal.
 * The alignment requirement allows the worker thread FIFO reservation
 * API to operate efficiently, among other things.
 *
 * Logical data stream records are usually no larger than the journal's
 * in-memory FIFO, since the journal's transactional APIs return contiguous
 * blocks of buffer space and since logical stream records are used to avoid
 * stalls when concurrent blocking operations are being written to the journal.
 * Programs can depend on a logical stream record being a 'reasonable' size.
 *
 * Multiple logical data streams may operate concurrently in the journal,
 * reflecting the fact that the system may be executing multiple blocking
 * operations on the filesystem all at the same time.  These logical data
 * streams are short-lived transactional entities which use a 13 bit id
 * plus a transaction start bit, end bit, and abort bit.
 *
 * Stream identifiers in the 0x00-0xFF range are special and not used for
 * normal transactional commands.
 *
 * Stream id 0x00 indicates that no other streams should be active at that
 * point in the journal, which helps the journaling code detect corruption.
 *
 * Stream id 0x01 is used for pad.  Pads are used to align data on convenient
 * boundaries and to deal with dead space.
 *
 * Stream id 0x02 indicates a discontinuity in the streamed data and typically
 * contains information relating to the reason for the discontinuity.
 * JTYPE_ASSOCIATE and JTYPE_DISASSOCIATE are usually emplaced in stream 0x02.
 *
 * Stream id 0x03 may be used to annotate the journal with text comments
 * via mountctl commands.  This can be extremely useful to note situations
 * that may help with later recovery or audit operations.
 *
 * Stream id 0x04-0x7F are reserved by DragonFly for future protocol expansion.
 *
 * Stream id 0x80-0xFF may be used for third-party protocol expansion.
 *
 * Stream id's 0x0100-0x1FFF typically represent short-lived transactions
 * (i.e. an id may be reused once the previous use has completed).  The
 * journaling system runs through these id's sequentially which means that
 * the journaling code can handle up to 8192-256 = 7936 simultaneous
 * transactions at any given moment.
 *
 * The sequence number field is context-sensitive.  It is typically used by
 * a journaling stream to provide an incrementing counter and/or timestamp
 * so recovery utilities can determine if any data is missing.
 *
 * The check word in the trailer may be used to provide an integrity check
 * on the journaled data.  A value of 0 always means that no check word
 * has been calculated.
 *
 * The journal_rawrecbeg structure MUST be a multiple of 16 bytes.
 * The journal_rawrecend structure MUST be a multiple of 8 bytes.
 *
 * NOTE: PAD RECORD SPECIAL CASE.  Pad records are 16 bytes and have the
 * rawrecend structure overlaid on the sequence number field of the
 * rawrecbeg structure.  This is necessary because stream records are
 * 16 byte aligned, not 24 byte aligned, and dead space is not allowed.
 * So the pad record must fit into any dead space.
 */
struct journal_rawrecbeg {
	u_int16_t begmagic;	/* recovery scan, endianness detection */
	u_int16_t streamid;	/* start/stop bits and stream identifier */
	int32_t recsize;	/* stream data block (incls beg & end) */
	int64_t seqno;		/* sequence number or transaction id */
	/* ADDITIONAL DATA */
};

struct journal_rawrecend {
	u_int16_t endmagic;	/* recovery scan, endianness detection */
	u_int16_t check;	/* check word or 0 */
	int32_t recsize;	/* same as rawrecbeg->recsize, for rev scan */
};

/*
 * Constants for stream record magic numbers.  The incomplete magic
 * number code is used internally by the memory FIFO reservation API
 * and worker thread, allowing a block of space in the journaling
 * stream (aka a stream block) to be reserved and then populated without
 * stalling other threads doing their own reservation and population.
 */
#define JREC_BEGMAGIC		0x1234
#define JREC_ENDMAGIC		0xCDEF
#define JREC_INCOMPLETEMAGIC	0xFFFF

/*
 * Stream ids are 13 bits (JREC_STREAMID_MASK).  The top 3 bits of the
 * streamid field (JREC_STREAMCTL_MASK) are control bits.  BEGIN and END
 * specify when a new logical stream is being created or an existing
 * logical stream is being terminated; ABORTED is set on the ending record
 * of a stream that was terminated without writing out all the expected
 * data.  A single raw stream record will set both the BEGIN and END bits
 * if the entire transaction is encapsulated in a single stream record.
 */
#define JREC_STREAMCTL_MASK	0xE000
#define JREC_STREAMCTL_BEGIN	0x8000	/* start a new logical stream */
#define JREC_STREAMCTL_END	0x4000	/* terminate a logical stream */
#define JREC_STREAMCTL_ABORTED	0x2000	/* stream terminated by an abort */

#define JREC_STREAMID_MASK	0x1FFF
#define JREC_STREAMID_SYNCPT	(JREC_STREAMCTL_BEGIN|JREC_STREAMCTL_END|0x0000)
#define JREC_STREAMID_PAD	(JREC_STREAMCTL_BEGIN|JREC_STREAMCTL_END|0x0001)
#define JREC_STREAMID_DISCONT	0x0002	/* discontinuity */
#define JREC_STREAMID_ANNOTATE	0x0003	/* annotation */
				/* 0x0004-0x007F reserved by DragonFly */
				/* 0x0080-0x00FF for third party use */
#define JREC_STREAMID_JMIN	0x0100	/* lowest allowed general id */
#define JREC_STREAMID_JMAX	0x2000	/* (one past the highest allowed id) */

#define JREC_DEFAULTSIZE	64	/* reasonable initial reservation */

/*
 * Each logical journaling stream typically represents a transaction...
 * that is, a VFS operation.  The VFS operation is written out using
 * sub-records and may contain multiple, possibly nested sub-transactions.
 * Multiple sub-transactions occur when a VFS operation cannot be represented
 * by a single command.  This is typically the case when a journal is
 * configured to be reversible because UNDO sequences almost always have to
 * be specified in such cases.  For example, if you ftruncate() a file the
 * journal might have to write out a sequence of WRITE records representing
 * the lost data, otherwise the journal would not be reversible.
 * Sub-transactions within a particular stream do not have their own sequence
 * number field and thus may not be parallelized (the protocol is already
 * complex enough!).
 *
 * In order to support streaming operation with a limited buffer the recsize
 * field is allowed to be 0 for subrecords with the JMASK_NESTED bit set.
 * If this case occurs a scanner can determine that the recursion has ended
 * by detecting a nested subrecord with the JMASK_LAST bit set.  A scanner
 * may also set the field to the proper value after the fact to make later
 * operations more efficient.
 *
 * Note that the JMASK_LAST bit must be properly set even if the recsize
 * field is non-zero.  The recsize must always be properly specified for
 * 'leaf' subrecords, however in order to allow subsystems to potentially
 * allocate more data space than they use the protocol allows any 'dead'
 * space to be filled with JLEAF_PAD records.
 *
 * The recsize field may indicate data well past the size of the current
 * raw stream record.  That is, the scanner may have to glue together
 * multiple stream records with the same stream id to fully decode the
 * embedded subrecords.  In particular, a subrecord could very well represent
 * hundreds of megabytes of data (e.g. if a program were to do a
 * multi-megabyte write()) and be split up across thousands of raw streaming
 * records, possibly interlaced with other unrelated streams from other
 * unrelated processes.
 *
 * If a large sub-transaction is aborted the logical stream may be
 * terminated without writing out all the expected data.  When this occurs
 * the stream's ending record must also have the JREC_STREAMCTL_ABORTED bit
 * set.  However, scanners should still be robust enough to detect such
 * overflows even if the aborted bit is not set and consider them data
 * corruption.
 *
 * Aborts may also occur in the normal course of operations, especially once
 * the journaling API is integrated into the cache coherency API.  A normal
 * abort is issued by emplacing a JLEAF_ABORT record within the transaction
 * being aborted.  Such records must be the last record in the sub-transaction,
 * so JMASK_LAST is also usually set.  In a transaction with many
 * sub-transactions only those sub-transactions with an abort record are
 * aborted, the rest remain valid.  Abort records are considered S.O.P. for
 * two reasons:  First, limited memory buffer space may make it impossible
 * to delete the portion of the stream being aborted (the data may have
 * already been sent to the target).  Second, the journaling code will
 * eventually be used to support a cache coherency layer which may have to
 * abort operations as part of the cache coherency protocol.  Note that
 * subrecord aborts are different from stream record aborts.  Stream record
 * aborts are considered to be extraordinary situations while subrecord aborts
 * are S.O.P.
 */

/*
 * rectype is unsigned: it carries JMASK_NESTED (0x8000) in its top bit, and
 * a signed 16-bit field would sign-extend on integer promotion so that
 * (rectype == JTYPE_xxx) could never be true for any nested record type.
 * The size and layout of the structure are the same either way.
 */
struct journal_subrecord {
	u_int16_t rectype;	/* 2 control bits, 14 record type bits */
	int16_t reserved;	/* future use */
	int32_t recsize;	/* record size (mandatory if not NESTED) */
	/* ADDITIONAL DATA */
};

#define JMASK_NESTED		0x8000	/* data is a nested recursion */
#define JMASK_LAST		0x4000	/* last record in a sub-transaction */

#define JLEAF_PAD		0x0000
#define JLEAF_ABORT		0x0001
#define JTYPE_ASSOCIATE		0x0002
#define JTYPE_DISASSOCIATE	0x0003
#define JTYPE_UNDO		(JMASK_NESTED|0x0004)
#define JTYPE_AUDIT		(JMASK_NESTED|0x0005)

#define JTYPE_SETATTR		(JMASK_NESTED|0x0010)
#define JTYPE_WRITE		(JMASK_NESTED|0x0011)
#define JTYPE_PUTPAGES		(JMASK_NESTED|0x0012)
#define JTYPE_SETACL		(JMASK_NESTED|0x0013)
#define JTYPE_SETEXTATTR	(JMASK_NESTED|0x0014)
#define JTYPE_CREATE		(JMASK_NESTED|0x0015)
#define JTYPE_MKNOD		(JMASK_NESTED|0x0016)
#define JTYPE_LINK		(JMASK_NESTED|0x0017)
#define JTYPE_SYMLINK		(JMASK_NESTED|0x0018)
#define JTYPE_WHITEOUT		(JMASK_NESTED|0x0019)
#define JTYPE_REMOVE		(JMASK_NESTED|0x001A)
#define JTYPE_MKDIR		(JMASK_NESTED|0x001B)
#define JTYPE_RMDIR		(JMASK_NESTED|0x001C)
#define JTYPE_RENAME		(JMASK_NESTED|0x001D)

#define JTYPE_VATTR		(JMASK_NESTED|0x0100)
#define JTYPE_CRED		(JMASK_NESTED|0x0101)

/*
 * Low level record types
 */
#define JLEAF_FILEDATA		0x0401
#define JLEAF_PATH1		0x0402
#define JLEAF_PATH2		0x0403
#define JLEAF_PATH3		0x0404
#define JLEAF_PATH4		0x0405
#define JLEAF_UID		0x0406
#define JLEAF_GID		0x0407
#define JLEAF_MODES		0x0408
#define JLEAF_FFLAGS		0x0409
#define JLEAF_PID		0x040A
#define JLEAF_PPID		0x040B
#define JLEAF_COMM		0x040C
#define JLEAF_ATTRNAME		0x040D
#define JLEAF_RESERVED_0E	0x040E
#define JLEAF_RESERVED_0F	0x040F
#define JLEAF_SYMLINKDATA	0x0410
#define JLEAF_SEEKPOS		0x0411
#define JLEAF_INUM		0x0412
#define JLEAF_NLINK		0x0413
#define JLEAF_FSID		0x0414
#define JLEAF_SIZE		0x0415
#define JLEAF_ATIME		0x0416
#define JLEAF_MTIME		0x0417
#define JLEAF_CTIME		0x0418
#define JLEAF_GEN		0x0419
#define JLEAF_FLAGS		0x041A
#define JLEAF_UDEV		0x041B
#define JLEAF_FILEREV		0x041C

/*
 * Low level journal data file structures
 *
 * NOTE: embedded strings may use the full width of the field and thus
 * may not be 0-terminated.
 */
struct jleaf_path {
	char	path[4];	/* path from base of mount point */
	/* path is variable length and 0-terminated */
};

struct jleaf_vattr {
	int32_t	modes;
	int32_t fflags;
	struct timespec atime;
	struct timespec mtime;
	struct timespec ctime;
	int64_t inum;
};

struct jleaf_cred {
	int32_t	uid;
	int32_t	gid;
	int32_t	pid;
	int32_t	flags;		/* suid/sgid and other flags */
	char	line[8];	/* ttyname or other session identification */
	char	comm[8];	/* simplified command name for reference */
};

struct jleaf_ioinfo {
	int64_t offset;
};

#endif