/*
 * Copyright (c) 2004 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/sys/mountctl.h,v 1.5 2005/01/09 03:04:53 dillon Exp $
 */

/*
 * Include guard: this header defines structure types, so it must be
 * safe to include more than once from the same translation unit.
 */
#ifndef _SYS_MOUNTCTL_H_
#define _SYS_MOUNTCTL_H_

/*
 * The structures below use fixed-width integer types (int16_t, int32_t,
 * int64_t, u_int16_t) and embed a struct timeval by value, so pull in
 * the headers that define them rather than relying on the includer.
 */
#include <sys/types.h>
#include <sys/time.h>

/*
 * General constants
 */

#define JIDMAX		32	/* id string buf[] size (incls \0) */

/*
 * Data structures for the journaling API
 */

#define MOUNTCTL_INSTALL_VFS_JOURNAL	1
#define MOUNTCTL_REMOVE_VFS_JOURNAL	2
#define MOUNTCTL_RESYNC_VFS_JOURNAL	3
#define MOUNTCTL_STATUS_VFS_JOURNAL	4

#define MOUNTCTL_INSTALL_BLK_JOURNAL	8
#define MOUNTCTL_REMOVE_BLK_JOURNAL	9
#define MOUNTCTL_RESYNC_BLK_JOURNAL	10
#define MOUNTCTL_STATUS_BLK_JOURNAL	11

struct mountctl_install_journal {
	char	id[JIDMAX];
	int	flags;		/* journaling flags */
	int	unused01;
	int64_t	membufsize;	/* backing store */
	int64_t	swapbufsize;	/* backing store */
	int64_t	transid;	/* starting with specified transaction id */
	int64_t	unused02;
	int	stallwarn;	/* stall warning (seconds) */
	int	stallerror;	/* stall error (seconds) */
	int	unused03;
	int	unused04;
};

/*
 * Journaling flags.  The _REVERSABLE spelling is part of the public
 * name and is kept as-is for source compatibility.
 */
#define MC_JOURNAL_ACTIVE		0x00000001	/* journal is active */
#define MC_JOURNAL_STOP_REQ		0x00000002	/* stop request pend */
#define MC_JOURNAL_STOP_IMM		0x00000004	/* STOP+trash fifo */
#define MC_JOURNAL_WWAIT		0x00000040	/* write stall */
#define MC_JOURNAL_WANT_AUDIT		0x00010000	/* audit trail */
#define MC_JOURNAL_WANT_REVERSABLE	0x00020000	/* reversible stream */

struct mountctl_remove_journal {
	char	id[JIDMAX];
	int	flags;
};

/*
 * Journal removal flags.  The _ASSYNC spelling is part of the public
 * name and is kept as-is for source compatibility.
 */
#define MC_JOURNAL_REMOVE_TRASH		0x00000001	/* data -> trash */
#define MC_JOURNAL_REMOVE_ASSYNC	0x00000002	/* asynchronous op */

struct mountctl_status_journal {
	char	id[JIDMAX];
	int	index;
};

/*
 * Special (negative) index values.  Parenthesized so the expansion
 * stays a single operand in any expression context.
 */
#define MC_JOURNAL_INDEX_ALL		(-2)
#define MC_JOURNAL_INDEX_ID		(-1)

struct mountctl_journal_ret_status {
	int	recsize;
	char	id[JIDMAX];
	int	index;
	int	flags;
	int64_t	membufsize;
	int64_t	membufused;
	int64_t	membufiopend;
	int64_t	swapbufsize;
	int64_t	swapbufused;
	int64_t	swapbufiopend;
	int64_t	transidstart;
	int64_t	transidcurrent;
	int64_t	transidiopend;
	int64_t	transidacked;
	int64_t	bytessent;
	int64_t	bytesacked;
	struct timeval lastack;
};

#define MC_JOURNAL_STATUS_MORETOCOME	0x00000001

/*
 * Physical file format (binary)
 *
 * All raw records are 128-bit aligned, but all record sizes are actual.
 * This means that any scanning code must 16-byte-align the recsize field
 * when calculating skips.  The top level raw record has a header and a
 * trailer to allow both forwards and backwards scanning of the journal.
 * The alignment requirement allows the worker thread FIFO reservation
 * API to operate efficiently, among other things.
 *
 * Logical data stream records are usually no larger than the journal's
 * in-memory FIFO, since the journal's transactional APIs return contiguous
 * blocks of buffer space and since logical stream records are used to avoid
 * stalls when concurrent blocking operations are being written to the journal.
 * Programs can depend on a logical stream record being a 'reasonable' size.
 *
 * Multiple logical data streams may operate concurrently in the journal,
 * reflecting the fact that the system may be executing multiple blocking
 * operations on the filesystem all at the same time.  These logical data
 * streams are short-lived transactional entities which use a 13 bit id
 * plus a transaction start bit, end bit, and abort bit.
 *
 * Stream identifiers in the 0x00-0xFF range are special and not used for
 * normal transactional commands.
 *
 * Stream id 0x00 indicates that no other streams should be active at that
 * point in the journal, which helps the journaling code detect corruption.
 *
 * Stream id 0x01 is used for pad.  Pads are used to align data on convenient
 * boundaries and to deal with dead space.
 *
 * Stream id 0x02 indicates a discontinuity in the streamed data and typically
 * contains information relating to the reason for the discontinuity.
 * JTYPE_ASSOCIATE and JTYPE_DISASSOCIATE are usually emplaced in stream 0x02.
 *
 * Stream id 0x03 may be used to annotate the journal with text comments
 * via mountctl commands.  This can be extremely useful to note situations
 * that may help with later recovery or audit operations.
 *
 * Stream id 0x04-0x7F are reserved by DragonFly for future protocol expansion.
 *
 * Stream id 0x80-0xFF may be used for third-party protocol expansion.
 *
 * Stream id's 0x0100-0x1FFF typically represent short-lived transactions
 * (i.e. an id may be reused once the previous use has completed).  The
 * journaling system runs through these id's sequentially which means that
 * the journaling code can handle up to 8192-256 = 7936 simultaneous
 * transactions at any given moment.
 *
 * The sequence number field is context-sensitive.  It is typically used by
 * a journaling stream to provide an incrementing counter and/or timestamp
 * so recovery utilities can determine if any data is missing.
 *
 * The check word in the trailer may be used to provide an integrity check
 * on the journaled data.  A value of 0 always means that no check word
 * has been calculated.
 *
 * The journal_rawrecbeg structure MUST be a multiple of 16 bytes.
 * The journal_rawrecend structure MUST be a multiple of 8 bytes.
 *
 * NOTE: PAD RECORD SPECIAL CASE.  Pad records are 16 bytes and have the
 * rawrecend structure overlaid on the sequence number field of the
 * rawrecbeg structure.  This is necessary because stream records are
 * 16 byte aligned, not 24 byte aligned, and dead space is not allowed.
 * So the pad record must fit into any dead space.
 */
struct journal_rawrecbeg {
	u_int16_t begmagic;	/* recovery scan, endianness detection */
	u_int16_t streamid;	/* start/stop bits and stream identifier */
	int32_t	recsize;	/* stream data block (incls beg & end) */
	int64_t	seqno;		/* sequence number or transaction id */
	/* ADDITIONAL DATA */
};

struct journal_rawrecend {
	u_int16_t endmagic;	/* recovery scan, endianness detection */
	u_int16_t check;	/* check word or 0 */
	int32_t	recsize;	/* same as rawrecbeg->recsize, for rev scan */
};

/*
 * Constants for stream record magic numbers.  The incomplete magic
 * number code is used internally by the memory FIFO reservation API
 * and worker thread, allowing a block of space in the journaling
 * stream (aka a stream block) to be reserved and then populated without
 * stalling other threads doing their own reservation and population.
 */
#define JREC_BEGMAGIC		0x1234
#define JREC_ENDMAGIC		0xCDEF
#define JREC_INCOMPLETEMAGIC	0xFFFF

/*
 * Stream ids are 13 bits (JREC_STREAMID_MASK).  The top 3 bits of the
 * streamid field are control bits (JREC_STREAMCTL_MASK).  BEGIN and END
 * specify when a new logical stream is being created or an existing
 * logical stream is being terminated; ABORTED is set on the ending record
 * of a stream that was terminated without writing all expected data.
 * A single raw stream record will set both the BEGIN and END bits if the
 * entire transaction is encapsulated in a single stream record.
 */
#define JREC_STREAMCTL_MASK	0xE000
#define JREC_STREAMCTL_BEGIN	0x8000	/* start a new logical stream */
#define JREC_STREAMCTL_END	0x4000	/* terminate a logical stream */
#define JREC_STREAMCTL_ABORTED	0x2000

#define JREC_STREAMID_MASK	0x1FFF
#define JREC_STREAMID_SYNCPT	(JREC_STREAMCTL_BEGIN|JREC_STREAMCTL_END|0x0000)
#define JREC_STREAMID_PAD	(JREC_STREAMCTL_BEGIN|JREC_STREAMCTL_END|0x0001)
#define JREC_STREAMID_DISCONT	0x0002	/* discontinuity */
#define JREC_STREAMID_ANNOTATE	0x0003	/* annotation */
				/* 0x0004-0x007F reserved by DragonFly */
				/* 0x0080-0x00FF for third party use */
#define JREC_STREAMID_JMIN	0x0100	/* lowest allowed general id */
#define JREC_STREAMID_JMAX	0x2000	/* (one past the highest allowed id) */

#define JREC_DEFAULTSIZE	64	/* reasonable initial reservation */

/*
 * Each logical journaling stream typically represents a transaction...
 * that is, a VFS operation.  The VFS operation is written out using
 * sub-records and may contain multiple, possibly nested sub-transactions.
 * Multiple sub-transactions occur when a VFS operation cannot be represented
 * by a single command.  This is typically the case when a journal is
 * configured to be reversible because UNDO sequences almost always have to
 * be specified in such cases.  For example, if you ftruncate() a file the
 * journal might have to write out a sequence of WRITE records representing
 * the lost data, otherwise the journal would not be reversible.
 * Sub-transactions within a particular stream do not have their own sequence
 * number field and thus may not be parallelized (the protocol is already
 * complex enough!).
 *
 * In order to support streaming operation with a limited buffer the recsize
 * field is allowed to be 0 for subrecords with the JMASK_NESTED bit set.
 * If this case occurs a scanner can determine that the recursion has ended
 * by detecting a nested subrecord with the JMASK_LAST bit set.  A scanner
 * may also set the field to the proper value after the fact to make later
 * operations more efficient.
 *
 * Note that this bit must be properly set even if the recsize field is
 * non-zero.  The recsize must always be properly specified for 'leaf'
 * subrecords, however in order to allow subsystems to potentially allocate
 * more data space than they use the protocol allows any 'dead' space to be
 * filled with JLEAF_PAD records.
 *
 * The recsize field may indicate data well past the size of the current
 * raw stream record.  That is, the scanner may have to glue together
 * multiple stream records with the same stream id to fully decode the
 * embedded subrecords.  In particular, a subrecord could very well represent
 * hundreds of megabytes of data (e.g. if a program were to do a
 * multi-megabyte write()) and be split up across thousands of raw streaming
 * records, possibly interlaced with other unrelated streams from other
 * unrelated processes.
 *
 * If a large sub-transaction is aborted the logical stream may be
 * terminated without writing out all the expected data.  When this occurs
 * the stream's ending record must also have the JREC_STREAMCTL_ABORTED bit
 * set.  However, scanners should still be robust enough to detect such
 * overflows even if the aborted bit is not set and consider them data
 * corruption.
 *
 * Aborts may also occur in the normal course of operations, especially once
 * the journaling API is integrated into the cache coherency API.  A normal
 * abort is issued by emplacing a JLEAF_ABORT record within the transaction
 * being aborted.  Such records must be the last record in the sub-transaction,
 * so JMASK_LAST is also usually set.  In a transaction with many
 * sub-transactions only those sub-transactions with an abort record are
 * aborted, the rest remain valid.  Abort records are considered S.O.P. for
 * two reasons:  First, limited memory buffer space may make it impossible
 * to delete the portion of the stream being aborted (the data may have
 * already been sent to the target).  Second, the journaling code will
 * eventually be used to support a cache coherency layer which may have to
 * abort operations as part of the cache coherency protocol.  Note that
 * subrecord aborts are different from stream record aborts.  Stream record
 * aborts are considered to be extraordinary situations while subrecord aborts
 * are S.O.P.
 */

/*
 * NOTE(review): rectype is a signed 16-bit field while JMASK_NESTED is
 * 0x8000.  Testing bits with '&' works after integer promotion, but a
 * stored value with the NESTED bit set reads back negative, so a direct
 * '==' against a JTYPE_* constant would not match -- confirm consumers
 * mask or cast to an unsigned 16-bit type before comparing.
 */
struct journal_subrecord {
	int16_t	rectype;	/* 2 control bits, 14 record type bits */
	int16_t	reserved;	/* future use */
	int32_t	recsize;	/* record size (mandatory if not NESTED) */
	/* ADDITIONAL DATA */
};

#define JMASK_NESTED		0x8000	/* data is a nested recursion */
#define JMASK_LAST		0x4000

#define JLEAF_PAD		0x0000
#define JLEAF_ABORT		0x0001
#define JTYPE_ASSOCIATE		0x0002
#define JTYPE_DISASSOCIATE	0x0003
#define JTYPE_UNDO		(JMASK_NESTED|0x0004)
#define JTYPE_AUDIT		(JMASK_NESTED|0x0005)

#define JTYPE_SETATTR		(JMASK_NESTED|0x0010)
#define JTYPE_WRITE		(JMASK_NESTED|0x0011)
#define JTYPE_PUTPAGES		(JMASK_NESTED|0x0012)
#define JTYPE_SETACL		(JMASK_NESTED|0x0013)
#define JTYPE_SETEXTATTR	(JMASK_NESTED|0x0014)
#define JTYPE_CREATE		(JMASK_NESTED|0x0015)
#define JTYPE_MKNOD		(JMASK_NESTED|0x0016)
#define JTYPE_LINK		(JMASK_NESTED|0x0017)
#define JTYPE_SYMLINK		(JMASK_NESTED|0x0018)
#define JTYPE_WHITEOUT		(JMASK_NESTED|0x0019)
#define JTYPE_REMOVE		(JMASK_NESTED|0x001A)
#define JTYPE_MKDIR		(JMASK_NESTED|0x001B)
#define JTYPE_RMDIR		(JMASK_NESTED|0x001C)
#define JTYPE_RENAME		(JMASK_NESTED|0x001D)

/*
 * Low level record types
 */
#define JLEAF_FILEDATA		0x0401
#define JLEAF_PATH1		0x0402
#define JLEAF_PATH2		0x0403
#define JLEAF_PATH3		0x0404
#define JLEAF_PATH4		0x0405
#define JLEAF_UID		0x0406
#define JLEAF_GID		0x0407
#define JLEAF_MODES		0x0408
#define JLEAF_FFLAGS		0x0409
#define JLEAF_PID		0x040A
#define JLEAF_PPID		0x040B
#define JLEAF_COMM		0x040C
#define JLEAF_RESERVED_0D	0x040D
#define JLEAF_RESERVED_0E	0x040E
#define JLEAF_RESERVED_0F	0x040F
#define JLEAF_SYMLINKDATA	0x0410
#define JLEAF_SEEKPOS		0x0411
#define JLEAF_INUM		0x0412

#if defined(_KERNEL) || defined(_KERNEL_STRUCTURES)

/*
 * The kernel structures below use TAILQ_ENTRY() and embed a struct thread
 * by value.  This header does not include their definitions itself; the
 * includer must make TAILQ_ENTRY() (<sys/queue.h>) and the complete
 * struct thread type visible before this point.
 */

/*
 * Support structures for the generic journaling structure
 */
struct journal_memfifo {
	int	size;		/* size (power of two) */
	int	mask;		/* index mask (size - 1) */
	int	rindex;		/* stream reader index (track fd writes) */
	int	xindex;		/* last acked / reader restart */
	int	windex;		/* stream writer index */
	char	*membase;	/* memory buffer representing the FIFO */
};

/*
 * Generic journaling structure attached to a mount point.
 */
struct journal {
	TAILQ_ENTRY(journal) jentry;
	struct file	*fp;
	char		id[JIDMAX];
	int		flags;		/* journaling flags */
	int64_t		transid;
	int64_t		total_acked;
	struct journal_memfifo fifo;
	struct thread	thread;
};

/*
 * The jrecord structure is used to build a journaling transaction.  Since
 * a single journaling transaction might encompass very large buffers it
 * is possible for multiple transactions to be written out to the FIFO
 * in parallel and in piecemeal.
 *
 * NOTE(review): streamid here is int16_t while journal_rawrecbeg.streamid
 * is u_int16_t and the JREC_STREAMCTL_BEGIN bit is 0x8000 -- confirm that
 * the sign difference is harmless where the two are copied or compared.
 */
struct jrecord {
	struct journal	*jo;
	char		*stream_ptr;
	int		stream_residual;
	int		stream_reserved;
	struct journal_rawrecbeg *rawp;
	struct journal_subrecord *parent;
	struct journal_subrecord *last;
	int16_t		streamid;
	int		pushcount;
	int		pushptrgood;
	int		residual;
	int		residual_align;
};

#endif	/* _KERNEL || _KERNEL_STRUCTURES */

#endif	/* !_SYS_MOUNTCTL_H_ */