xref: /dragonfly/sys/sys/journal.h (revision b1e9d17a)
1 /*
2  * Copyright (c) 2004 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  * $DragonFly: src/sys/sys/journal.h,v 1.7 2005/08/24 20:28:33 dillon Exp $
35  */
36 
37 #ifndef _SYS_JOURNAL_H_
38 #define _SYS_JOURNAL_H_
39 
40 /*
41  * Physical file format (binary)
42  *
43  * All raw records are 128-bit aligned, but all record sizes are actual.
44  * This means that any scanning code must 16-byte-align the recsize field
45  * when calculating skips.  The top level raw record has a header and a
46  * trailer to allow both forwards and backwards scanning of the journal.
47  * The alignment requirement allows the worker thread FIFO reservation
48  * API to operate efficiently, among other things.
49  *
50  * Logical data stream records are usually no larger than the journal's
51  * in-memory FIFO, since the journal's transactional APIs return contiguous
52  * blocks of buffer space and since logical stream records are used to avoid
53  * stalls when concurrent blocking operations are being written to the journal.
54  * Programs can depend on a logical stream record being a 'reasonable' size.
55  *
56  * Multiple logical data streams may operate concurrently in the journal,
57  * reflecting the fact that the system may be executing multiple blocking
58  * operations on the filesystem all at the same time.  These logical data
59  * streams are short-lived transactional entities which use a 13 bit id
60  * plus a transaction start bit, end bit, and abort bit.
61  *
62  * Stream identifiers in the 0x00-0xFF range are special and not used for
63  * normal transactional commands.
64  *
65  * Stream id 0x00 indicates that no other streams should be active at that
66  * point in the journal, which helps the journaling code detect corruption.
67  *
68  * Stream id 0x01 is used for pad.  Pads are used to align data on convenient
69  * boundaries and to deal with dead space.
70  *
71  * Stream id 0x02 indicates a discontinuity in the streamed data and typically
72  * contains information relating to the reason for the discontinuity.
73  * JTYPE_ASSOCIATE and JTYPE_DISASSOCIATE are usually emplaced in stream 0x02.
74  *
75  * Stream id 0x03 may be used to annotate the journal with text comments
76  * via mountctl commands.  This can be extremely useful to note situations
77  * that may help with later recovery or audit operations.
78  *
79  * Stream ids 0x04 (ACK) and 0x05 (RESTART) are assigned; 0x06-0x7F are reserved by DragonFly.
80  *
81  * Stream id 0x80-0xFF may be used for third-party protocol expansion.
82  *
83  * Stream id's 0x0100-0x1FFF typically represent short-lived transactions
84  * (i.e. an id may be reused once the previous use has completed).  The
85  * journaling system runs through these id's sequentially which means that
86  * the journaling code can handle up to 8192-256 = 7936 simultaneous
87  * transactions at any given moment.
88  *
89  * The sequence number field is context-sensitive.  It is typically used by
90  * a journaling stream to provide an incrementing counter and/or timestamp
91  * so recovery utilities can determine if any data is missing.
92  *
93  * The check word in the trailer may be used to provide an integrity check
94  * on the journaled data.  A value of 0 always means that no check word
95  * has been calculated.
96  *
97  * The journal_rawrecbeg structure MUST be a multiple of 16 bytes.
98  * The journal_rawrecend structure MUST be a multiple of 8 bytes.
99  *
100  * NOTE: PAD RECORD SPECIAL CASE.  Pad records are 16 bytes and have the
101  * rawrecend structure overlaid on the sequence number field of the
102  * rawrecbeg structure.  This is necessary because stream records are
103  * 16 byte aligned, not 24 byte aligned, and dead space is not allowed.
104  * So the pad record must fit into any dead space.
105  */
/*
 * Header placed at the beginning of every raw journal record.
 *
 * The fixed fields total 16 bytes (2 + 2 + 4 + 8).  The on-disk format
 * requires this structure to remain a multiple of 16 bytes, so do not
 * add or resize fields without revisiting the record alignment rules.
 * Record payload follows the header directly.
 */
struct journal_rawrecbeg {
	u_int16_t begmagic;	/* recovery scan, endianness detection */
	u_int16_t streamid;	/* start/stop bits and stream identifier */
	int32_t recsize;	/* stream data block size (includes beg & end) */
	int64_t transid;	/* sequence number or transaction id */
	/* ADDITIONAL DATA */
};
113 
/*
 * Trailer placed at the end of every raw journal record.
 *
 * The fixed fields total 8 bytes (2 + 2 + 4); the on-disk format
 * requires this structure to remain a multiple of 8 bytes.  The size is
 * repeated here so the journal can be scanned backwards.
 */
struct journal_rawrecend {
	u_int16_t endmagic;	/* recovery scan, endianness detection */
	u_int16_t check;	/* check word, or 0 if none was calculated */
	int32_t recsize;	/* same as rawrecbeg->recsize, for rev scan */
};
119 
120 struct journal_ackrecord {
121 	struct journal_rawrecbeg	rbeg;
122 	int32_t				filler0;
123 	int32_t				filler1;
124 	struct journal_rawrecend	rend;
125 };
126 
/*
 * Constants for stream record magic numbers.
 *
 * JREC_BEGMAGIC and JREC_ENDMAGIC are the values stored in the begmagic
 * and endmagic fields of the raw record header and trailer.
 *
 * The incomplete magic number code is used internally by the memory
 * FIFO reservation API and worker thread, allowing a block of space in
 * the journaling stream (aka a stream block) to be reserved and then
 * populated without stalling other threads doing their own reservation
 * and population.
 */
#define JREC_BEGMAGIC		0x1234
#define JREC_ENDMAGIC		0xCDEF
#define JREC_INCOMPLETEMAGIC	0xFFFF
137 
/*
 * Layout of the 16-bit streamid field.
 *
 * The low 13 bits (JREC_STREAMID_MASK) hold the stream id.  The top
 * 3 bits (JREC_STREAMCTL_MASK) are control bits that specify when a new
 * logical stream is being created, when an existing logical stream is
 * being terminated, and whether the stream was aborted.  A single raw
 * stream record will set both the BEGIN and END bits if the entire
 * transaction is encapsulated in a single stream record.
 */
#define JREC_STREAMCTL_MASK	0xE000
#define JREC_STREAMCTL_BEGIN	0x8000	/* start a new logical stream */
#define JREC_STREAMCTL_END	0x4000	/* terminate a logical stream */
#define JREC_STREAMCTL_ABORTED	0x2000	/* stream terminated by an abort */

#define JREC_STREAMID_MASK	0x1FFF
#define JREC_STREAMID_SYNCPT	(JREC_STREAMCTL_BEGIN|JREC_STREAMCTL_END|0x0000)
#define JREC_STREAMID_PAD	(JREC_STREAMCTL_BEGIN|JREC_STREAMCTL_END|0x0001)
#define JREC_STREAMID_DISCONT	0x0002	/* discontinuity */
#define JREC_STREAMID_ANNOTATE	0x0003	/* annotation */
#define JREC_STREAMID_ACK	0x0004	/* acknowledgement */
#define JREC_STREAMID_RESTART	0x0005	/* discontinuity - journal restart */
				/* 0x0006-0x007F reserved by DragonFly */
				/* 0x0080-0x00FF for third party use */
#define JREC_STREAMID_JMIN	0x0100	/* lowest allowed general id */
#define JREC_STREAMID_JMAX	0x2000	/* (one past the highest allowed id) */

#define JREC_DEFAULTSIZE	64	/* reasonable initial reservation */
162 
163 /*
164  * Each logical journaling stream typically represents a transaction...
165  * that is, a VFS operation.  The VFS operation is written out using
166  * sub-records and may contain multiple, possibly nested sub-transactions.
167  * Multiple sub-transactions occur when a VFS operation cannot be represented
168  * by a single command.  This is typically the case when a journal is
169  * configured to be reversible because UNDO sequences almost always have to
170  * be specified in such cases.  For example, if you ftruncate() a file the
171  * journal might have to write out a sequence of WRITE records representing
172  * the lost data, otherwise the journal would not be reversible.
173  * Sub-transactions within a particular stream do not have their own sequence
174  * number field and thus may not be parallelized (the protocol is already
175  * complex enough!).
176  *
177  * In order to support streaming operation with a limited buffer the recsize
178  * field is allowed to be 0 for subrecords with the JMASK_NESTED bit set.
179  * If this case occurs a scanner can determine that the recursion has ended
180  * by detecting a nested subrecord with the JMASK_LAST bit set.  A scanner
181  * may also set the field to the proper value after the fact to make later
182  * operations more efficient.
183  *
184  * Note that this bit must be properly set even if the recsize field is
185  * non-zero.  The recsize must always be properly specified for 'leaf'
186  * subrecords, however in order to allow subsystems to potentially allocate
187  * more data space than they use the protocol allows any 'dead' space to be
188  * filled with JLEAF_PAD records.
189  *
190  * The recsize field may indicate data well past the size of the current
191  * raw stream record.  That is, the scanner may have to glue together
192  * multiple stream records with the same stream id to fully decode the
193  * embedded subrecords.  In particular, a subrecord could very well represent
194  * hundreds of megabytes of data (e.g. if a program were to do a
195  * multi-megabyte write()) and be split up across thousands of raw streaming
196  * records, possibly interlaced with other unrelated streams from other
197  * unrelated processes.
198  *
199  * If a large sub-transaction is aborted the logical stream may be
200  * terminated without writing out all the expected data.  When this occurs
201  * the stream's ending record must also have the JREC_STREAMCTL_ABORTED bit
202  * set.  However, scanners should still be robust enough to detect such
203  * overflows even if the aborted bit is not set and consider them data
204  * corruption.
205  *
206  * Aborts may also occur in the normal course of operations, especially once
207  * the journaling API is integrated into the cache coherency API.  A normal
208  * abort is issued by emplacing a JLEAF_ABORT record within the transaction
209  * being aborted.  Such records must be the last record in the sub-transaction,
210  * so JMASK_LAST is also usually set.  In a transaction with many
211  * sub-transactions only those sub-transactions with an abort record are
212  * aborted, the rest remain valid.  Abort records are considered S.O.P. for
213  * two reasons:  First, limited memory buffer space may make it impossible
214  * to delete the portion of the stream being aborted (the data may have
215  * already been sent to the target).  Second, the journaling code will
216  * eventually be used to support a cache coherency layer which may have to
217  * abort operations as part of the cache coherency protocol.  Note that
218  * subrecord aborts are different from stream record aborts.  Stream record
219  * aborts are considered to be extraordinary situations while subrecord aborts
220  * are S.O.P.
221  */
222 
/*
 * Header of a sub-record embedded in a logical journaling stream.
 *
 * The fixed fields total 8 bytes (2 + 2 + 4); the sub-record payload
 * follows directly.  recsize may be 0 only for records that have the
 * JMASK_NESTED bit set in rectype; leaf records must always carry their
 * real size.
 */
struct journal_subrecord {
	u_int16_t rectype;	/* 2 control bits, 14 record type bits */
	int16_t reserved;	/* future use */
	int32_t recsize;	/* record size (mandatory if not NESTED) */
	/* ADDITIONAL DATA */
};
229 
/*
 * Control bits and masks for journal_subrecord.rectype.
 *
 * JTYPE_MASK strips only the JMASK_LAST bit, so the NESTED bit remains
 * part of the record type when comparing against the JTYPE_* values.
 */
#define JMASK_NESTED		0x8000	/* data is a nested recursion */
#define JMASK_LAST		0x4000	/* last record of a (sub-)transaction */
#define JMASK_SUBRECORD		0x0400	/* NOTE(review): set in every 0x04xx */
					/* low level leaf type -- confirm use */
#define JTYPE_MASK		(~JMASK_LAST)

/*
 * Basic record types.  PAD and ABORT are leaf records; UNDO and AUDIT
 * carry nested sub-records.
 */
#define JLEAF_PAD		0x0000
#define JLEAF_ABORT		0x0001
#define JTYPE_ASSOCIATE		0x0002
#define JTYPE_DISASSOCIATE	0x0003
#define JTYPE_UNDO		(JMASK_NESTED|0x0004)
#define JTYPE_AUDIT		(JMASK_NESTED|0x0005)

/*
 * Nested record types named after the filesystem operation they record.
 */
#define JTYPE_SETATTR		(JMASK_NESTED|0x0010)
#define JTYPE_WRITE		(JMASK_NESTED|0x0011)
#define JTYPE_PUTPAGES		(JMASK_NESTED|0x0012)
#define JTYPE_SETACL		(JMASK_NESTED|0x0013)
#define JTYPE_SETEXTATTR	(JMASK_NESTED|0x0014)
#define JTYPE_CREATE		(JMASK_NESTED|0x0015)
#define JTYPE_MKNOD		(JMASK_NESTED|0x0016)
#define JTYPE_LINK		(JMASK_NESTED|0x0017)
#define JTYPE_SYMLINK		(JMASK_NESTED|0x0018)
#define JTYPE_WHITEOUT		(JMASK_NESTED|0x0019)
#define JTYPE_REMOVE		(JMASK_NESTED|0x001A)
#define JTYPE_MKDIR		(JMASK_NESTED|0x001B)
#define JTYPE_RMDIR		(JMASK_NESTED|0x001C)
#define JTYPE_RENAME		(JMASK_NESTED|0x001D)

/*
 * Nested record types grouping file attribute and credential leaves.
 */
#define JTYPE_VATTR		(JMASK_NESTED|0x0100)
#define JTYPE_CRED		(JMASK_NESTED|0x0101)
259 
/*
 * Low level (leaf) record types.
 *
 * Every value is in the 0x04xx range, i.e. has the 0x0400
 * (JMASK_SUBRECORD) bit set, and none has the nested bit (0x8000) set.
 * 0x040F is reserved.
 */
#define JLEAF_FILEDATA		0x0401
#define JLEAF_PATH1		0x0402
#define JLEAF_PATH2		0x0403
#define JLEAF_PATH3		0x0404
#define JLEAF_PATH4		0x0405
#define JLEAF_UID		0x0406
#define JLEAF_GID		0x0407
#define JLEAF_MODES		0x0408
#define JLEAF_FFLAGS		0x0409
#define JLEAF_PID		0x040A
#define JLEAF_PPID		0x040B
#define JLEAF_COMM		0x040C
#define JLEAF_ATTRNAME		0x040D
#define JLEAF_PATH_REF		0x040E
#define JLEAF_RESERVED_0F	0x040F
#define JLEAF_SYMLINKDATA	0x0410
#define JLEAF_SEEKPOS		0x0411
#define JLEAF_INUM		0x0412
#define JLEAF_NLINK		0x0413
#define JLEAF_FSID		0x0414
#define JLEAF_SIZE		0x0415
#define JLEAF_ATIME		0x0416
#define JLEAF_MTIME		0x0417
#define JLEAF_CTIME		0x0418
#define JLEAF_GEN		0x0419
#define JLEAF_FLAGS		0x041A
#define JLEAF_UDEV		0x041B
#define JLEAF_FILEREV		0x041C
#define JLEAF_VTYPE		0x041D
#define JLEAF_ERROR		0x041E
293 
294 /*
295  * Low level journal data file structures
296  *
297  * NOTE: embedded strings may use the full width of the field and thus
298  * may not be 0-terminated.
299  */
/*
 * Payload of a path leaf record.
 *
 * The declared array is only a placeholder: the path is variable length
 * and 0-terminated, extending past the end of the structure.
 */
struct jleaf_path {
	char	path[4];	/* path from base of mount point */
	/* path is variable length and 0-terminated */
};
304 
/*
 * File attribute block: modes, file flags, the three file timestamps
 * and the inode number.
 *
 * NOTE(review): struct timespec is a platform type, so the size and
 * layout of this structure presumably vary between ABIs -- confirm
 * before relying on it as a fixed on-disk format.
 */
struct jleaf_vattr {
	int32_t	modes;		/* file modes */
	int32_t fflags;		/* file flags */
	struct timespec atime;	/* access time */
	struct timespec mtime;	/* modification time */
	struct timespec ctime;	/* change time */
	int64_t inum;		/* inode number */
};
313 
/*
 * Credential block identifying who performed an operation.
 *
 * The fixed fields total 32 bytes (4 * 4 + 8 + 8).  The embedded
 * strings may use the full width of their field and thus may not be
 * 0-terminated; readers must bound accesses by the field size.
 */
struct jleaf_cred {
	int32_t	uid;		/* user id */
	int32_t gid;		/* group id */
	int32_t pid;		/* process id */
	int32_t flags;		/* suid/sgid and other flags */
	char	line[8];	/* ttyname or other session identification */
	char	comm[8];	/* simplified command name for reference */
};
322 
/*
 * I/O position information: a single 64-bit offset.
 *
 * NOTE(review): presumably the file offset of the accompanying data
 * (cf. JLEAF_SEEKPOS / JLEAF_FILEDATA) -- confirm against the writer.
 */
struct jleaf_ioinfo {
	int64_t offset;		/* 64-bit offset */
};
326 
327 #endif
328