xref: /dragonfly/sys/sys/mountctl.h (revision 5153f92b)
1 /*
2  * Copyright (c) 2004 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  * $DragonFly: src/sys/sys/mountctl.h,v 1.5 2005/01/09 03:04:53 dillon Exp $
35  */
36 
37 /*
38  * General constants
39  */
40 
/* Size of a journal id string buf[], terminating \0 included. */
#define JIDMAX		32
42 
43 /*
44  * Data structures for the journaling API
45  */
46 
/*
 * mountctl operation codes for the journaling API.  The *_VFS_JOURNAL
 * codes occupy 1-4 and the *_BLK_JOURNAL codes occupy 8-11; 5-7 are
 * not assigned here.
 */
#define MOUNTCTL_INSTALL_VFS_JOURNAL	1	/* VFS journal: install */
#define MOUNTCTL_REMOVE_VFS_JOURNAL	2	/* VFS journal: remove */
#define MOUNTCTL_RESYNC_VFS_JOURNAL	3	/* VFS journal: resync */
#define MOUNTCTL_STATUS_VFS_JOURNAL	4	/* VFS journal: status */

#define MOUNTCTL_INSTALL_BLK_JOURNAL	8	/* BLK journal: install */
#define MOUNTCTL_REMOVE_BLK_JOURNAL	9	/* BLK journal: remove */
#define MOUNTCTL_RESYNC_BLK_JOURNAL	10	/* BLK journal: resync */
#define MOUNTCTL_STATUS_BLK_JOURNAL	11	/* BLK journal: status */
56 
57 struct mountctl_install_journal {
58 	char	id[JIDMAX];
59 	int	flags;		/* journaling flags */
60 	int	unused01;
61 	int64_t	membufsize;	/* backing store */
62 	int64_t	swapbufsize;	/* backing store */
63 	int64_t	transid;	/* starting with specified transaction id */
64 	int64_t unused02;
65 	int	stallwarn;	/* stall warning (seconds) */
66 	int	stallerror;	/* stall error (seconds) */
67 	int	unused03;
68 	int	unused04;
69 };
70 
/*
 * Journal flag bits.  Each constant is a distinct single bit; the
 * MC_JOURNAL_WANT_* bits live in the upper half of the 32 bit word.
 */
#define MC_JOURNAL_ACTIVE		0x00000001	/* journal is active */
#define MC_JOURNAL_STOP_REQ		0x00000002	/* stop request pending */
#define MC_JOURNAL_STOP_IMM		0x00000004	/* STOP + trash fifo */
#define MC_JOURNAL_WWAIT		0x00000040	/* write stall */
#define MC_JOURNAL_WANT_AUDIT		0x00010000	/* audit trail */
#define MC_JOURNAL_WANT_REVERSABLE	0x00020000	/* reversible stream */
77 
78 struct mountctl_remove_journal {
79 	char	id[JIDMAX];
80 	int	flags;
81 };
82 
/*
 * Journal removal flag bits.  The ASSYNC spelling is part of the
 * established macro name and is kept as-is.
 */
#define MC_JOURNAL_REMOVE_TRASH		0x00000001	/* data -> trash */
#define MC_JOURNAL_REMOVE_ASSYNC	0x00000002	/* asynchronous op */
85 
86 struct mountctl_status_journal {
87 	char	id[JIDMAX];
88 	int	index;
89 };
90 
/*
 * Special negative index values.  The replacement lists are parenthesized
 * so the leading minus sign always stays bound to its constant, whatever
 * operators surround the macro at the point of use.
 */
#define MC_JOURNAL_INDEX_ALL		(-2)
#define MC_JOURNAL_INDEX_ID		(-1)
93 
94 struct mountctl_journal_ret_status {
95 	int	recsize;
96 	char	id[JIDMAX];
97 	int	index;
98 	int	flags;
99 	int64_t	membufsize;
100 	int64_t	membufused;
101 	int64_t	membufiopend;
102 	int64_t	swapbufsize;
103 	int64_t	swapbufused;
104 	int64_t	swapbufiopend;
105 	int64_t transidstart;
106 	int64_t transidcurrent;
107 	int64_t transidiopend;
108 	int64_t transidacked;
109 	int64_t bytessent;
110 	int64_t bytesacked;
111 	struct timeval lastack;
112 };
113 
/*
 * Status flag bit.
 * NOTE(review): presumably reported in mountctl_journal_ret_status.flags
 * when further status records follow -- confirm against the implementation.
 */
#define MC_JOURNAL_STATUS_MORETOCOME	0x00000001
115 
116 /*
117  * Physical file format (binary)
118  *
119  * All raw records are 128-bit aligned, but all record sizes are actual.
120  * This means that any scanning code must 16-byte-align the recsize field
121  * when calculating skips.  The top level raw record has a header and a
122  * trailer to allow both forwards and backwards scanning of the journal.
123  * The alignment requirement allows the worker thread FIFO reservation
124  * API to operate efficiently, among other things.
125  *
126  * Logical data stream records are usually no larger than the journal's
127  * in-memory FIFO, since the journal's transactional APIs return contiguous
128  * blocks of buffer space and since logical stream records are used to avoid
129  * stalls when concurrent blocking operations are being written to the journal.
130  * Programs can depend on a logical stream record being a 'reasonable' size.
131  *
132  * Multiple logical data streams may operate concurrently in the journal,
133  * reflecting the fact that the system may be executing multiple blocking
134  * operations on the filesystem all at the same time.  These logical data
135  * streams are short-lived transactional entities which use a 13 bit id
136  * plus a transaction start bit, end bit, and abort bit.
137  *
138  * Stream identifiers in the 0x00-0xFF range are special and not used for
139  * normal transactional commands.
140  *
141  * Stream id 0x00 indicates that no other streams should be active at that
142  * point in the journal, which helps the journaling code detect corruption.
143  *
144  * Stream id 0x01 is used for pad.  Pads are used to align data on convenient
145  * boundaries and to deal with dead space.
146  *
147  * Stream id 0x02 indicates a discontinuity in the streamed data and typically
148  * contains information relating to the reason for the discontinuity.
149  * JTYPE_ASSOCIATE and JTYPE_DISASSOCIATE are usually emplaced in stream 0x02.
150  *
151  * Stream id 0x03 may be used to annotate the journal with text comments
152  * via mountctl commands.  This can be extremely useful to note situations
153  * that may help with later recovery or audit operations.
154  *
155  * Stream id 0x04-0x7F are reserved by DragonFly for future protocol expansion.
156  *
157  * Stream id 0x80-0xFF may be used for third-party protocol expansion.
158  *
159  * Stream id's 0x0100-0x1FFF typically represent short-lived transactions
160  * (i.e. an id may be reused once the previous use has completed).  The
161  * journaling system runs through these id's sequentially which means that
162  * the journaling code can handle up to 8192-256 = 7936 simultaneous
163  * transactions at any given moment.
164  *
165  * The sequence number field is context-sensitive.  It is typically used by
166  * a journaling stream to provide an incrementing counter and/or timestamp
167  * so recovery utilities can determine if any data is missing.
168  *
169  * The check word in the trailer may be used to provide an integrity check
170  * on the journaled data.  A value of 0 always means that no check word
171  * has been calculated.
172  *
173  * The journal_rawrecbeg structure MUST be a multiple of 16 bytes.
174  * The journal_rawrecend structure MUST be a multiple of 8 bytes.
175  *
176  * NOTE: PAD RECORD SPECIAL CASE.  Pad records are 16 bytes and have the
177  * rawrecend structure overlaid on the sequence number field of the
178  * rawrecbeg structure.  This is necessary because stream records are
179  * 16 byte aligned, not 24 byte aligned, and dead space is not allowed.
180  * So the pad record must fit into any dead space.
181  */
/*
 * Header of a raw stream record.  Two 16 bit words, one 32 bit word and
 * one 64 bit word, with the record's data following the header.
 */
struct journal_rawrecbeg {
	u_int16_t	begmagic;	/* recovery scan, endianness detection */
	u_int16_t	streamid;	/* start/stop bits and stream identifier */
	int32_t		recsize;	/* stream data block (incls beg & end) */
	int64_t		seqno;		/* sequence number or transaction id */
	/* ADDITIONAL DATA */
};
189 
/*
 * Trailer of a raw stream record.  It repeats the record size so the
 * journal can also be scanned in reverse.
 */
struct journal_rawrecend {
	u_int16_t	endmagic;	/* recovery scan, endianness detection */
	u_int16_t	check;		/* check word or 0 */
	int32_t		recsize;	/* same as rawrecbeg->recsize, for rev scan */
};
195 
196 /*
197  * Constants for stream record magic numbers.    The incomplete magic
198  * number code is used internally by the memory FIFO reservation API
199  * and worker thread, allowing a block of space in the journaling
200  * stream (aka a stream block) to be reserved and then populated without
201  * stalling other threads doing their own reservation and population.
202  */
/* 16 bit magic numbers for the raw record header and trailer. */
#define JREC_BEGMAGIC		0x1234	/* rawrecbeg magic */
#define JREC_ENDMAGIC		0xCDEF	/* rawrecend magic */
#define JREC_INCOMPLETEMAGIC	0xFFFF	/* reserved, not yet populated */
206 
207 /*
208  * Stream ids are 13 bits.  The top 3 bits specify when a new logical
209  * stream is being created, an existing logical stream is being terminated,
210  * or a stream has been aborted.  A single raw stream record will set both
211  * the BEGIN and END bits if the entire transaction is encapsulated in a
212  * single stream record.
/* Control bits held in the top three bits of the 16 bit streamid word. */
#define JREC_STREAMCTL_MASK	0xE000	/* all three control bits */
#define JREC_STREAMCTL_BEGIN	0x8000	/* start a new logical stream */
#define JREC_STREAMCTL_END	0x4000	/* terminate a logical stream */
#define JREC_STREAMCTL_ABORTED	0x2000	/* stream was aborted */
217 
/*
 * Stream identifiers (low 13 bits of the streamid word).  SYNCPT and PAD
 * are defined with the BEGIN and END control bits already or'd in.
 */
#define JREC_STREAMID_MASK	0x1FFF
#define JREC_STREAMID_SYNCPT	(JREC_STREAMCTL_BEGIN|JREC_STREAMCTL_END|0x0000)
#define JREC_STREAMID_PAD	(JREC_STREAMCTL_BEGIN|JREC_STREAMCTL_END|0x0001)
#define JREC_STREAMID_DISCONT	0x0002	/* discontinuity */
#define JREC_STREAMID_ANNOTATE	0x0003	/* annotation */
				/* 0x0004-0x007F reserved by DragonFly */
				/* 0x0080-0x00FF for third party use */
#define JREC_STREAMID_JMIN	0x0100	/* lowest allowed general id */
#define JREC_STREAMID_JMAX	0x2000	/* (one past the highest allowed id) */
227 
/* Reasonable initial reservation size. */
#define JREC_DEFAULTSIZE	64
229 
230 /*
231  * Each logical journaling stream typically represents a transaction...
232  * that is, a VFS operation.  The VFS operation is written out using
233  * sub-records and may contain multiple, possibly nested sub-transactions.
234  * Multiple sub-transactions occur when a VFS operation cannot be represented
235  * by a single command.  This is typically the case when a journal is
236  * configured to be reversible because UNDO sequences almost always have to
237  * be specified in such cases.  For example, if you ftruncate() a file the
238  * journal might have to write out a sequence of WRITE records representing
239  * the lost data, otherwise the journal would not be reversible.
240  * Sub-transactions within a particular stream do not have their own sequence
241  * number field and thus may not be parallelized (the protocol is already
242  * complex enough!).
243  *
244  * In order to support streaming operation with a limited buffer the recsize
245  * field is allowed to be 0 for subrecords with the JMASK_NESTED bit set.
246  * If this case occurs a scanner can determine that the recursion has ended
247  * by detecting a nested subrecord with the JMASK_LAST bit set.  A scanner
248  * may also set the field to the proper value after the fact to make later
249  * operations more efficient.
250  *
251  * Note that this bit must be properly set even if the recsize field is
252  * non-zero.  The recsize must always be properly specified for 'leaf'
253  * subrecords, however in order to allow subsystems to potentially allocate
254  * more data space than they use the protocol allows any 'dead' space to be
255  * filled with JLEAF_PAD records.
256  *
257  * The recsize field may indicate data well past the size of the current
258  * raw stream record.  That is, the scanner may have to glue together
259  * multiple stream records with the same stream id to fully decode the
260  * embedded subrecords.  In particular, a subrecord could very well represent
261  * hundreds of megabytes of data (e.g. if a program were to do a
262  * multi-megabyte write()) and be split up across thousands of raw streaming
263  * records, possibly interlaced with other unrelated streams from other
264  * unrelated processes.
265  *
266  * If a large sub-transaction is aborted the logical stream may be
267  * terminated without writing out all the expected data.  When this occurs
268  * the stream's ending record must also have the JREC_STREAMCTL_ABORTED bit
269  * set.  However, scanners should still be robust enough to detect such
270  * overflows even if the aborted bit is not set and consider them data
271  * corruption.
272  *
273  * Aborts may also occur in the normal course of operations, especially once
274  * the journaling API is integrated into the cache coherency API.  A normal
275  * abort is issued by emplacing a JLEAF_ABORT record within the transaction
276  * being aborted.  Such records must be the last record in the sub-transaction,
277  * so JMASK_LAST is also usually set.  In a transaction with many
278  * sub-transactions only those sub-transactions with an abort record are
279  * aborted, the rest remain valid.  Abort records are considered S.O.P. for
280  * two reasons:  First, limited memory buffer space may make it impossible
281  * to delete the portion of the stream being aborted (the data may have
282  * already been sent to the target).  Second, the journaling code will
283  * eventually be used to support a cache coherency layer which may have to
284  * abort operations as part of the cache coherency protocol.  Note that
285  * subrecord aborts are different from stream record aborts.  Stream record
286  * aborts are considered to be extraordinary situations while subrecord aborts
287  * are S.O.P.
288  */
289 
/*
 * Header of a sub-record embedded in a logical stream.
 *
 * rectype is unsigned: JMASK_NESTED occupies bit 15, so a signed 16 bit
 * member would sign-extend when read back and never compare equal to the
 * JTYPE_* constants (e.g. 0x8011).  The member keeps its 16 bit size and
 * offset, so the record layout is unchanged, and it now matches the
 * unsigned streamid word of struct journal_rawrecbeg.
 */
struct journal_subrecord {
	u_int16_t rectype;	/* 2 control bits, 14 record type bits */
	int16_t reserved;	/* future use */
	int32_t recsize;	/* record size (mandatory if not NESTED) */
	/* ADDITIONAL DATA */
};
296 
/* Control bits: the top two bits of the 16 bit rectype word. */
#define JMASK_NESTED		0x8000	/* data is a nested recursion */
#define JMASK_LAST		0x4000	/* lets a scanner detect the end
					 * of a recursion */

/* Record types 0x0000-0x0005. */
#define JLEAF_PAD		0x0000
#define JLEAF_ABORT		0x0001
#define JTYPE_ASSOCIATE		0x0002
#define JTYPE_DISASSOCIATE	0x0003
#define JTYPE_UNDO		(JMASK_NESTED|0x0004)
#define JTYPE_AUDIT		(JMASK_NESTED|0x0005)

/* Nested record types 0x0010-0x001D, one per operation. */
#define JTYPE_SETATTR		(JMASK_NESTED|0x0010)
#define JTYPE_WRITE		(JMASK_NESTED|0x0011)
#define JTYPE_PUTPAGES		(JMASK_NESTED|0x0012)
#define JTYPE_SETACL		(JMASK_NESTED|0x0013)
#define JTYPE_SETEXTATTR	(JMASK_NESTED|0x0014)
#define JTYPE_CREATE		(JMASK_NESTED|0x0015)
#define JTYPE_MKNOD		(JMASK_NESTED|0x0016)
#define JTYPE_LINK		(JMASK_NESTED|0x0017)
#define JTYPE_SYMLINK		(JMASK_NESTED|0x0018)
#define JTYPE_WHITEOUT		(JMASK_NESTED|0x0019)
#define JTYPE_REMOVE		(JMASK_NESTED|0x001A)
#define JTYPE_MKDIR		(JMASK_NESTED|0x001B)
#define JTYPE_RMDIR		(JMASK_NESTED|0x001C)
#define JTYPE_RENAME		(JMASK_NESTED|0x001D)
321 
322 /*
323  * Low level record types
324  */
325 #define JLEAF_FILEDATA		0x0401
326 #define JLEAF_PATH1		0x0402
327 #define JLEAF_PATH2		0x0403
328 #define JLEAF_PATH3		0x0404
329 #define JLEAF_PATH4		0x0405
330 #define JLEAF_UID		0x0406
331 #define JLEAF_GID		0x0407
332 #define JLEAF_MODES		0x0408
333 #define JLEAF_FFLAGS		0x0409
334 #define JLEAF_PID		0x040A
335 #define JLEAF_PPID		0x040B
336 #define JLEAF_COMM		0x040C
337 #define JLEAF_RESERVED_0D	0x040D
338 #define JLEAF_RESERVED_0E	0x040E
339 #define JLEAF_RESERVED_0F	0x040F
340 #define JLEAF_SYMLINKDATA	0x0410
341 #define JLEAF_SEEKPOS		0x0411
342 #define JLEAF_INUM		0x0412
343 
344 #if defined(_KERNEL) || defined(_KERNEL_STRUCTURES)
345 
346 /*
347  * Support structures for the generic journaling structure
348  */
/*
 * In-memory FIFO: a buffer plus the three indices that track the
 * reader, the acknowledged position and the writer.
 */
struct journal_memfifo {
	int	size;		/* size (power of two) */
	int	mask;		/* index mask (size - 1) */
	int	rindex;		/* stream reader index (track fd writes) */
	int	xindex;		/* last acked / reader restart */
	int	windex;		/* stream writer index */
	char	*membase;	/* memory buffer representing the FIFO */
};
357 
358 /*
359  * Generic journaling structure attached to a mount point.
360  */
361 struct journal {
362 	TAILQ_ENTRY(journal) jentry;
363 	struct file	*fp;
364 	char		id[JIDMAX];
365 	int		flags;		/* journaling flags */
366 	int64_t		transid;
367 	int64_t		total_acked;
368 	struct journal_memfifo fifo;
369 	struct thread	thread;
370 };
371 
372 /*
373  * The jrecord structure is used to build a journaling transaction.  Since
374  * a single journaling transaction might encompass very large buffers it
375  * is possible for multiple transactions to be written out to the FIFO
376  * in parallel and piecemeal.
377  */
/*
 * Working state used while a journaling transaction is being built.
 * NOTE(review): streamid is signed here while journal_rawrecbeg.streamid
 * is unsigned; the control bits (0x8000 and up) do not fit a positive
 * int16_t -- confirm only the low identifier bits are stored here.
 */
struct jrecord {
	struct journal	*jo;		/* journal this record belongs to */

	/* stream-level bookkeeping */
	char		*stream_ptr;
	int		stream_residual;
	int		stream_reserved;
	struct journal_rawrecbeg *rawp;

	/* sub-record bookkeeping */
	struct journal_subrecord *parent;
	struct journal_subrecord *last;
	int16_t		streamid;
	int		pushcount;
	int		pushptrgood;
	int		residual;
	int		residual_align;
};
392 
393 #endif
394