xref: /dragonfly/sys/sys/dmsg.h (revision 7c4f4eee)
1 /*
2  * Copyright (c) 2011-2014 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 
35 #ifndef _SYS_DMSG_H_
36 #define _SYS_DMSG_H_
37 
38 #ifndef _SYS_TYPES_H_
39 #include <sys/types.h>
40 #endif
41 #if defined(_KERNEL) || defined(_KERNEL_STRUCTURES)
42 #ifndef _SYS_MALLOC_H_
43 #include <sys/malloc.h>
44 #endif
45 #ifndef _SYS_TREE_H_
46 #include <sys/tree.h>
47 #endif
48 #ifndef _SYS_THREAD_H_
49 #include <sys/thread.h>
50 #endif
51 #endif
52 #ifndef _SYS_UUID_H_
53 #include <sys/uuid.h>
54 #endif
55 
56 /*
57  * Mesh network protocol structures.
58  *
59  *				CONN PROTOCOL
60  *
 * The mesh is constructed via point-to-point streaming links with varying
 * levels of interconnectedness, forming a graph.  Leaves of the graph are
 * typically kernel devices (xdisk) or VFSs (HAMMER2).  Internal nodes are
 * usually (user level) hammer2 service daemons.
65  *
66  * Upon connecting and after authentication, a LNK_CONN transaction is opened
67  * to configure the link.  The SPAN protocol is then typically run over the
68  * open LNK_CONN transaction.
69  *
70  * Terminating the LNK_CONN transaction terminates everything running over it
71  * (typically open LNK_SPAN transactions), which in turn terminates everything
72  * running over the LNK_SPANs.
73  *
74  *				SPAN PROTOCOL
75  *
76  * The SPAN protocol runs over an open LNK_CONN transaction and is used to
77  * advertise any number of services.  For example, each PFS under a HAMMER2
78  * mount will be advertised as an open LNK_SPAN transaction.
79  *
 * Any network node on the graph running multiple connections is capable
 * of relaying LNK_SPANs from any connection to any other connection.  This
 * is typically done by the user-level hammer2 service daemon, and typically
 * not done by kernel devices or VFSs (though these entities must be able
 * to manage multiple LNK_SPANs since they might advertise or need to talk
 * to multiple services).
86  *
87  * Relaying is not necessarily trivial as it requires internal nodes to
88  * track two open transactions (on the two iocom interfaces) and translate
89  * the msgid and circuit.  In addition, the relay may have to track multiple
90  * SPANs from the same iocom or from multiple iocoms which represent the same
91  * end-point and must select the best end-point, must send notifications when
92  * a better path is available, and must allow (when connectivity is still
93  * present) any existing, open, stacked sub-transactions to complete before
94  * terminating the less efficient SPAN.
95  *
96  * Relaying is optional.  It is perfectly acceptable for the hammer2 service
97  * to plug a received socket descriptor directly into the appropriate kernel
98  * device driver.
99  *
100  *			       STACKED TRANSACTIONS
101  *
 * Message transactions can be stacked.  That is, you can initiate a DMSG
 * transaction relative to another open transaction.  Sub-transactions can
 * be initiated without waiting for the parent transaction to complete its
 * handshake.
106  *
107  * This is done by entering the open transaction's msgid as the circuit field
108  * in the new transaction (typically by populating msg->parent).  The
109  * transaction tracking structure will be referenced and will track the
110  * sub-transaction.  Note that msgids must still be unique on an
111  * iocom-by-iocom basis.
112  *
113  * Messages can race closing circuits.  When a circuit is lost,
114  * messages are simulated to delete any sub-transactions.
115  *
116  *			    MESSAGE TRANSACTIONAL STATES
117  *
118  * Message transactions are handled by the CREATE, DELETE, REPLY, ABORT, and
119  * CREPLY flags.  Message state is typically recorded at the end points and
120  * will be maintained (preventing reuse of the transaction id) until a DELETE
121  * is both sent and received.
122  *
123  * One-way messages such as those used for debug commands are not recorded
124  * and do not require any transactional state.  These are sent without
125  * the CREATE, DELETE, or ABORT flags set.  ABORT is not supported for
126  * one-off messages.  The REPLY bit can be used to distinguish between
127  * command and status if desired.
128  *
129  * Transactional messages are messages which require a reply to be
130  * returned.  These messages can also consist of multiple message elements
131  * for the command or reply or both (or neither).  The command message
132  * sequence sets CREATE on the first message and DELETE on the last message.
133  * A single message command sets both (CREATE|DELETE).  The reply message
134  * sequence works the same way but of course also sets the REPLY bit.
135  *
 * Transactional messages can be aborted by sending a message element
 * with the ABORT flag set.  This flag can be combined with either or both
 * the CREATE and DELETE flags.  When combined with the CREATE flag the
 * command is treated as non-blocking but still executes.  When combined
 * with the DELETE flag no additional message elements are required.
141  *
142  * Transactions are terminated by sending a message with DELETE set.
143  * Transactions must be CREATEd and DELETEd in both directions.  If a
144  * transaction is governing stacked sub-transactions the sub-transactions
145  * are automatically terminated before the governing transaction is terminated.
146  * Terminates are handled by simulating a received DELETE and expecting the
147  * normal function callback and state machine to (ultimately) issue a
148  * terminating (DELETE) response.
149  *
150  * Transactions can operate in full-duplex as both sides are fully open
151  * (i.e. CREATE sent, CREATE|REPLY returned, DELETE not sent by anyone).
152  * Additional commands can be initiated from either side of the transaction.
153  *
 * ABORT SPECIAL CASE - Mid-stream aborts.  A mid-stream abort can be sent
 * when supported by the sender by sending an ABORT message with neither
 * CREATE nor DELETE set.  This effectively turns the message into a
 * non-blocking message (but depending on what is being represented can also
 * cut short prior data elements in the stream).
159  *
160  * ABORT SPECIAL CASE - Abort-after-DELETE.  Transactional messages have to be
161  * abortable if the stream/pipe/whatever is lost.  In this situation any
162  * forwarding relay needs to unconditionally abort commands and replies that
163  * are still active.  This is done by sending an ABORT|DELETE even in
164  * situations where a DELETE has already been sent in that direction.  This
165  * is done, for example, when links are in a half-closed state.  In this
166  * situation it is possible for the abort request to race a transition to the
167  * fully closed state.  ABORT|DELETE messages which race the fully closed
168  * state are expected to be discarded by the other end.
169  *
170  * --
171  *
172  * All base and extended message headers are 64-byte aligned, and all
173  * transports must support extended message headers up to DMSG_HDR_MAX.
174  * Currently we allow extended message headers up to 2048 bytes.  Note
175  * that the extended header size is encoded in the 'cmd' field of the header.
176  *
177  * Any in-band data is padded to a 64-byte alignment and placed directly
178  * after the extended header (after the higher-level cmd/rep structure).
179  * The actual unaligned size of the in-band data is encoded in the aux_bytes
180  * field in this case.  Maximum data sizes are negotiated during registration.
181  *
 * Auxiliary data can be in-band or out-of-band.  In-band data sets aux_descr
 * equal to 0.  Any out-of-band data must be negotiated by the SPAN protocol.
 *
 * Auxiliary data, whether in-band or out-of-band, must be at least 64-byte
 * aligned.  The aux_bytes field contains the actual byte-granular length
 * and not the aligned length.  The crc is against the aligned length (so
 * a faster crc algorithm can be used, theoretically).
189  *
190  * hdr_crc is calculated over the entire, ALIGNED extended header.  For
191  * the purposes of calculating the crc, the hdr_crc field is 0.  That is,
192  * if calculating the crc in HW a 32-bit '0' must be inserted in place of
193  * the hdr_crc field when reading the entire header and compared at the
194  * end (but the actual hdr_crc must be left intact in memory).  A simple
195  * counter to replace the field going into the CRC generator does the job
196  * in HW.  The CRC endian is based on the magic number field and may have
197  * to be byte-swapped, too (which is also easy to do in HW).
198  *
 * aux_crc is calculated over the entire, ALIGNED auxiliary data.
200  *
201  *			SHARED MEMORY IMPLEMENTATIONS
202  *
 * Shared-memory implementations typically use a pipe to transmit the extended
 * message header and shared memory to store any auxiliary data.  Auxiliary
 * data in one-way (non-transactional) messages is typically required to be
 * inline.  CRCs are still recommended and required at the beginning, but
 * may be negotiated away later.
208  */
209 
/*
 * Force NUL-termination of a fixed-size character array by zeroing its
 * final element.  Any earlier terminator already in the array is left alone.
 *
 * NOTE: (ary) must be a true array and not a pointer; sizeof() applied to
 *	 a pointer yields the pointer size rather than the buffer length.
 */
#define DMSG_TERMINATE_STRING(ary)	\
	do { (ary)[sizeof(ary) - 1] = 0; } while (0)
212 
/*
 * Base message header.
 *
 * dmsg_hdr must be 64 bytes.  The two hex digits that lead each field
 * comment are that field's byte offset from the start of the header.
 */
struct dmsg_hdr {
	uint16_t	magic;		/* 00 sanity, synchro, endian */
	uint16_t	reserved02;	/* 02 */
	uint32_t	salt;		/* 04 random salt helps w/crypto */

	uint64_t	msgid;		/* 08 message transaction id */
	uint64_t	circuit;	/* 10 circuit id or 0	*/
	uint64_t	reserved18;	/* 18 */

	uint32_t	cmd;		/* 20 flags | cmd | hdr_size / ALIGN */
	uint32_t	aux_crc;	/* 24 auxiliary data crc */
	uint32_t	aux_bytes;	/* 28 auxiliary data length (bytes) */
	uint32_t	error;		/* 2C error code or 0 */
	uint64_t	aux_descr;	/* 30 negotiated OOB data descr */
	uint32_t	reserved38;	/* 38 */
	uint32_t	hdr_crc;	/* 3C (aligned) extended header crc */
};

typedef struct dmsg_hdr dmsg_hdr_t;

/*
 * DMSG_HDR_MAGIC_REV is DMSG_HDR_MAGIC with its two bytes swapped
 * (presumably what a peer of the opposite endianness observes).
 *
 * DMSG_HDR_CRCOFF and DMSG_HDR_CRCBYTES describe the part of the base
 * header that begins at the salt field and runs to the end of the
 * structure, i.e. everything except magic and reserved02.
 */
#define DMSG_HDR_MAGIC		0x4832
#define DMSG_HDR_MAGIC_REV	0x3248
#define DMSG_HDR_CRCOFF		offsetof(dmsg_hdr_t, salt)
#define DMSG_HDR_CRCBYTES	(sizeof(dmsg_hdr_t) - DMSG_HDR_CRCOFF)
240 
/*
 * Administrative protocol limits.
 *
 * NOTE: A dmsg header must completely fit in the (fifo) buffer, but
 *	 dmsg aux data does not have to completely fit.  The dmsg
 *	 structure allows headers up to 255*64 = 16320 bytes.  There
 *	 is no real limit on the aux_data other than what we deem
 *	 reasonable and defensible (i.e. not run processes or the
 *	 kernel out of memory).  But it should be able to handle at
 *	 least MAXPHYS bytes which is typically 128KB or 256KB.
 *
 * NOTE: DMSG_BUF_MASK is only usable as an index mask while
 *	 DMSG_BUF_SIZE is a power of 2 (currently 4 * 2048 = 8192).
 */
#define DMSG_HDR_MAX		2048		/* <= 8192 */
#define DMSG_AUX_MAX		(1024*1024)	/* <= 1MB */
#define DMSG_BUF_SIZE		(DMSG_HDR_MAX * 4)
#define DMSG_BUF_MASK		(DMSG_BUF_SIZE - 1)
256 
/*
 * The message (cmd) field also encodes various flags and the total size
 * of the message header.  This allows the protocol processors to validate
 * persistency and structural settings for every command simply by
 * switch()ing on the (cmd) field.
 *
 * Layout of the 32-bit cmd field:
 *
 *	0xFF000000	flags			(DMSGF_FLAGS)
 *	0x00F00000	protocol		(DMSGF_PROTOS)
 *	0x000FFF00	command code		(DMSGF_CMDS)
 *	0x000000FF	header size in units of DMSG_ALIGN (64 bytes)
 *						(DMSGF_SIZE)
 */
#define DMSGF_CREATE		0x80000000U	/* msg start */
#define DMSGF_DELETE		0x40000000U	/* msg end */
#define DMSGF_REPLY		0x20000000U	/* reply path */
#define DMSGF_ABORT		0x10000000U	/* abort req */
#define DMSGF_REVTRANS		0x08000000U	/* opposite direction msgid */
#define DMSGF_REVCIRC		0x04000000U	/* opposite direction circuit */
#define DMSGF_FLAG1		0x02000000U
#define DMSGF_FLAG0		0x01000000U

#define DMSGF_FLAGS		0xFF000000U	/* all flags */
#define DMSGF_PROTOS		0x00F00000U	/* all protos */
#define DMSGF_CMDS		0x000FFF00U	/* all cmds */
#define DMSGF_SIZE		0x000000FFU	/* N*64 (N*DMSG_ALIGN) */

/*
 * XXX Future, flag that an in-line (not part of a CREATE/DELETE) command
 *     expects some sort of acknowledgement.  Allows protocol mismatches to
 *     be detected.
 *
 * NOTE: This bit lies inside DMSGF_CMDS; it is the top bit of the 12-bit
 *	 command code.
 */
#define DMSGF_CMDF_EXPECT_ACK	0x00080000U	/* in-line cmd expects ack */

/*
 * Composite masks.
 *
 * CMDSWMASK	- cmd + size + proto + REPLY
 * BASECMDMASK	- cmd + size + proto
 * TRANSMASK	- cmd + size + proto + REPLY + CREATE + DELETE
 * BASEFLAGS	- CREATE + DELETE + REPLY
 */
#define DMSGF_CMDSWMASK		(DMSGF_CMDS |	\
					 DMSGF_SIZE |	\
					 DMSGF_PROTOS |	\
					 DMSGF_REPLY)

#define DMSGF_BASECMDMASK	(DMSGF_CMDS |	\
					 DMSGF_SIZE |	\
					 DMSGF_PROTOS)

#define DMSGF_TRANSMASK		(DMSGF_CMDS |	\
					 DMSGF_SIZE |	\
					 DMSGF_PROTOS |	\
					 DMSGF_REPLY |	\
					 DMSGF_CREATE |	\
					 DMSGF_DELETE)

#define DMSGF_BASEFLAGS		(DMSGF_CREATE | DMSGF_DELETE | DMSGF_REPLY)
301 
/*
 * Protocol selectors, occupying the DMSGF_PROTOS (0x00F00000) nibble of
 * the cmd field.
 */
#define DMSG_PROTO_LNK		0x00000000U
#define DMSG_PROTO_DBG		0x00100000U
#define DMSG_PROTO_HM2		0x00200000U
#define DMSG_PROTO_XX3		0x00300000U
#define DMSG_PROTO_XX4		0x00400000U
#define DMSG_PROTO_BLK		0x00500000U
#define DMSG_PROTO_VOP		0x00600000U

/*
 * Message command constructors, sans flags
 *
 * DMSG_DOALIGN(bytes)	 - round bytes up to the next multiple of DMSG_ALIGN.
 * DMSG_HDR_ENCODE(elm)	 - sizeof(struct elm) expressed in DMSG_ALIGN units,
 *			   rounded up.  (elm) is a struct tag, not a typedef.
 * DMSG_<proto>(cmd,elm) - proto | (cmd << 8) | encoded size of struct elm.
 */
#define DMSG_ALIGN		64
#define DMSG_ALIGNMASK		(DMSG_ALIGN - 1)
#define DMSG_DOALIGN(bytes)	(((bytes) + DMSG_ALIGNMASK) &		\
				 ~DMSG_ALIGNMASK)

#define DMSG_HDR_ENCODE(elm)	(((uint32_t)sizeof(struct elm) +	\
				  DMSG_ALIGNMASK) /			\
				 DMSG_ALIGN)

#define DMSG_LNK(cmd, elm)	(DMSG_PROTO_LNK |			\
					 ((cmd) << 8) | 		\
					 DMSG_HDR_ENCODE(elm))

#define DMSG_DBG(cmd, elm)	(DMSG_PROTO_DBG |			\
					 ((cmd) << 8) | 		\
					 DMSG_HDR_ENCODE(elm))

#define DMSG_HM2(cmd, elm)	(DMSG_PROTO_HM2 |			\
					 ((cmd) << 8) | 		\
					 DMSG_HDR_ENCODE(elm))

#define DMSG_BLK(cmd, elm)	(DMSG_PROTO_BLK |			\
					 ((cmd) << 8) | 		\
					 DMSG_HDR_ENCODE(elm))

#define DMSG_VOP(cmd, elm)	(DMSG_PROTO_VOP |			\
					 ((cmd) << 8) | 		\
					 DMSG_HDR_ENCODE(elm))
341 
/*
 * Link layer ops basically talk to just the other side of a direct
 * connection.
 *
 * LNK_PAD	- One-way message on circuit 0, ignored by target.  Used to
 *		  pad message buffers on shared-memory transports.  Not
 *		  typically used with TCP.
 *
 * LNK_PING	- One-way message on circuit-0, keep-alive, run by both sides
 *		  typically 1/sec on idle link, link is lost after 10 seconds
 *		  of inactivity.
 *
 * LNK_AUTH	- Authenticate the connection, negotiate administrative
 *		  rights & encryption, protocol class, etc.  Only PAD and
 *		  AUTH messages (not even PING) are accepted until
 *		  authentication is complete.  This message also identifies
 *		  the host.
 *
 * LNK_CONN	- Enable the SPAN protocol on circuit-0, possibly also
 *		  installing a PFS filter (by cluster id, unique id, and/or
 *		  wildcarded name).
 *
 * LNK_SPAN	- A SPAN transaction typically on iocom->state0 enables
 *		  messages to be relayed to/from a particular cluster node.
 *		  SPANs are received, sorted, aggregated, filtered, and
 *		  retransmitted back out across all applicable connections.
 *
 *		  The leaf protocol also uses this to make a PFS available
 *		  to the cluster (e.g. on-mount).
 *
 * LNK_ERROR	- Carries only the base header.  The BLK protocol notes
 *		  elsewhere in this file state that it is returned for
 *		  simple results.
 */
#define DMSG_LNK_PAD		DMSG_LNK(0x000, dmsg_hdr)
#define DMSG_LNK_PING		DMSG_LNK(0x001, dmsg_hdr)
#define DMSG_LNK_AUTH		DMSG_LNK(0x010, dmsg_lnk_auth)
#define DMSG_LNK_CONN		DMSG_LNK(0x011, dmsg_lnk_conn)
#define DMSG_LNK_SPAN		DMSG_LNK(0x012, dmsg_lnk_span)
#define DMSG_LNK_ERROR		DMSG_LNK(0xFFF, dmsg_hdr)

/*
 * Reserved command codes for third party subsystems.  Structure size is
 * not known here so do not try to construct the full DMSG_LNK_ define.
 */
#define DMSG_LNK_CMD_HAMMER2_VOLCONF	0x20

#define DMSG_LABEL_SIZE		128	/* fixed at 128, do not change */
386 
387 /*
388  * LNK_AUTH - Authentication (often omitted)
389  */
390 struct dmsg_lnk_auth {
391 	dmsg_hdr_t	head;
392 	char		dummy[64];
393 };
394 
395 /*
396  * LNK_CONN - Register connection info for SPAN protocol
397  *	      (transaction, left open, iocom->state0 only).
398  *
399  * LNK_CONN identifies a streaming connection into the cluster.
400  *
401  * peer_mask serves to filter the SPANs we receive by peer_type.  A cluster
402  * controller typically sets this to (uint64_t)-1, indicating that it wants
403  * everything.  A block devfs interface might set it to 1 << DMSG_PEER_DISK,
404  * and a hammer2 mount might set it to 1 << DMSG_PEER_HAMMER2.
405  *
406  * media_iud allows multiple (e.g. HAMMER2) connections belonging to the same
407  * media to transmit duplicative LNK_VOLCONF updates without causing confusion
408  * in the cluster controller.
409  *
410  * pfs_clid, pfs_fsid, pfs_type, and label are peer-specific and must be
411  * left empty (zero-fill) if not supported by a particular peer.
412  */
413 struct dmsg_lnk_conn {
414 	dmsg_hdr_t	head;
415 	uuid_t		media_id;	/* media configuration id */
416 	uuid_t		peer_id;	/* unique peer uuid */
417 	uuid_t		reserved01;
418 	uint64_t	peer_mask;	/* PEER mask for SPAN filtering */
419 	uint8_t		peer_type;	/* see DMSG_PEER_xxx */
420 	uint8_t		reserved02;
421 	uint16_t	proto_version;	/* high level protocol support */
422 	uint32_t	status;		/* status flags */
423 	uint32_t	rnss;		/* node's generated rnss */
424 	uint8_t		reserved03[8];
425 	uint32_t	reserved04[14];
426 	char		peer_label[DMSG_LABEL_SIZE]; /* peer identity string */
427 };
428 
429 typedef struct dmsg_lnk_conn dmsg_lnk_conn_t;
430 
/*
 * PEER types 0-63 are defined here.  There is a limit of 64 types due to
 * the width of peer_mask (one bit per peer type in a 64-bit mask).
 *
 * PFS types depend on the peer type.  sys/dmsg.h only defines the default.
 * peer-specific headers define PFS types for any given peer.
 */
#define DMSG_PEER_NONE			0
#define DMSG_PEER_ROUTER		1	/* server: cluster controller */
#define DMSG_PEER_BLOCK			2	/* server: block devices */
#define DMSG_PEER_HAMMER2		3	/* server: h2 mounted volume */
#define DMSG_PEER_CLIENT		63	/* a client connection */
#define DMSG_PEER_MAX			64

#define DMSG_PFSTYPE_DEFAULT		0
#define DMSG_PFSTYPE_MASK		0x0F
447 
/*
 * Structures embedded in LNK_SPAN
 *
 * dmsg_media_block is the block-media member of the media union in
 * struct dmsg_lnk_span.  It is 16 bytes.
 */
struct dmsg_media_block {
	uint64_t	bytes;		/* media size in bytes */
	uint32_t	blksize;	/* media block size */
	uint32_t	reserved01;
};

typedef struct dmsg_media_block dmsg_media_block_t;
458 
459 /*
460  * LNK_SPAN - Initiate or relay a SPAN
461  *	      (transaction, left open, typically only on iocom->state0)
462  *
463  * This message registers an end-point with the other end of the connection,
464  * telling the other end who we are and what we can provide or intend to
465  * consume.  Multiple registrations can be maintained as open transactions
466  * with each one specifying a unique end-point.
467  *
468  * Registrations are sent from {source}=S {1...n} to {target}=0 and maintained
469  * as open transactions.  Registrations are also received and maintains as
470  * open transactions, creating a matrix of linkid's.
471  *
472  * While these transactions are open additional transactions can be executed
473  * between any two linkid's {source}=S (registrations we sent) to {target}=T
474  * (registrations we received).
475  *
476  * Closure of any registration transaction will automatically abort any open
477  * transactions using the related linkids.  Closure can be initiated
478  * voluntarily from either side with either end issuing a DELETE, or they
479  * can be ABORTed.
480  *
481  * Status updates are performed via the open transaction.
482  *
483  * --
484  *
485  * A registration identifies a node and its various PFS parameters including
486  * the PFS_TYPE.  For example, a diskless HAMMER2 client typically identifies
487  * itself as PFSTYPE_CLIENT.
488  *
489  * Any node may serve as a cluster controller, aggregating and passing
490  * on received registrations, but end-points do not have to implement this
491  * ability.  Most end-points typically implement a single client-style or
492  * server-style PFS_TYPE and rendezvous at a cluster controller.
493  *
494  * The cluster controller does not aggregate/pass-on all received
495  * registrations.  It typically filters what gets passed on based on what it
496  * receives, passing on only the best candidates.
497  *
498  * If a symmetric spanning tree is desired additional candidates whos
499  * {dist, rnss} fields match the last best candidate must also be propagated.
500  * This feature is not currently enabled.
501  *
502  * STATUS UPDATES: Status updates use the same structure but typically
503  *		   only contain incremental changes to e.g. pfs_type, with
504  *		   a text description sent as out-of-band data.
505  */
506 struct dmsg_lnk_span {
507 	dmsg_hdr_t	head;
508 	uuid_t		peer_id;
509 	uuid_t		pfs_id;		/* unique pfs id */
510 	uint8_t		pfs_type;	/* PFS type */
511 	uint8_t		peer_type;	/* PEER type */
512 	uint16_t	proto_version;	/* high level protocol support */
513 	uint32_t	status;		/* status flags */
514 	uint8_t		reserved02[8];
515 	uint32_t	dist;		/* span distance */
516 	uint32_t	rnss;		/* random number sub-sort */
517 	union {
518 		uint32_t	reserved03[14];
519 		dmsg_media_block_t block;
520 	} media;
521 
522 	/*
523 	 * NOTE: for PEER_HAMMER2 cl_label is typically empty and fs_label
524 	 *	 is the superroot directory name.
525 	 *
526 	 *	 for PEER_BLOCK cl_label is typically host/device and
527 	 *	 fs_label is typically the serial number string.
528 	 */
529 	char		peer_label[DMSG_LABEL_SIZE];	/* peer label */
530 	char		pfs_label[DMSG_LABEL_SIZE];	/* PFS label */
531 };
532 
533 typedef struct dmsg_lnk_span dmsg_lnk_span_t;
534 
535 #define DMSG_SPAN_PROTO_1	1
536 
537 /*
538  * Debug layer ops operate on any link
539  *
540  * SHELL	- Persist stream, access the debug shell on the target
541  *		  registration.  Multiple shells can be operational.
542  */
543 #define DMSG_DBG_SHELL		DMSG_DBG(0x001, dmsg_dbg_shell)
544 
545 struct dmsg_dbg_shell {
546 	dmsg_hdr_t	head;
547 };
548 typedef struct dmsg_dbg_shell dmsg_dbg_shell_t;
549 
/*
 * Hammer2 layer ops (low-level chain manipulation used by cluster code)
 *
 * HM2_OPENPFS	- Attach a PFS
 * HM2_FLUSHPFS - Flush a PFS
 *
 * HM2_LOOKUP	- Lookup chain (parent-relative transaction)
 *		  (can request multiple chains)
 * HM2_NEXT	- Lookup next chain (parent-relative transaction)
 *		  (can request multiple chains)
 * HM2_LOCK	- [Re]lock a chain (chain-relative) (non-recursive)
 * HM2_UNLOCK	- Unlock a chain (chain-relative) (non-recursive)
 * HM2_RESIZE	- Resize a chain (chain-relative)
 * HM2_MODIFY	- Modify a chain (chain-relative)
 * HM2_CREATE	- Create a chain (parent-relative)
 * HM2_DUPLICATE- Duplicate a chain (target-parent-relative)
 * HM2_DELDUP	- Delete-Duplicate a chain (chain-relative)
 * HM2_DELETE	- Delete a chain (chain-relative)
 * HM2_SNAPSHOT	- Create a snapshot (snapshot-root-relative, w/clid override)
 *
 * NOTE: Only DMSG_HM2_OPENPFS is actually defined below.  struct
 *	 dmsg_hm2_openpfs is not declared in this header, so whoever
 *	 expands the macro must supply that structure.
 */
#define DMSG_HM2_OPENPFS	DMSG_HM2(0x001, dmsg_hm2_openpfs)
571 
/*
 * DMSG_PROTO_BLK Protocol
 *
 * BLK_OPEN	- Open device.  This transaction must be left open for the
 *		  duration and the returned keyid passed in all associated
 *		  BLK commands.  Multiple OPENs can be issued within the
 *		  transaction.
 *
 * BLK_CLOSE	- Close device.  This can be used to close one of the opens
 *		  within a BLK_OPEN transaction.  It may NOT initiate a
 *		  transaction.  Note that a termination of the transaction
 *		  (e.g. with LNK_ERROR or BLK_ERROR) closes all active OPENs
 *		  for that transaction.  XXX not well defined atm.
 *
 * BLK_READ	- Strategy read.  Not typically streaming.
 *
 * BLK_WRITE	- Strategy write.  Not typically streaming.
 *
 * BLK_FLUSH	- Strategy flush.  Not typically streaming.
 *
 * BLK_FREEBLKS	- Strategy freeblks.  Not typically streaming.
 *
 * NOTE: BLK_CLOSE is sized from struct dmsg_blk_open; there is no
 *	 separate close structure.
 */
#define DMSG_BLK_OPEN		DMSG_BLK(0x001, dmsg_blk_open)
#define DMSG_BLK_CLOSE		DMSG_BLK(0x002, dmsg_blk_open)
#define DMSG_BLK_READ		DMSG_BLK(0x003, dmsg_blk_read)
#define DMSG_BLK_WRITE		DMSG_BLK(0x004, dmsg_blk_write)
#define DMSG_BLK_FLUSH		DMSG_BLK(0x005, dmsg_blk_flush)
#define DMSG_BLK_FREEBLKS	DMSG_BLK(0x006, dmsg_blk_freeblks)
#define DMSG_BLK_ERROR		DMSG_BLK(0xFFF, dmsg_blk_error)
601 
602 struct dmsg_blk_open {
603 	dmsg_hdr_t	head;
604 	uint32_t	modes;
605 	uint32_t	reserved01;
606 };
607 
608 #define DMSG_BLKOPEN_RD		0x0001
609 #define DMSG_BLKOPEN_WR		0x0002
610 
611 /*
612  * DMSG_LNK_ERROR is returned for simple results,
613  * DMSG_BLK_ERROR is returned for extended results.
614  */
615 struct dmsg_blk_error {
616 	dmsg_hdr_t	head;
617 	uint64_t	keyid;
618 	uint32_t	resid;
619 	uint32_t	reserved02;
620 	char		buf[64];
621 };
622 
623 struct dmsg_blk_read {
624 	dmsg_hdr_t	head;
625 	uint64_t	keyid;
626 	uint64_t	offset;
627 	uint32_t	bytes;
628 	uint32_t	flags;
629 	uint32_t	reserved01;
630 	uint32_t	reserved02;
631 };
632 
633 struct dmsg_blk_write {
634 	dmsg_hdr_t	head;
635 	uint64_t	keyid;
636 	uint64_t	offset;
637 	uint32_t	bytes;
638 	uint32_t	flags;
639 	uint32_t	reserved01;
640 	uint32_t	reserved02;
641 };
642 
643 struct dmsg_blk_flush {
644 	dmsg_hdr_t	head;
645 	uint64_t	keyid;
646 	uint64_t	offset;
647 	uint32_t	bytes;
648 	uint32_t	flags;
649 	uint32_t	reserved01;
650 	uint32_t	reserved02;
651 };
652 
653 struct dmsg_blk_freeblks {
654 	dmsg_hdr_t	head;
655 	uint64_t	keyid;
656 	uint64_t	offset;
657 	uint32_t	bytes;
658 	uint32_t	flags;
659 	uint32_t	reserved01;
660 	uint32_t	reserved02;
661 };
662 
/*
 * Convenience typedefs for the BLK protocol message structures.
 */
typedef struct dmsg_blk_open		dmsg_blk_open_t;
typedef struct dmsg_blk_read		dmsg_blk_read_t;
typedef struct dmsg_blk_write		dmsg_blk_write_t;
typedef struct dmsg_blk_flush		dmsg_blk_flush_t;
typedef struct dmsg_blk_freeblks	dmsg_blk_freeblks_t;
typedef struct dmsg_blk_error		dmsg_blk_error_t;
669 
/*
 * NOTE!!!! ALL EXTENDED HEADER STRUCTURES MUST BE 64-BYTE ALIGNED!!!
 *
 * General message errors
 *
 *	0x00 - 0x1F	Local iocom errors
 *	0x20 - 0x2F	Global errors
 *
 * Only global errors are defined in this header.
 */
#define DMSG_ERR_NOSUPP		0x20
#define DMSG_ERR_LOSTLINK	0x21
#define DMSG_ERR_IO		0x22	/* generic */
#define DMSG_ERR_PARAM		0x23	/* generic */
#define DMSG_ERR_CANTCIRC	0x24	/* (typically means lost span) */
683 
684 union dmsg_any {
685 	char			buf[DMSG_HDR_MAX];
686 	dmsg_hdr_t		head;
687 
688 	dmsg_lnk_conn_t		lnk_conn;
689 	dmsg_lnk_span_t		lnk_span;
690 
691 	dmsg_blk_open_t		blk_open;
692 	dmsg_blk_error_t	blk_error;
693 	dmsg_blk_read_t		blk_read;
694 	dmsg_blk_write_t	blk_write;
695 	dmsg_blk_flush_t	blk_flush;
696 	dmsg_blk_freeblks_t	blk_freeblks;
697 };
698 
699 typedef union dmsg_any dmsg_any_t;
700 
701 /*
702  * Kernel iocom structures and prototypes for kern/kern_dmsg.c
703  */
704 #if defined(_KERNEL) || defined(_KERNEL_STRUCTURES)
705 
/*
 * Forward declarations; only pointers to these types are needed before
 * (or instead of) their full definitions.
 */
struct hammer2_mount;
struct xa_softc;
struct kdmsg_iocom;
struct kdmsg_state;
struct kdmsg_msg;
struct kdmsg_data;
712 
/*
 * msg_ctl flags (atomic)
 *
 * Bit flags stored in kdmsg_iocom.msg_ctl.
 */
#define KDMSG_CLUSTERCTL_UNUSED01	0x00000001
#define KDMSG_CLUSTERCTL_KILLRX		0x00000002 /* staged helper exit */
#define KDMSG_CLUSTERCTL_KILLTX		0x00000004 /* staged helper exit */
#define KDMSG_CLUSTERCTL_SLEEPING	0x00000008 /* interlocked w/msglk */
720 
721 /*
722  * Transactional state structure, representing an open transaction.  The
723  * transaction might represent a cache state (and thus have a chain
724  * association), or a VOP op, LNK_SPAN, or other things.
725  *
726  * NOTE: A non-empty subq represents one ref.
727  *	 If we are inserted on a parent's subq, that's one ref (SUBINSERTED).
728  *	 If we are inserted on a RB tree, that's one ref (RBINSERTED).
729  *	 msg->state represents a ref.
730  *	 Other code references may hold refs.
731  *
732  * NOTE: The parent association stays intact as long as a state has a
733  *	 non-empty subq.  Otherwise simulated failures might not be able
734  *	 to reach the children.
735  */
736 TAILQ_HEAD(kdmsg_state_list, kdmsg_state);
737 
738 struct kdmsg_state {
739 	RB_ENTRY(kdmsg_state) rbnode;		/* indexed by msgid */
740 	struct kdmsg_state	*scan;		/* scan check */
741 	struct kdmsg_state_list	subq;		/* active stacked states */
742 	TAILQ_ENTRY(kdmsg_state) entry;		/* on parent subq */
743 	TAILQ_ENTRY(kdmsg_state) user_entry;	/* available to devices */
744 	struct kdmsg_iocom *iocom;
745 	struct kdmsg_state *parent;
746 	int		refs;			/* refs */
747 	uint32_t	icmd;			/* record cmd creating state */
748 	uint32_t	txcmd;			/* mostly for CMDF flags */
749 	uint32_t	rxcmd;			/* mostly for CMDF flags */
750 	uint64_t	msgid;			/* {parent,msgid} uniq */
751 	int		flags;
752 	int		error;
753 	void		*chain;			/* (caller's state) */
754 	int (*func)(struct kdmsg_state *, struct kdmsg_msg *);
755 	union {
756 		void *any;
757 		struct hammer2_mount *hmp;
758 		struct xa_softc *xa_sc;
759 	} any;
760 };
761 
762 #define KDMSG_STATE_SUBINSERTED	0x0001
763 #define KDMSG_STATE_DYNAMIC	0x0002
764 #define KDMSG_STATE_UNUSED0004	0x0004
765 #define KDMSG_STATE_ABORTING	0x0008		/* avoids recursive abort */
766 #define KDMSG_STATE_OPPOSITE	0x0010		/* opposite direction */
767 #define KDMSG_STATE_DYING	0x0020		/* atomic recursive circ fail */
768 #define KDMSG_STATE_INTERLOCK	0x0040
769 #define KDMSG_STATE_RBINSERTED	0x0080
770 #define KDMSG_STATE_SIGNAL	0x0400
771 #define KDMSG_STATE_NEW		0x0800		/* defer abort processing */
772 
773 struct kdmsg_msg {
774 	TAILQ_ENTRY(kdmsg_msg) qentry;		/* serialized queue */
775 	struct kdmsg_state *state;
776 	size_t		hdr_size;
777 	size_t		aux_size;
778 	char		*aux_data;
779 	uint32_t	flags;
780 	uint32_t	tcmd;			/* outer transaction cmd */
781 	dmsg_any_t	any;			/* variable sized */
782 };
783 
784 struct kdmsg_data {
785 	char		*aux_data;
786 	size_t		aux_size;
787 	struct kdmsg_iocom *iocom;
788 };
789 
790 #define KDMSG_FLAG_AUXALLOC	0x0001
791 
/*
 * NOTE(review): struct kdmsg_link is not declared anywhere in this header;
 *	 the typedef is kept in case outside code uses it - confirm it is
 *	 still needed.
 */
typedef struct kdmsg_link kdmsg_link_t;
typedef struct kdmsg_state kdmsg_state_t;
typedef struct kdmsg_msg kdmsg_msg_t;
typedef struct kdmsg_data kdmsg_data_t;

/*
 * Red-black tree of kdmsg_state structures, linked through rbnode and
 * ordered by kdmsg_state_cmp().
 */
struct kdmsg_state_tree;
int kdmsg_state_cmp(kdmsg_state_t *state1, kdmsg_state_t *state2);
RB_HEAD(kdmsg_state_tree, kdmsg_state);
RB_PROTOTYPE(kdmsg_state_tree, kdmsg_state, rbnode, kdmsg_state_cmp);
801 
802 /*
803  * Structure embedded in e.g. mount, master control structure for
804  * DMSG stream handling.
805  */
806 struct kdmsg_iocom {
807 	struct malloc_type	*mmsg;
808 	struct file		*msg_fp;	/* cluster pipe->userland */
809 	thread_t		msgrd_td;	/* cluster thread */
810 	thread_t		msgwr_td;	/* cluster thread */
811 	int			msg_ctl;	/* wakeup flags */
812 	int			msg_seq;	/* cluster msg sequence id */
813 	uint32_t		flags;
814 	struct lock		msglk;		/* lockmgr lock */
815 	TAILQ_HEAD(, kdmsg_msg) msgq;		/* transmit queue */
816 	void			*handle;
817 	void			(*auto_callback)(kdmsg_msg_t *);
818 	int			(*rcvmsg)(kdmsg_msg_t *);
819 	void			(*exit_func)(struct kdmsg_iocom *);
820 	struct kdmsg_state	state0;		/* root state for stacking */
821 	struct kdmsg_state	*conn_state;	/* active LNK_CONN state */
822 	struct kdmsg_state	*freerd_state;	/* allocation cache */
823 	struct kdmsg_state	*freewr_state;	/* allocation cache */
824 	struct kdmsg_state_tree staterd_tree;	/* active messages */
825 	struct kdmsg_state_tree statewr_tree;	/* active messages */
826 	dmsg_lnk_conn_t		auto_lnk_conn;
827 	dmsg_lnk_span_t		auto_lnk_span;
828 };
829 
830 typedef struct kdmsg_iocom	kdmsg_iocom_t;
831 
832 #define KDMSG_IOCOMF_AUTOCONN	0x0001	/* handle RX/TX LNK_CONN */
833 #define KDMSG_IOCOMF_AUTORXSPAN	0x0002	/* handle RX LNK_SPAN */
834 #define KDMSG_IOCOMF_AUTOTXSPAN	0x0008	/* handle TX LNK_SPAN */
835 #define KDMSG_IOCOMF_EXITNOACC	0x8000	/* cannot accept writes */
836 
837 #define KDMSG_IOCOMF_AUTOANY	(KDMSG_IOCOMF_AUTOCONN |	\
838 				 KDMSG_IOCOMF_AUTORXSPAN |	\
839 				 KDMSG_IOCOMF_AUTOTXSPAN)
840 
841 uint32_t kdmsg_icrc32(const void *buf, size_t size);
842 uint32_t kdmsg_icrc32c(const void *buf, size_t size, uint32_t crc);
843 
844 /*
845  * kern_dmsg.c
846  */
847 void kdmsg_iocom_init(kdmsg_iocom_t *iocom, void *handle, u_int32_t flags,
848 			struct malloc_type *mmsg,
849 			int (*rcvmsg)(kdmsg_msg_t *msg));
850 void kdmsg_iocom_reconnect(kdmsg_iocom_t *iocom, struct file *fp,
851 			const char *subsysname);
852 void kdmsg_iocom_autoinitiate(kdmsg_iocom_t *iocom,
853 			void (*conn_callback)(kdmsg_msg_t *msg));
854 void kdmsg_iocom_uninit(kdmsg_iocom_t *iocom);
855 void kdmsg_drain_msgq(kdmsg_iocom_t *iocom);
856 
857 void kdmsg_msg_free(kdmsg_msg_t *msg);
858 kdmsg_msg_t *kdmsg_msg_alloc(kdmsg_state_t *state, uint32_t cmd,
859 				int (*func)(kdmsg_state_t *, kdmsg_msg_t *),
860 				void *data);
861 void kdmsg_msg_write(kdmsg_msg_t *msg);
862 void kdmsg_msg_reply(kdmsg_msg_t *msg, uint32_t error);
863 void kdmsg_msg_result(kdmsg_msg_t *msg, uint32_t error);
864 void kdmsg_state_reply(kdmsg_state_t *state, uint32_t error);
865 void kdmsg_state_result(kdmsg_state_t *state, uint32_t error);
866 void kdmsg_detach_aux_data(kdmsg_msg_t *msg, kdmsg_data_t *data);
867 void kdmsg_free_aux_data(kdmsg_data_t *data);
868 
869 #endif	/* _KERNEL || _KERNEL_STRUCTURES */
870 
871 #endif	/* !_SYS_DMSG_H_ */
872