xref: /dragonfly/sys/sys/dmsg.h (revision 926deccb)
1 /*
2  * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 
35 #ifndef _SYS_DMSG_H_
36 #define _SYS_DMSG_H_
37 
38 #ifndef _SYS_MALLOC_H_
39 #include <sys/malloc.h>
40 #endif
41 #ifndef _SYS_TREE_H_
42 #include <sys/tree.h>
43 #endif
44 #ifndef _SYS_THREAD_H_
45 #include <sys/thread.h>
46 #endif
47 #ifndef _SYS_UUID_H_
48 #include <sys/uuid.h>
49 #endif
50 
51 /*
52  * Mesh network protocol structures.
53  *
54  *				CONN PROTOCOL
55  *
56  * The mesh is constructed from point-to-point streaming links with varying
57  * levels of interconnectedness, forming a graph.  Termini in the graph
58  * are entities such as a HAMMER2 PFS or a network mount or other types
59  * of nodes.
60  *
61  * Upon connecting and after authentication, a LNK_CONN transaction is opened
62  * on circuit 0 by both ends.  This configures and enables the SPAN protocol.
63  * The LNK_CONN transaction remains open for the life of the connection.
64  *
65  *				SPAN PROTOCOL
66  *
67  * Once enabled, each terminus transmits a representative LNK_SPAN out all
68  * available connections advertising what it is.  Nodes maintaining multiple
69  * connections will relay received LNK_SPANs out available connections
70  * with some filtering based on the CONN configuration.  A distance metric
71  * and per-node random value (rnss) are aggregated.
72  *
73  * Since LNK_SPANs can rapidly multiply in a complex graph, not all incoming
74  * LNK_SPANs will be relayed.  Only the top N over all collected LNK_SPANs for
75  * any given advertisement are relayed.
76  *
77  * It is possible to code the SPANning tree algorithm to guarantee that
78  * symmetrical spans will be generated after stabilization.  The RNSS field
79  * is used to help distinguish and reduce paths in complex graphs when
80  * symmetric spans are desired.  We always generate RNSS but we currently do
81  * not implement symmetrical SPAN guarantees.
82  *
83  *				CIRC PROTOCOL
84  *
85  * We aren't done yet.  Before transactions can be relayed, symmetric paths
86  * must be formed via the LNK_CIRC protocol.  The LNK_CIRC protocol
87  * establishes a virtual circuit from any node to any other node, creating
88  * a circuit id which is stored in dmsg_hdr.circuit.  Messages received on
89  * one side are forwarded to the other.  Forwarded messages bypass normal
90  * state tracking.
91  *
92  * A virtual circuit is forged by working the propagated SPANs backwards.
93  * Each node in the graph helps propagate the virtual circuit by attaching the
94  * LNK_CIRC transaction it receives to a LNK_CIRC transaction it initiates
95  * out the other interface.
96  *
97  * Since SPANs are link-state transactions any change in related span(s)
98  * will also force-terminate VC's using those spans.
99  *
100  *			MESSAGE TRANSACTIONAL STATES
101  *
102  * Message state is handled by the CREATE, DELETE, REPLY, and ABORT
103  * flags.  Message state is typically recorded at the end points and
104  * at each hop until a DELETE is received from both sides.
105  *
106  * One-way messages such as those used by spanning tree commands are not
107  * recorded.  These are sent without the CREATE, DELETE, or ABORT flags set.
108  * ABORT is not supported for one-off messages.  The REPLY bit can be used
109  * to distinguish between command and status if desired.
110  *
111  * Persistent-state messages are messages which require a reply to be
112  * returned.  These messages can also consist of multiple message elements
113  * for the command or reply or both (or neither).  The command message
114  * sequence sets CREATE on the first message and DELETE on the last message.
115  * A single message command sets both (CREATE|DELETE).  The reply message
116  * sequence works the same way but of course also sets the REPLY bit.
117  *
118  * Persistent-state messages can be aborted by sending a message element
119  * with the ABORT flag set.  This flag can be combined with either or both
120  * the CREATE and DELETE flags.  When combined with the CREATE flag the
121  * command is treated as non-blocking but still executes.  When combined
122  * with the DELETE flag no additional message elements are required.
123  *
124  * ABORT SPECIAL CASE - Mid-stream aborts.  A mid-stream abort can be sent
125  * when supported by the sender by sending an ABORT message with neither
126  * CREATE or DELETE set.  This effectively turns the message into a
127  * non-blocking message (but depending on what is being represented can also
128  * cut short prior data elements in the stream).
129  *
130  * ABORT SPECIAL CASE - Abort-after-DELETE.  Persistent messages have to be
131  * abortable if the stream/pipe/whatever is lost.  In this situation any
132  * forwarding relay needs to unconditionally abort commands and replies that
133  * are still active.  This is done by sending an ABORT|DELETE even in
134  * situations where a DELETE has already been sent in that direction.  This
135  * is done, for example, when links are in a half-closed state.  In this
136  * situation it is possible for the abort request to race a transition to the
137  * fully closed state.  ABORT|DELETE messages which race the fully closed
138  * state are expected to be discarded by the other end.
139  *
140  * --
141  *
142  * All base and extended message headers are 64-byte aligned, and all
143  * transports must support extended message headers up to DMSG_HDR_MAX.
144  * Currently we allow extended message headers up to 2048 bytes.  Note
145  * that the extended header size is encoded in the 'cmd' field of the header.
146  *
147  * Any in-band data is padded to a 64-byte alignment and placed directly
148  * after the extended header (after the higher-level cmd/rep structure).
149  * The actual unaligned size of the in-band data is encoded in the aux_bytes
150  * field in this case.  Maximum data sizes are negotiated during registration.
151  *
152  * Auxiliary data can be in-band or out-of-band.  In-band data sets aux_descr
153  * equal to 0.  Any out-of-band data must be negotiated by the SPAN protocol.
154  *
155  * Auxiliary data, whether in-band or out-of-band, must be at least 64-byte
156  * aligned.  The aux_bytes field contains the actual byte-granular length
157  * and not the aligned length.  The crc is against the aligned length (so
158  * a faster crc algorithm can be used, theoretically).
159  *
160  * hdr_crc is calculated over the entire, ALIGNED extended header.  For
161  * the purposes of calculating the crc, the hdr_crc field is 0.  That is,
162  * if calculating the crc in HW a 32-bit '0' must be inserted in place of
163  * the hdr_crc field when reading the entire header and compared at the
164  * end (but the actual hdr_crc must be left intact in memory).  A simple
165  * counter to replace the field going into the CRC generator does the job
166  * in HW.  The CRC endian is based on the magic number field and may have
167  * to be byte-swapped, too (which is also easy to do in HW).
168  *
169  * aux_crc is calculated over the entire, ALIGNED auxiliary data.
170  *
171  *			SHARED MEMORY IMPLEMENTATIONS
172  *
173  * Shared-memory implementations typically use a pipe to transmit the extended
174  * message header and shared memory to store any auxiliary data.  Auxiliary
175  * data in one-way (non-transactional) messages is typically required to be
176  * inline.  CRCs are still recommended and required at the beginning, but
177  * may be negotiated away later.
178  */
179 struct dmsg_hdr {
180 	uint16_t	magic;		/* 00 sanity, synchro, endian */
181 	uint16_t	reserved02;	/* 02 reserved */
182 	uint32_t	salt;		/* 04 random salt helps w/crypto */
183 
184 	uint64_t	msgid;		/* 08 message transaction id */
185 	uint64_t	circuit;	/* 10 circuit id or 0	*/
186 	uint64_t	reserved18;	/* 18 reserved */
187 
188 	uint32_t	cmd;		/* 20 flags | proto | cmd | hdr_size / DMSG_ALIGN */
189 	uint32_t	aux_crc;	/* 24 auxiliary data crc */
190 	uint32_t	aux_bytes;	/* 28 auxiliary data length (bytes, unaligned) */
191 	uint32_t	error;		/* 2C error code or 0 */
192 	uint64_t	aux_descr;	/* 30 negotiated OOB data descr, 0 if in-band */
193 	uint32_t	reserved38;	/* 38 reserved */
194 	uint32_t	hdr_crc;	/* 3C (aligned) extended header crc */
195 };
196 
197 typedef struct dmsg_hdr dmsg_hdr_t;
198 
199 #define DMSG_HDR_MAGIC		0x4832	/* expected dmsg_hdr.magic */
200 #define DMSG_HDR_MAGIC_REV	0x3248	/* DMSG_HDR_MAGIC with its two bytes swapped */
201 #define DMSG_HDR_CRCOFF		offsetof(dmsg_hdr_t, salt)	/* byte offset of the salt field */
202 #define DMSG_HDR_CRCBYTES	(sizeof(dmsg_hdr_t) - DMSG_HDR_CRCOFF)	/* bytes from salt to end of base header */
203 
204 /*
205  * Administrative protocol limits.
206  */
207 #define DMSG_HDR_MAX		2048	/* max extended header bytes, <= 65535 */
208 #define DMSG_AUX_MAX		65536	/* <= 1MB */
209 #define DMSG_BUF_SIZE		(DMSG_HDR_MAX * 4)	/* 8192 bytes */
210 #define DMSG_BUF_MASK		(DMSG_BUF_SIZE - 1)	/* valid as a mask: BUF_SIZE is a power of 2 */
211 
212 /*
213  * The message (cmd) field also encodes various flags and the total size
214  * of the message header.  This allows the protocol processors to validate
215  * persistency and structural settings for every command simply by
216  * switch()ing on the (cmd) field.
217  */
218 #define DMSGF_CREATE		0x80000000U	/* msg start (first element of a transaction) */
219 #define DMSGF_DELETE		0x40000000U	/* msg end (last element of a transaction) */
220 #define DMSGF_REPLY		0x20000000U	/* reply path */
221 #define DMSGF_ABORT		0x10000000U	/* abort req */
222 #define DMSGF_AUXOOB		0x08000000U	/* aux-data is OOB */
223 #define DMSGF_FLAG2		0x04000000U	/* not referenced in the visible part of this header */
224 #define DMSGF_FLAG1		0x02000000U	/* not referenced in the visible part of this header */
225 #define DMSGF_FLAG0		0x01000000U	/* not referenced in the visible part of this header */
226 
227 #define DMSGF_FLAGS		0xFF000000U	/* all flags */
228 #define DMSGF_PROTOS		0x00F00000U	/* all protos */
229 #define DMSGF_CMDS		0x000FFF00U	/* all cmds */
230 #define DMSGF_SIZE		0x000000FFU	/* N*DMSG_ALIGN (N*64), see DMSG_HDR_ENCODE */
231 
232 #define DMSGF_CMDSWMASK		(DMSGF_CMDS |	\
233 					 DMSGF_SIZE |	\
234 					 DMSGF_PROTOS |	\
235 					 DMSGF_REPLY)	/* cmd, size, proto and REPLY bits */
236 
237 #define DMSGF_BASECMDMASK	(DMSGF_CMDS |	\
238 					 DMSGF_SIZE |	\
239 					 DMSGF_PROTOS)	/* cmd, size and proto bits, no flags */
240 
241 #define DMSGF_TRANSMASK		(DMSGF_CMDS |	\
242 					 DMSGF_SIZE |	\
243 					 DMSGF_PROTOS |	\
244 					 DMSGF_REPLY |	\
245 					 DMSGF_CREATE |	\
246 					 DMSGF_DELETE)	/* CMDSWMASK plus CREATE and DELETE */
247 
248 #define DMSG_PROTO_LNK		0x00000000U	/* link layer */
249 #define DMSG_PROTO_DBG		0x00100000U	/* debug layer */
250 #define DMSG_PROTO_DOM		0x00200000U	/* domain layer */
251 #define DMSG_PROTO_CAC		0x00300000U	/* cache layer */
252 #define DMSG_PROTO_QRM		0x00400000U	/* quorum layer */
253 #define DMSG_PROTO_BLK		0x00500000U	/* block device protocol */
254 #define DMSG_PROTO_VOP		0x00600000U	/* NOTE(review): no VOP commands visible here -- confirm meaning */
255 
256 /*
257  * Message command constructors, sans flags
258  */
259 #define DMSG_ALIGN		64	/* header and aux data alignment, bytes */
260 #define DMSG_ALIGNMASK		(DMSG_ALIGN - 1)
261 #define DMSG_DOALIGN(bytes)	(((bytes) + DMSG_ALIGNMASK) &		\
262 				 ~DMSG_ALIGNMASK)	/* round bytes up to a multiple of DMSG_ALIGN */
263 
264 #define DMSG_HDR_ENCODE(elm)	(((uint32_t)sizeof(struct elm) +	\
265 				  DMSG_ALIGNMASK) /			\
266 				 DMSG_ALIGN)	/* sizeof(struct elm) in DMSG_ALIGN units, rounded up */
267 
268 #define DMSG_LNK(cmd, elm)	(DMSG_PROTO_LNK |			\
269 					 ((cmd) << 8) | 		\
270 					 DMSG_HDR_ENCODE(elm))	/* proto | cmd << 8 | encoded hdr size */
271 
272 #define DMSG_DBG(cmd, elm)	(DMSG_PROTO_DBG |			\
273 					 ((cmd) << 8) | 		\
274 					 DMSG_HDR_ENCODE(elm))	/* same layout, debug proto */
275 
276 #define DMSG_DOM(cmd, elm)	(DMSG_PROTO_DOM |			\
277 					 ((cmd) << 8) | 		\
278 					 DMSG_HDR_ENCODE(elm))	/* same layout, domain proto */
279 
280 #define DMSG_CAC(cmd, elm)	(DMSG_PROTO_CAC |			\
281 					 ((cmd) << 8) | 		\
282 					 DMSG_HDR_ENCODE(elm))	/* same layout, cache proto */
283 
284 #define DMSG_QRM(cmd, elm)	(DMSG_PROTO_QRM |			\
285 					 ((cmd) << 8) | 		\
286 					 DMSG_HDR_ENCODE(elm))	/* same layout, quorum proto */
287 
288 #define DMSG_BLK(cmd, elm)	(DMSG_PROTO_BLK |			\
289 					 ((cmd) << 8) | 		\
290 					 DMSG_HDR_ENCODE(elm))	/* same layout, block proto */
291 
292 #define DMSG_VOP(cmd, elm)	(DMSG_PROTO_VOP |			\
293 					 ((cmd) << 8) | 		\
294 					 DMSG_HDR_ENCODE(elm))	/* same layout, VOP proto */
295 
296 /*
297  * Link layer ops basically talk to just the other side of a direct
298  * connection.
299  *
300  * LNK_PAD	- One-way message on circuit 0, ignored by target.  Used to
301  *		  pad message buffers on shared-memory transports.  Not
302  *		  typically used with TCP.
303  *
304  * LNK_PING	- One-way message on circuit-0, keep-alive, run by both sides
305  *		  typically 1/sec on idle link, link is lost after 10 seconds
306  *		  of inactivity.
307  *
308  * LNK_AUTH	- Authenticate the connection, negotiate administrative
309  *		  rights & encryption, protocol class, etc.  Only PAD and
310  *		  AUTH messages (not even PING) are accepted until
311  *		  authentication is complete.  This message also identifies
312  *		  the host.
313  *
314  * LNK_CONN	- Enable the SPAN protocol on circuit-0, possibly also
315  *		  installing a PFS filter (by cluster id, unique id, and/or
316  *		  wildcarded name).
317  *
318  * LNK_SPAN	- A SPAN transaction on circuit-0 enables messages to be
319  *		  relayed to/from a particular cluster node.  SPANs are
320  *		  received, sorted, aggregated, filtered, and retransmitted
321  *		  back out across all applicable connections.
322  *
323  *		  The leaf protocol also uses this to make a PFS available
324  *		  to the cluster (e.g. on-mount).
325  *
326  * LNK_CIRC	- a CIRC transaction establishes a circuit from source to
327  *		  target by creating pairs of open transactions across each
328  *		  hop.
329  *
330  * LNK_VOLCONF	- Volume header configuration change.  All hammer2
331  *		  connections (hammer2 connect ...) stored in the volume
332  *		  header are spammed on circuit 0 to the hammer2
333  *		  service daemon, and any live configuration change
334  *		  thereafter.
335  */
336 #define DMSG_LNK_PAD		DMSG_LNK(0x000, dmsg_hdr)	/* padding, ignored by target */
337 #define DMSG_LNK_PING		DMSG_LNK(0x001, dmsg_hdr)	/* keep-alive */
338 #define DMSG_LNK_AUTH		DMSG_LNK(0x010, dmsg_lnk_auth)	/* authenticate connection */
339 #define DMSG_LNK_CONN		DMSG_LNK(0x011, dmsg_lnk_conn)	/* enable SPAN protocol */
340 #define DMSG_LNK_SPAN		DMSG_LNK(0x012, dmsg_lnk_span)	/* advertise/relay a node */
341 #define DMSG_LNK_CIRC		DMSG_LNK(0x013, dmsg_lnk_circ)	/* establish a circuit */
342 #define DMSG_LNK_VOLCONF	DMSG_LNK(0x020, dmsg_lnk_volconf)	/* volume config change */
343 #define DMSG_LNK_ERROR		DMSG_LNK(0xFFF, dmsg_hdr)	/* simple result, base header only */
344 
345 /*
346  * LNK_AUTH - Authentication (often omitted)
347  */
348 struct dmsg_lnk_auth {
349 	dmsg_hdr_t	head;		/* base message header */
350 	char		dummy[64];	/* presumably a placeholder payload -- confirm */
351 };
352 
353 /*
354  * LNK_CONN - Register connection info for SPAN protocol
355  *	      (transaction, left open, circuit 0 only).
356  *
357  * LNK_CONN identifies a streaming connection into the cluster and serves
358  * to identify, enable, and specify filters for the SPAN protocol.
359  *
360  * peer_mask serves to filter the SPANs we receive by peer_type.  A cluster
361  * controller typically sets this to (uint64_t)-1, indicating that it wants
362  * everything.  A block devfs interface might set it to 1 << DMSG_PEER_BLOCK,
363  * and a hammer2 mount might set it to 1 << DMSG_PEER_HAMMER2.
364  *
365  * mediaid allows multiple (e.g. HAMMER2) connections belonging to the same
366  * media to transmit duplicative LNK_VOLCONF updates without causing
367  * confusion in the cluster controller.
368  *
369  * pfs_clid, pfs_fsid, pfs_type, and label are peer-specific and must be
370  * left empty (zero-fill) if not supported by a particular peer.
371  *
372  * DMSG_PEER_CLUSTER		filter: none
373  * DMSG_PEER_BLOCK		filter: label
374  * DMSG_PEER_HAMMER2		filter: pfs_clid if not empty, and label
375  */
376 struct dmsg_lnk_conn {
377 	dmsg_hdr_t	head;		/* base message header */
378 	uuid_t		mediaid;	/* media configuration id */
379 	uuid_t		pfs_clid;	/* rendezvous pfs uuid */
380 	uuid_t		pfs_fsid;	/* unique pfs uuid */
381 	uint64_t	peer_mask;	/* PEER mask for SPAN filtering, bits are 1 << DMSG_PEER_xxx */
382 	uint8_t		peer_type;	/* see DMSG_PEER_xxx */
383 	uint8_t		pfs_type;	/* pfs type, see DMSG_PFSTYPE_xxx */
384 	uint16_t	proto_version;	/* high level protocol support */
385 	uint32_t	status;		/* status flags */
386 	uint32_t	rnss;		/* node's generated rnss */
387 	uint8_t		reserved02[8];	/* reserved */
388 	uint32_t	reserved03[12];	/* reserved */
389 	uint64_t	pfs_mask;	/* PFS mask for SPAN filtering */
390 	char		cl_label[128];	/* cluster label (for PEER_BLOCK) */
391 	char		fs_label[128];	/* PFS label (for PEER_HAMMER2) */
392 };
393 
394 typedef struct dmsg_lnk_conn dmsg_lnk_conn_t;
395 
396 #define DMSG_PFSTYPE_NONE	0	/* pfs_type left empty (zero-fill) */
397 #define DMSG_PFSTYPE_ADMIN	1
398 #define DMSG_PFSTYPE_CLIENT	2
399 #define DMSG_PFSTYPE_CACHE	3
400 #define DMSG_PFSTYPE_COPY	4
401 #define DMSG_PFSTYPE_SLAVE	5
402 #define DMSG_PFSTYPE_SOFT_SLAVE	6
403 #define DMSG_PFSTYPE_SOFT_MASTER 7
404 #define DMSG_PFSTYPE_MASTER	8
405 #define DMSG_PFSTYPE_SERVER	9
406 #define DMSG_PFSTYPE_SNAPSHOT	10
407 #define DMSG_PFSTYPE_MAX	11	/* number of types; valid values are 0-10 */
408 
409 #define DMSG_PEER_NONE		0	/* no peer type given */
410 #define DMSG_PEER_CLUSTER	1	/* a cluster controller */
411 #define DMSG_PEER_BLOCK		2	/* block devices */
412 #define DMSG_PEER_HAMMER2	3	/* hammer2-mounted volumes */
413 
414 /*
415  * Structures embedded in LNK_SPAN
416  */
417 struct dmsg_media_block {
418 	uint64_t	bytes;		/* media size in bytes */
419 	uint32_t	blksize;	/* media block size */
420 };	/* NOTE(review): 12 bytes of fields, no explicit tail pad; sizeof depends on uint64_t alignment */
421 
422 typedef struct dmsg_media_block dmsg_media_block_t;
423 
424 /*
425  * LNK_SPAN - Initiate or relay a SPAN
426  *	      (transaction, left open, circuit 0 only)
427  *
428  * This message registers an end-point with the other end of the connection,
429  * telling the other end who we are and what we can provide or intend to
430  * consume.  Multiple registrations can be maintained as open transactions
431  * with each one specifying a unique end-point.
432  *
433  * Registrations are sent from {source}=S {1...n} to {target}=0 and maintained
434  * as open transactions.  Registrations are also received and maintained as
435  * open transactions, creating a matrix of linkid's.
436  *
437  * While these transactions are open additional transactions can be executed
438  * between any two linkid's {source}=S (registrations we sent) to {target}=T
439  * (registrations we received).
440  *
441  * Closure of any registration transaction will automatically abort any open
442  * transactions using the related linkids.  Closure can be initiated
443  * voluntarily from either side with either end issuing a DELETE, or they
444  * can be ABORTed.
445  *
446  * Status updates are performed via the open transaction.
447  *
448  * --
449  *
450  * A registration identifies a node and its various PFS parameters including
451  * the PFS_TYPE.  For example, a diskless HAMMER2 client typically identifies
452  * itself as PFSTYPE_CLIENT.
453  *
454  * Any node may serve as a cluster controller, aggregating and passing
455  * on received registrations, but end-points do not have to implement this
456  * ability.  Most end-points typically implement a single client-style or
457  * server-style PFS_TYPE and rendezvous at a cluster controller.
458  *
459  * The cluster controller does not aggregate/pass-on all received
460  * registrations.  It typically filters what gets passed on based on what it
461  * receives, passing on only the best candidates.
462  *
463  * If a symmetric spanning tree is desired additional candidates whose
464  * {dist, rnss} fields match the last best candidate must also be propagated.
465  * This feature is not currently enabled.
466  *
467  * STATUS UPDATES: Status updates use the same structure but typically
468  *		   only contain incremental changes to e.g. pfs_type, with
469  *		   a text description sent as out-of-band data.
470  */
471 struct dmsg_lnk_span {
472 	dmsg_hdr_t	head;		/* base message header */
473 	uuid_t		pfs_clid;	/* rendezvous pfs uuid */
474 	uuid_t		pfs_fsid;	/* unique pfs id (differentiate node) */
475 	uint8_t		pfs_type;	/* PFS type, see DMSG_PFSTYPE_xxx */
476 	uint8_t		peer_type;	/* PEER type, see DMSG_PEER_xxx */
477 	uint16_t	proto_version;	/* high level protocol support */
478 	uint32_t	status;		/* status flags */
479 	uint8_t		reserved02[8];	/* reserved */
480 	uint32_t	dist;		/* span distance */
481 	uint32_t	rnss;		/* random number sub-sort */
482 	union {
483 		uint32_t	reserved03[14];	/* reserves 56 bytes for the union */
484 		dmsg_media_block_t block;	/* block media info, presumably for PEER_BLOCK -- confirm */
485 	} media;
486 
487 	/*
488 	 * NOTE: for PEER_HAMMER2 cl_label is typically empty and fs_label
489 	 *	 is the superroot directory name.
490 	 *
491 	 *	 for PEER_BLOCK cl_label is typically host/device and
492 	 *	 fs_label is typically the serial number string.
493 	 */
494 	char		cl_label[128];	/* cluster label */
495 	char		fs_label[128];	/* PFS label */
496 };
497 
498 typedef struct dmsg_lnk_span dmsg_lnk_span_t;
499 
500 #define DMSG_SPAN_PROTO_1	1	/* presumably a proto_version value -- confirm */
501 
502 /*
503  * LNK_CIRC - Establish a circuit
504  *	      (transaction, left open, circuit 0 only)
505  *
506  * Establish a circuit to the specified target.  The msgid for the open
507  * transaction is used to transit messages in both directions.
508  *
509  * For circuit establishment the receiving entity looks up the outgoing
510  * relayed SPAN on the incoming iocom based on the target field and then
511  * creates a peer circuit on the interface the SPAN originally came in on.
512  * Messages received on one side are forwarded to the other side and vice versa.
513  * Any link state loss causes all related circuits to be lost.
514  */
515 struct dmsg_lnk_circ {
516 	dmsg_hdr_t	head;		/* base message header */
517 	uint64_t	reserved01;	/* reserved */
518 	uint64_t	target;		/* used by the receiver to look up the relayed SPAN */
519 };
520 
521 typedef struct dmsg_lnk_circ dmsg_lnk_circ_t;
522 
523 /*
524  * LNK_VOLCONF
525  *
526  * All HAMMER2 directories directly under the super-root on your local
527  * media can be mounted separately, even if they share the same physical
528  * device.
529  *
530  * When you do a HAMMER2 mount you are effectively tying into a HAMMER2
531  * cluster via local media.  The local media does not have to participate
532  * in the cluster, other than to provide the dmsg_vol_data[] array and
533  * root inode for the mount.
534  *
535  * This is important: The mount device path you specify serves to bootstrap
536  * your entry into the cluster, but your mount will make active connections
537  * to ALL copy elements in the dmsg_vol_data[] array which match the
538  * PFSID of the directory in the super-root that you specified.  The local
539  * media path does not have to be mentioned in this array but becomes part
540  * of the cluster based on its type and access rights.  ALL ELEMENTS ARE
541  * TREATED ACCORDING TO TYPE NO MATTER WHICH ONE YOU MOUNT FROM.
542  *
543  * The actual cluster may be far larger than the elements you list in the
544  * dmsg_vol_data[] array.  You list only the elements you wish to
545  * directly connect to and you are able to access the rest of the cluster
546  * indirectly through those connections.
547  *
548  * This structure must be exactly 128 bytes long.
549  *
550  * WARNING!  dmsg_vol_data is embedded in the hammer2 media volume header
551  */
552 struct dmsg_vol_data {
553 	uint8_t	copyid;		/* 00	 copyid 0-255 (must match slot) */
554 	uint8_t inprog;		/* 01	 operation in progress, or 0 */
555 	uint8_t chain_to;	/* 02	 operation chaining to, or 0 */
556 	uint8_t chain_from;	/* 03	 operation chaining from, or 0 */
557 	uint16_t flags;		/* 04-05 flags field */
558 	uint8_t error;		/* 06	 last operational error */
559 	uint8_t priority;	/* 07	 priority and round-robin flag */
560 	uint8_t remote_pfs_type;/* 08	 probed direct remote PFS type */
561 	uint8_t reserved08[23];	/* 09-1F reserved */
562 	uuid_t	pfs_clid;	/* 20-2F copy target must match this uuid */
563 	uint8_t label[16];	/* 30-3F import/export label */
564 	uint8_t path[64];	/* 40-7F target specification string or key */
565 };
566 
567 typedef struct dmsg_vol_data dmsg_vol_data_t;
568 
569 #define DMSG_VOLF_ENABLED	0x0001	/* presumably for dmsg_vol_data.flags -- confirm */
570 #define DMSG_VOLF_INPROG	0x0002	/* presumably for dmsg_vol_data.flags -- confirm */
571 #define DMSG_VOLF_CONN_RR	0x80	/* round-robin at same priority; presumably in .priority -- confirm */
572 #define DMSG_VOLF_CONN_EF	0x40	/* media errors flagged */
573 #define DMSG_VOLF_CONN_PRI	0x0F	/* select priority 0-15 (15=best) */
574 
575 #define DMSG_COPYID_COUNT	256	/* WARNING! embedded in hammer2 vol */
576 
577 struct dmsg_lnk_volconf {
578 	dmsg_hdr_t		head;	/* base message header */
579 	dmsg_vol_data_t		copy;	/* copy spec */
580 	int32_t			index;	/* presumably the copy slot being configured -- confirm */
581 	int32_t			unused01;	/* unused */
582 	uuid_t			mediaid;	/* presumably matches dmsg_lnk_conn.mediaid -- confirm */
583 	int64_t			reserved02[32];	/* reserved */
584 };
585 
586 typedef struct dmsg_lnk_volconf dmsg_lnk_volconf_t;
587 
588 /*
589  * Debug layer ops operate on any link
590  *
591  * SHELL	- Persist stream, access the debug shell on the target
592  *		  registration.  Multiple shells can be operational.
593  */
594 #define DMSG_DBG_SHELL		DMSG_DBG(0x001, dmsg_dbg_shell)	/* debug shell stream */
595 
596 struct dmsg_dbg_shell {
597 	dmsg_hdr_t	head;	/* base header only, no extended fields */
598 };
599 typedef struct dmsg_dbg_shell dmsg_dbg_shell_t;
600 
601 /*
602  * Domain layer ops operate on any link, link-0 may be used when the
603  * directly connected target is the desired registration.
604  *
605  * (nothing defined)
606  */
607 
608 /*
609  * Cache layer ops operate on any link, link-0 may be used when the
610  * directly connected target is the desired registration.
611  *
612  * LOCK		- Persist state, blockable, abortable.
613  *
614  *		  Obtain cache state (MODIFIED, EXCLUSIVE, SHARED, or INVAL)
615  *		  in any of four domains (TREE, INUM, ATTR, DIRENT) for a
616  *		  particular key relative to cache state already owned.
617  *
618  *		  TREE - Affects entire sub-tree at the specified element
619  *			 and will cause existing cache state owned by
620  *			 other nodes to be adjusted such that the request
621  *			 can be granted.
622  *
623  *		  INUM - Only affects inode creation/deletion of an existing
624  *			 element or a new element, by inumber and/or name.
625  *			 typically can be held for very long periods of time
626  *			 (think the vnode cache), directly relates to
627  *			 hammer2_chain structures representing inodes.
628  *
629  *		  ATTR - Only affects an inode's attributes, such as
630  *			 ownership, modes, etc.  Used for lookups, chdir,
631  *			 open, etc.  mtime has no effect.
632  *
633  *		  DIRENT - Only affects an inode's attributes plus the
634  *			 attributes or names related to any directory entry
635  *			 directly under this inode (non-recursively).  Can
636  *			 be retained for medium periods of time when doing
637  *			 directory scans.
638  *
639  *		  This function may block and can be aborted.  You may be
640  *		  granted cache state that is more broad than the state you
641  *		  requested (e.g. a different set of domains and/or an element
642  *		  at a higher layer in the tree).  When quorum operations
643  *		  are used you may have to reconcile these grants to the
644  *		  lowest common denominator.
645  *
646  *		  In order to grant your request either you or the target
647  *		  (or both) may have to obtain a quorum agreement.  Deadlock
648  *		  resolution may be required.  When doing it yourself you
649  *		  will typically maintain an active message to each master
650  *		  node in the system.  You can only grant the cache state
651  *		  when a quorum of nodes agree.
652  *
653  *		  The cache state includes transaction id information which
654  *		  can be used to resolve data requests.
655  */
656 #define DMSG_CAC_LOCK		DMSG_CAC(0x001, dmsg_cac_lock)	/* NOTE(review): struct dmsg_cac_lock is not defined in the visible part of this header */
657 
658 /*
659  * Quorum layer ops operate on any link, link-0 may be used when the
660  * directly connected target is the desired registration.
661  *
662  * COMMIT	- Persist state, blockable, abortable
663  *
664  *		  Issue a COMMIT in two phases.  A quorum must acknowledge
665  *		  the operation to proceed to phase-2.  Message-update to
666  *		  proceed to phase-2.
667  */
668 #define DMSG_QRM_COMMIT		DMSG_QRM(0x001, dmsg_qrm_commit)	/* NOTE(review): struct dmsg_qrm_commit is not defined in the visible part of this header */
669 
670 /*
671  * DMSG_PROTO_BLK Protocol
672  *
673  * BLK_OPEN	- Open device.  This transaction must be left open for the
674  *		  duration and the returned keyid passed in all associated
675  *		  BLK commands.  Multiple OPENs can be issued within the
676  *		  transaction.
677  *
678  * BLK_CLOSE	- Close device.  This can be used to close one of the opens
679  *		  within a BLK_OPEN transaction.  It may NOT initiate a
680  *		  transaction.  Note that a termination of the transaction
681  *		  (e.g. with LNK_ERROR or BLK_ERROR) closes all active OPENs
682  *		  for that transaction.
683  *
684  * BLK_READ	- Strategy read.  Not typically streaming.
685  *
686  * BLK_WRITE	- Strategy write.  Not typically streaming.
687  *
688  * BLK_FLUSH	- Strategy flush.  Not typically streaming.
689  *
690  * BLK_FREEBLKS	- Strategy freeblks.  Not typically streaming.
691  */
692 #define DMSG_BLK_OPEN		DMSG_BLK(0x001, dmsg_blk_open)	/* open device */
693 #define DMSG_BLK_CLOSE		DMSG_BLK(0x002, dmsg_blk_open)	/* close device; reuses struct dmsg_blk_open */
694 #define DMSG_BLK_READ		DMSG_BLK(0x003, dmsg_blk_read)	/* strategy read */
695 #define DMSG_BLK_WRITE		DMSG_BLK(0x004, dmsg_blk_write)	/* strategy write */
696 #define DMSG_BLK_FLUSH		DMSG_BLK(0x005, dmsg_blk_flush)	/* strategy flush */
697 #define DMSG_BLK_FREEBLKS	DMSG_BLK(0x006, dmsg_blk_freeblks)	/* strategy freeblks */
698 #define DMSG_BLK_ERROR		DMSG_BLK(0xFFF, dmsg_blk_error)	/* extended result */
699 
700 struct dmsg_blk_open {
701 	dmsg_hdr_t	head;		/* base message header */
702 	uint32_t	modes;		/* presumably DMSG_BLKOPEN_xxx bits -- confirm */
703 	uint32_t	reserved01;	/* reserved */
704 };
705 
706 #define DMSG_BLKOPEN_RD		0x0001	/* presumably open for reading -- confirm */
707 #define DMSG_BLKOPEN_WR		0x0002	/* presumably open for writing -- confirm */
708 
709 /*
710  * DMSG_LNK_ERROR is returned for simple results,
711  * DMSG_BLK_ERROR is returned for extended results.
712  */
713 struct dmsg_blk_error {
714 	dmsg_hdr_t	head;		/* base message header */
715 	uint64_t	keyid;		/* keyid returned by BLK_OPEN */
716 	uint32_t	resid;		/* presumably residual byte count -- confirm */
717 	uint32_t	reserved02;	/* reserved */
718 	char		buf[64];	/* NOTE(review): contents not described in this header */
719 };
720 
721 struct dmsg_blk_read {
722 	dmsg_hdr_t	head;		/* base message header */
723 	uint64_t	keyid;		/* keyid returned by BLK_OPEN */
724 	uint64_t	offset;		/* presumably byte offset on the device -- confirm */
725 	uint32_t	bytes;		/* presumably length in bytes -- confirm */
726 	uint32_t	flags;		/* no flag values defined in the visible header */
727 	uint32_t	reserved01;	/* reserved */
728 	uint32_t	reserved02;	/* reserved */
729 };
730 
731 struct dmsg_blk_write {
732 	dmsg_hdr_t	head;		/* base message header */
733 	uint64_t	keyid;		/* keyid returned by BLK_OPEN */
734 	uint64_t	offset;		/* presumably byte offset on the device -- confirm */
735 	uint32_t	bytes;		/* presumably length in bytes -- confirm */
736 	uint32_t	flags;		/* no flag values defined in the visible header */
737 	uint32_t	reserved01;	/* reserved */
738 	uint32_t	reserved02;	/* reserved */
739 };
740 
741 struct dmsg_blk_flush {
742 	dmsg_hdr_t	head;		/* base message header */
743 	uint64_t	keyid;		/* keyid returned by BLK_OPEN */
744 	uint64_t	offset;		/* presumably byte offset on the device -- confirm */
745 	uint32_t	bytes;		/* presumably length in bytes -- confirm */
746 	uint32_t	flags;		/* no flag values defined in the visible header */
747 	uint32_t	reserved01;	/* reserved */
748 	uint32_t	reserved02;	/* reserved */
749 };
750 
751 struct dmsg_blk_freeblks {
752 	dmsg_hdr_t	head;		/* base message header */
753 	uint64_t	keyid;		/* keyid returned by BLK_OPEN */
754 	uint64_t	offset;		/* presumably byte offset on the device -- confirm */
755 	uint32_t	bytes;		/* presumably length in bytes -- confirm */
756 	uint32_t	flags;		/* no flag values defined in the visible header */
757 	uint32_t	reserved01;	/* reserved */
758 	uint32_t	reserved02;	/* reserved */
759 };
760 
761 typedef struct dmsg_blk_open		dmsg_blk_open_t;
762 typedef struct dmsg_blk_read		dmsg_blk_read_t;
763 typedef struct dmsg_blk_write		dmsg_blk_write_t;
764 typedef struct dmsg_blk_flush		dmsg_blk_flush_t;
765 typedef struct dmsg_blk_freeblks	dmsg_blk_freeblks_t;
766 typedef struct dmsg_blk_error		dmsg_blk_error_t;
767 
768 /*
769  * NOTE!!!! ALL EXTENDED HEADER STRUCTURES MUST BE 64-BYTE ALIGNED!!!
770  *
771  * General message errors
772  *
773  *	0x00 - 0x1F	Local iocomm errors
774  *	0x20 - 0x2F	Global errors
775  */
776 #define DMSG_ERR_NOSUPP		0x20	/* presumably: not supported -- confirm */
777 #define DMSG_ERR_LOSTLINK	0x21	/* presumably: link was lost -- confirm */
778 #define DMSG_ERR_IO		0x22	/* generic */
779 #define DMSG_ERR_PARAM		0x23	/* generic */
780 #define DMSG_ERR_CANTCIRC	0x24	/* (typically means lost span) */
781 
/*
 * Overlay of the message header types: the common header (head), the
 * lnk_* headers and the blk_* headers all share the same storage.
 * The buf[] member makes the union at least DMSG_HDR_MAX bytes large.
 */
782 union dmsg_any {
783 	char			buf[DMSG_HDR_MAX];
784 	dmsg_hdr_t		head;
785 
786 	dmsg_lnk_conn_t		lnk_conn;
787 	dmsg_lnk_span_t		lnk_span;
788 	dmsg_lnk_circ_t		lnk_circ;
789 	dmsg_lnk_volconf_t	lnk_volconf;
790 
791 	dmsg_blk_open_t		blk_open;
792 	dmsg_blk_error_t	blk_error;
793 	dmsg_blk_read_t		blk_read;
794 	dmsg_blk_write_t	blk_write;
795 	dmsg_blk_flush_t	blk_flush;
796 	dmsg_blk_freeblks_t	blk_freeblks;
797 };
798 
799 typedef union dmsg_any dmsg_any_t;
800 
801 /*
802  * Kernel iocom structures and prototypes for kern/kern_dmsg.c
803  */
804 #if defined(_KERNEL) || defined(_KERNEL_STRUCTURES)
805 
/*
 * Forward declarations.  The kdmsg structures below point at one another,
 * and struct hammer2_pfsmount is only referenced through a pointer
 * (kdmsg_state.any.pmp), so incomplete types are sufficient here.
 */
806 struct hammer2_pfsmount;
807 struct kdmsg_iocom;
808 struct kdmsg_state;
809 struct kdmsg_msg;
810 
811 /*
812  * msg_ctl flags (atomic)
 *
 * Distinct single-bit values; struct kdmsg_iocom has an int msg_ctl
 * field described as "wakeup flags".
813  */
814 #define KDMSG_CLUSTERCTL_KILL		0x00000001
815 #define KDMSG_CLUSTERCTL_KILLRX		0x00000002 /* staged helper exit */
816 #define KDMSG_CLUSTERCTL_KILLTX		0x00000004 /* staged helper exit */
817 #define KDMSG_CLUSTERCTL_SLEEPING	0x00000008 /* interlocked w/msglk */
818 
819 /*
820  * When the KDMSG_IOCOMF_AUTOCIRC flag is set the kdmsg code in
821  * the kernel automatically tries to forge a virtual circuit for
822  * any active SPAN state received.
823  *
824  * This is only done when the received SPANs are significantly filtered
825  * by the transmitted LNK_CONN.  That is, it is done only by clients who
826  * connect to specific services over the cluster.
 *
 * NOTE(review): the KDMSG_IOCOMF_* definitions describe AUTOCIRC as
 * "handle received LNK_CIRC" and AUTOFORGE as "auto initiate LNK_CIRC";
 * confirm which flag actually gates the forging described above.
 *
 * Each circuit carries a red-black tree linkage (rbnode), a tail queue
 * linkage (entry) and a pointer to a kdmsg_iocom.  Fields marked
 * "written by shim" are annotated that way by the original author.
827  */
828 struct kdmsg_circuit {
829 	RB_ENTRY(kdmsg_circuit) rbnode;		/* indexed by msgid */
830 	TAILQ_ENTRY(kdmsg_circuit) entry;	/* written by shim */
831 	struct kdmsg_iocom	*iocom;		/* written by shim */
832 	struct kdmsg_state	*span_state;
833 	struct kdmsg_state	*circ_state;	/* master circuit */
834 	struct kdmsg_state	*rcirc_state;	/* slave circuit */
835 	uint64_t		msgid;		/* tree index, see rbnode */
836 	int			weight;
837 	int			recorded;	/* written by shim */
838 	int			lost;		/* written by shim */
839 	int			refs;		/* written by shim */
840 };
841 
842 typedef struct kdmsg_circuit kdmsg_circuit_t;
843 
844 /*
845  * Transactional state structure, representing an open transaction.  The
846  * transaction might represent a cache state (and thus have a chain
847  * association), or a VOP op, LNK_SPAN, or other things.
 *
 * The trailing 'any' union provides three differently typed views of a
 * single pointer (untyped, hammer2 pfsmount, circuit).
 *
 * NOTE(review): 'flags' presumably holds KDMSG_STATE_* bits and 'func'
 * is presumably the per-transaction callback; its return value
 * semantics are not visible in this header -- confirm.
848  */
849 struct kdmsg_state {
850 	RB_ENTRY(kdmsg_state) rbnode;		/* indexed by msgid */
851 	struct kdmsg_iocom *iocom;
852 	struct kdmsg_circuit *circ;
853 	uint32_t	icmd;			/* record cmd creating state */
854 	uint32_t	txcmd;			/* mostly for CMDF flags */
855 	uint32_t	rxcmd;			/* mostly for CMDF flags */
856 	uint64_t	msgid;			/* {circuit,msgid} uniq */
857 	int		flags;
858 	int		error;
859 	void		*chain;			/* (caller's state) */
860 	struct kdmsg_msg *msg;
861 	int (*func)(struct kdmsg_state *, struct kdmsg_msg *);
862 	union {
863 		void *any;
864 		struct hammer2_pfsmount *pmp;
865 		struct kdmsg_circuit *circ;
866 	} any;
867 };
868 
/*
 * Distinct single-bit values.
 * NOTE(review): presumably the bits kept in kdmsg_state.flags -- confirm.
 */
869 #define KDMSG_STATE_INSERTED	0x0001
870 #define KDMSG_STATE_DYNAMIC	0x0002
871 #define KDMSG_STATE_DELPEND	0x0004		/* transmit delete pending */
872 #define KDMSG_STATE_ABORTING	0x0008		/* avoids recursive abort */
873 
/*
 * A single message.  It can be linked on a tail queue through qentry,
 * points at an iocom, a transaction state and a circuit, and embeds the
 * message header storage directly as a dmsg_any_t.
 *
 * NOTE(review): hdr_size/aux_size are presumably the byte sizes of the
 * header in 'any' and of the buffer at aux_data -- confirm.
 */
874 struct kdmsg_msg {
875 	TAILQ_ENTRY(kdmsg_msg) qentry;		/* serialized queue */
876 	struct kdmsg_iocom *iocom;
877 	struct kdmsg_state *state;
878 	struct kdmsg_circuit *circ;
879 	size_t		hdr_size;
880 	size_t		aux_size;
881 	char		*aux_data;
882 	int		flags;
883 	dmsg_any_t	any;			/* embedded message header */
884 };
885 
/*
 * NOTE(review): presumably a kdmsg_msg.flags bit recording that aux_data
 * was allocated for the message -- confirm against kern_dmsg.c.
 */
886 #define KDMSG_FLAG_AUXALLOC	0x0001
887 
/*
 * _t aliases for the kernel message structures.
 *
 * NOTE(review): no struct kdmsg_link definition accompanies this typedef
 * in the kernel section of this header, so kdmsg_link_t names an
 * incomplete type here; it may be a leftover -- confirm.
 */
888 typedef struct kdmsg_link kdmsg_link_t;
889 typedef struct kdmsg_state kdmsg_state_t;
890 typedef struct kdmsg_msg kdmsg_msg_t;
891 
/*
 * Red-black tree declarations.  kdmsg_state_tree holds struct kdmsg_state
 * nodes linked through their rbnode field and ordered by
 * kdmsg_state_cmp().
 */
892 struct kdmsg_state_tree;
893 int kdmsg_state_cmp(kdmsg_state_t *state1, kdmsg_state_t *state2);
894 RB_HEAD(kdmsg_state_tree, kdmsg_state);
895 RB_PROTOTYPE(kdmsg_state_tree, kdmsg_state, rbnode, kdmsg_state_cmp);
896 
/*
 * kdmsg_circuit_tree holds struct kdmsg_circuit nodes linked through
 * their rbnode field and ordered by kdmsg_circuit_cmp().
 */
897 struct kdmsg_circuit_tree;
898 int kdmsg_circuit_cmp(kdmsg_circuit_t *circ1, kdmsg_circuit_t *circ2);
899 RB_HEAD(kdmsg_circuit_tree, kdmsg_circuit);
900 RB_PROTOTYPE(kdmsg_circuit_tree, kdmsg_circuit, rbnode, kdmsg_circuit_cmp);
901 
902 /*
903  * Structure embedded in e.g. mount, master control structure for
904  * DMSG stream handling.
 *
 * It holds two thread handles, a lockmgr lock, a transmit queue of
 * kdmsg_msg, separate rd/wr state trees with a one-entry allocation
 * cache each, a circuit tree, and embedded LNK_CONN / LNK_SPAN headers.
 *
 * NOTE(review): handle, mmsg, flags and rcvmsg match the parameters of
 * kdmsg_iocom_init() and are presumably stored from them; 'flags'
 * presumably holds KDMSG_IOCOMF_* bits -- confirm.
905  */
906 struct kdmsg_iocom {
907 	struct malloc_type	*mmsg;
908 	struct file		*msg_fp;	/* cluster pipe->userland */
909 	thread_t		msgrd_td;	/* cluster thread */
910 	thread_t		msgwr_td;	/* cluster thread */
911 	int			msg_ctl;	/* wakeup flags */
912 	int			msg_seq;	/* cluster msg sequence id */
913 	uint32_t		flags;
914 	struct lock		msglk;		/* lockmgr lock */
915 	TAILQ_HEAD(, kdmsg_msg) msgq;		/* transmit queue */
916 	void			*handle;
917 	void			(*auto_callback)(kdmsg_msg_t *);
918 	int			(*rcvmsg)(kdmsg_msg_t *);
919 	void			(*exit_func)(struct kdmsg_iocom *);
920 	struct kdmsg_state	*conn_state;	/* active LNK_CONN state */
921 	struct kdmsg_state	*freerd_state;	/* allocation cache */
922 	struct kdmsg_state	*freewr_state;	/* allocation cache */
923 	struct kdmsg_state_tree staterd_tree;	/* active messages */
924 	struct kdmsg_state_tree statewr_tree;	/* active messages */
925 	struct kdmsg_circuit_tree circ_tree;	/* active circuits */
926 	dmsg_lnk_conn_t		auto_lnk_conn;
927 	dmsg_lnk_span_t		auto_lnk_span;
928 };
929 
930 typedef struct kdmsg_iocom	kdmsg_iocom_t;
931 
/*
 * KDMSG_IOCOMF_* bits.  KDMSG_IOCOMF_AUTOANY is the OR of the four AUTO*
 * bits; KDMSG_IOCOMF_EXITNOACC is not part of it.
 */
932 #define KDMSG_IOCOMF_AUTOCONN	0x0001	/* handle received LNK_CONN */
933 #define KDMSG_IOCOMF_AUTOSPAN	0x0002	/* handle received LNK_SPAN */
934 #define KDMSG_IOCOMF_AUTOCIRC	0x0004	/* handle received LNK_CIRC */
935 #define KDMSG_IOCOMF_AUTOFORGE	0x0008	/* auto initiate LNK_CIRC */
936 #define KDMSG_IOCOMF_EXITNOACC	0x0010	/* cannot accept writes */
937 
938 #define KDMSG_IOCOMF_AUTOANY	(KDMSG_IOCOMF_AUTOCONN |	\
939 				 KDMSG_IOCOMF_AUTOSPAN |	\
940 				 KDMSG_IOCOMF_AUTOCIRC |	\
941 				 KDMSG_IOCOMF_AUTOFORGE)
942 
/*
 * CRC helpers over a (buf, size) byte range, returning a 32-bit value.
 * NOTE(review): the extra 'crc' argument of kdmsg_icrc32c() is presumably
 * a running CRC to continue from -- confirm against the implementation.
 */
943 uint32_t kdmsg_icrc32(const void *buf, size_t size);
944 uint32_t kdmsg_icrc32c(const void *buf, size_t size, uint32_t crc);
945 
946 /*
947  * kern_dmsg.c
 *
 * NOTE(review): kdmsg_iocom_init() spells its flags parameter u_int32_t
 * while struct kdmsg_iocom declares flags as uint32_t.
948  */
/* iocom-level entry points; each takes the kdmsg_iocom_t it operates on */
949 void kdmsg_iocom_init(kdmsg_iocom_t *iocom, void *handle, u_int32_t flags,
950 			struct malloc_type *mmsg,
951 			int (*rcvmsg)(kdmsg_msg_t *msg));
952 void kdmsg_iocom_reconnect(kdmsg_iocom_t *iocom, struct file *fp,
953 			const char *subsysname);
954 void kdmsg_iocom_autoinitiate(kdmsg_iocom_t *iocom,
955 			void (*conn_callback)(kdmsg_msg_t *msg));
956 void kdmsg_iocom_uninit(kdmsg_iocom_t *iocom);
957 void kdmsg_drain_msgq(kdmsg_iocom_t *iocom);
958 
/*
 * Message-level entry points.  The two allocators return a kdmsg_msg_t
 * and accept a callback with the same signature as kdmsg_state.func.
 * NOTE(review): the 'error' arguments are presumably DMSG_ERR_* style
 * codes -- confirm.
 */
959 void kdmsg_msg_free(kdmsg_msg_t *msg);
960 kdmsg_msg_t *kdmsg_msg_alloc(kdmsg_iocom_t *iocom, kdmsg_circuit_t *circ,
961 				uint32_t cmd,
962 				int (*func)(kdmsg_state_t *, kdmsg_msg_t *),
963 				void *data);
964 kdmsg_msg_t *kdmsg_msg_alloc_state(kdmsg_state_t *state, uint32_t cmd,
965 				int (*func)(kdmsg_state_t *, kdmsg_msg_t *),
966 				void *data);
967 void kdmsg_msg_write(kdmsg_msg_t *msg);
968 void kdmsg_msg_reply(kdmsg_msg_t *msg, uint32_t error);
969 void kdmsg_msg_result(kdmsg_msg_t *msg, uint32_t error);
970 void kdmsg_state_reply(kdmsg_state_t *state, uint32_t error);
971 void kdmsg_state_result(kdmsg_state_t *state, uint32_t error);
972 
/* NOTE(review): presumably adjust kdmsg_circuit.refs -- confirm. */
973 void kdmsg_circ_hold(kdmsg_circuit_t *circ);
974 void kdmsg_circ_drop(kdmsg_circuit_t *circ);
975 
976 
977 #endif
978 
979 #endif
980