xref: /dragonfly/sys/sys/dmsg.h (revision cecb9aae)
1 /*
2  * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 
35 #ifndef _SYS_DMSG_H_
36 #define _SYS_DMSG_H_
37 
38 #ifndef _SYS_MALLOC_H_
39 #include <sys/malloc.h>
40 #endif
41 #ifndef _SYS_TREE_H_
42 #include <sys/tree.h>
43 #endif
44 #ifndef _SYS_THREAD_H_
45 #include <sys/thread.h>
46 #endif
47 #ifndef _SYS_UUID_H_
48 #include <sys/uuid.h>
49 #endif
50 
51 /*
52  * Mesh network protocol structures.
53  *
54  *				CONN PROTOCOL
55  *
56  * The mesh is constructed from point-to-point streaming links with varying
57  * levels of interconnectedness, forming a graph.  Termini in the graph
58  * are entities such as a HAMMER2 PFS or a network mount or other types
59  * of nodes.
60  *
61  * Upon connecting and after authentication, a LNK_CONN transaction is opened
62  * on circuit 0 by both ends.  This configures and enables the SPAN protocol.
63  * The LNK_CONN transaction remains open for the life of the connection.
64  *
65  *				SPAN PROTOCOL
66  *
67  * Once enabled, each terminus transmits a representative LNK_SPAN out all
68  * available connections advertising what it is.  Nodes maintaining multiple
69  * connections will relay received LNK_SPANs out available connections
70  * with some filtering based on the CONN configuration.  A distance metric
71  * and per-node random value (rnss) is aggregated.
72  *
73  * Since LNK_SPANs can rapidly multiply in a complex graph, not all incoming
74  * LNK_SPANs will be relayed.  Only the top N over all collected LNK_SPANs for
75  * any given advertisement are relayed.
76  *
77  * It is possible to code the SPANning tree algorithm to guarantee that
78  * symmetrical spans will be generated after stabilization.  The RNSS field
79  * is used to help distinguish and reduce paths in complex graphs when
80  * symmetric spans are desired.  We always generate RNSS but we currently do
81  * not implement symmetrical SPAN guarantees.
82  *
83  *				CIRC PROTOCOL
84  *
85  * We aren't done yet.  Before transactions can be relayed, symmetric paths
86  * must be formed via the LNK_CIRC protocol.  The LNK_CIRC protocol
87  * establishes a virtual circuit from any node to any other node, creating
88  * a circuit id which is stored in dmsg_hdr.circuit.  Messages received on
89  * one side are forwarded to the other.  Forwarded messages bypass normal
90  * state tracking.
91  *
92  * A virtual circuit is forged by working the propagated SPANs backwards.
93  * Each node in the graph helps propagate the virtual circuit by attaching the
94  * LNK_CIRC transaction it receives to a LNK_CIRC transaction it initiates
95  * out the other interface.
96  *
97  * Since SPANs are link-state transactions any change in related span(s)
98  * will also force-terminate VC's using those spans.
99  *
100  *			MESSAGE TRANSACTIONAL STATES
101  *
102  * Message state is handled by the CREATE, DELETE, REPLY, and ABORT
103  * flags.  Message state is typically recorded at the end points and
104  * at each hop until a DELETE is received from both sides.
105  *
106  * One-way messages such as those used by spanning tree commands are not
107  * recorded.  These are sent without the CREATE, DELETE, or ABORT flags set.
108  * ABORT is not supported for one-off messages.  The REPLY bit can be used
109  * to distinguish between command and status if desired.
110  *
111  * Persistent-state messages are messages which require a reply to be
112  * returned.  These messages can also consist of multiple message elements
113  * for the command or reply or both (or neither).  The command message
114  * sequence sets CREATE on the first message and DELETE on the last message.
115  * A single message command sets both (CREATE|DELETE).  The reply message
116  * sequence works the same way but of course also sets the REPLY bit.
117  *
118  * Persistent-state messages can be aborted by sending a message element
119  * with the ABORT flag set.  This flag can be combined with either or both
120  * the CREATE and DELETE flags.  When combined with the CREATE flag the
121  * command is treated as non-blocking but still executes.  When combined
122  * with the DELETE flag no additional message elements are required.
123  *
124  * ABORT SPECIAL CASE - Mid-stream aborts.  A mid-stream abort can be sent
125  * when supported by the sender by sending an ABORT message with neither
126  * CREATE nor DELETE set.  This effectively turns the message into a
127  * non-blocking message (but depending on what is being represented can also
128  * cut short prior data elements in the stream).
129  *
130  * ABORT SPECIAL CASE - Abort-after-DELETE.  Persistent messages have to be
131  * abortable if the stream/pipe/whatever is lost.  In this situation any
132  * forwarding relay needs to unconditionally abort commands and replies that
133  * are still active.  This is done by sending an ABORT|DELETE even in
134  * situations where a DELETE has already been sent in that direction.  This
135  * is done, for example, when links are in a half-closed state.  In this
136  * situation it is possible for the abort request to race a transition to the
137  * fully closed state.  ABORT|DELETE messages which race the fully closed
138  * state are expected to be discarded by the other end.
139  *
140  * --
141  *
142  * All base and extended message headers are 64-byte aligned, and all
143  * transports must support extended message headers up to DMSG_HDR_MAX.
144  * Currently we allow extended message headers up to 2048 bytes.  Note
145  * that the extended header size is encoded in the 'cmd' field of the header.
146  *
147  * Any in-band data is padded to a 64-byte alignment and placed directly
148  * after the extended header (after the higher-level cmd/rep structure).
149  * The actual unaligned size of the in-band data is encoded in the aux_bytes
150  * field in this case.  Maximum data sizes are negotiated during registration.
151  *
152  * Auxiliary data can be in-band or out-of-band.  In-band data sets aux_descr
153  * equal to 0.  Any out-of-band data must be negotiated by the SPAN protocol.
154  *
155  * Auxiliary data, whether in-band or out-of-band, must be at least 64-byte
156  * aligned.  The aux_bytes field contains the actual byte-granular length
157  * and not the aligned length.  The crc is against the aligned length (so
158  * a faster crc algorithm can be used, theoretically).
159  *
160  * hdr_crc is calculated over the entire, ALIGNED extended header.  For
161  * the purposes of calculating the crc, the hdr_crc field is 0.  That is,
162  * if calculating the crc in HW a 32-bit '0' must be inserted in place of
163  * the hdr_crc field when reading the entire header and compared at the
164  * end (but the actual hdr_crc must be left intact in memory).  A simple
165  * counter to replace the field going into the CRC generator does the job
166  * in HW.  The CRC endian is based on the magic number field and may have
167  * to be byte-swapped, too (which is also easy to do in HW).
168  *
169  * aux_crc is calculated over the entire, ALIGNED auxiliary data.
170  *
171  *			SHARED MEMORY IMPLEMENTATIONS
172  *
173  * Shared-memory implementations typically use a pipe to transmit the extended
174  * message header and shared memory to store any auxiliary data.  Auxiliary
175  * data in one-way (non-transactional) messages is typically required to be
176  * inline.  CRCs are still recommended and required at the beginning, but
177  * may be negotiated away later.
178  */
/*
 * Base message header shared by every message.  The two-digit hex value
 * leading each field comment is the field's byte offset; the fields add up
 * to 64 bytes.
 */
typedef struct dmsg_hdr {
	/* 0x00 - identification */
	uint16_t	magic;		/* 00 sanity, synchro, endian */
	uint16_t	reserved02;	/* 02 */
	uint32_t	salt;		/* 04 random salt helps w/crypto */

	/* 0x08 - transaction and routing */
	uint64_t	msgid;		/* 08 message transaction id */
	uint64_t	circuit;	/* 10 circuit id or 0 */
	uint64_t	reserved18;	/* 18 */

	/* 0x20 - command, auxiliary data, checksums */
	uint32_t	cmd;		/* 20 flags | cmd | hdr_size / ALIGN */
	uint32_t	aux_crc;	/* 24 auxiliary data crc */
	uint32_t	aux_bytes;	/* 28 auxiliary data length (bytes) */
	uint32_t	error;		/* 2C error code or 0 */
	uint64_t	aux_descr;	/* 30 negotiated OOB data descr */
	uint32_t	reserved38;	/* 38 */
	uint32_t	hdr_crc;	/* 3C (aligned) extended header crc */
} dmsg_hdr_t;
198 
/*
 * Header magic.  DMSG_HDR_MAGIC_REV is DMSG_HDR_MAGIC with its two bytes
 * swapped.
 */
#define DMSG_HDR_MAGIC		0x4832
#define DMSG_HDR_MAGIC_REV	0x3248

/*
 * Byte offset of 'salt' within dmsg_hdr_t, and the number of bytes from
 * there to the end of dmsg_hdr_t.
 */
#define DMSG_HDR_CRCOFF		offsetof(dmsg_hdr_t, salt)
#define DMSG_HDR_CRCBYTES	(sizeof(dmsg_hdr_t) - DMSG_HDR_CRCOFF)

/*
 * Administrative protocol limits.  DMSG_BUF_SIZE is four maximum-sized
 * headers; DMSG_BUF_MASK is DMSG_BUF_SIZE - 1.
 */
#define DMSG_HDR_MAX		2048		/* <= 65535 */
#define DMSG_AUX_MAX		65536		/* <= 1MB */
#define DMSG_BUF_SIZE		(4 * DMSG_HDR_MAX)
#define DMSG_BUF_MASK		(DMSG_BUF_SIZE - 1)
211 
/*
 * The message (cmd) field packs, from the top bit down:
 *
 *	DMSGF_FLAGS	bits 31-24	transaction / aux-data flags
 *	DMSGF_PROTOS	bits 23-20	protocol selector
 *	DMSGF_CMDS	bits 19-8	command number
 *	DMSGF_SIZE	bits 7-0	header size in DMSG_ALIGN units
 *
 * Carrying the flags and the total header size inside (cmd) allows the
 * protocol processors to validate persistency and structural settings
 * for every command simply by switch()ing on the (cmd) field.
 */
#define DMSGF_CREATE		0x80000000U	/* msg start */
#define DMSGF_DELETE		0x40000000U	/* msg end */
#define DMSGF_REPLY		0x20000000U	/* reply path */
#define DMSGF_ABORT		0x10000000U	/* abort req */
#define DMSGF_AUXOOB		0x08000000U	/* aux-data is OOB */
#define DMSGF_FLAG2		0x04000000U
#define DMSGF_FLAG1		0x02000000U
#define DMSGF_FLAG0		0x01000000U

/* Field masks */
#define DMSGF_FLAGS		0xFF000000U	/* all flags */
#define DMSGF_PROTOS		0x00F00000U	/* all protos */
#define DMSGF_CMDS		0x000FFF00U	/* all cmds */
#define DMSGF_SIZE		0x000000FFU	/* N*DMSG_ALIGN */

/* proto + cmd + size */
#define DMSGF_BASECMDMASK	(DMSGF_PROTOS | DMSGF_CMDS | DMSGF_SIZE)

/* base command plus the reply bit */
#define DMSGF_CMDSWMASK		(DMSGF_BASECMDMASK | DMSGF_REPLY)

/* base command plus the reply, create and delete bits */
#define DMSGF_TRANSMASK		(DMSGF_CMDSWMASK |	\
				 DMSGF_CREATE | DMSGF_DELETE)
247 
/* Protocol selectors; each value sits in the DMSGF_PROTOS bits (23-20). */
#define DMSG_PROTO_LNK		(0x0U << 20)
#define DMSG_PROTO_DBG		(0x1U << 20)
#define DMSG_PROTO_DOM		(0x2U << 20)
#define DMSG_PROTO_CAC		(0x3U << 20)
#define DMSG_PROTO_QRM		(0x4U << 20)
#define DMSG_PROTO_BLK		(0x5U << 20)
#define DMSG_PROTO_VOP		(0x6U << 20)
255 
/*
 * Message command constructors, sans flags
 *
 * DMSG_DOALIGN() rounds a byte count up to the next multiple of
 * DMSG_ALIGN.  DMSG_HDR_ENCODE() gives the size of 'struct elm' in
 * DMSG_ALIGN units, rounded up.
 */
#define DMSG_ALIGN		64
#define DMSG_ALIGNMASK		(DMSG_ALIGN - 1)
#define DMSG_DOALIGN(bytes)	\
	(((bytes) + DMSG_ALIGNMASK) & ~DMSG_ALIGNMASK)

#define DMSG_HDR_ENCODE(elm)	\
	(((uint32_t)sizeof(struct elm) + DMSG_ALIGNMASK) / DMSG_ALIGN)
267 
/*
 * One constructor per protocol.  Each ORs together the protocol selector,
 * the command number shifted left by 8, and the encoded header size of
 * 'struct elm'.
 */
#define DMSG_LNK(cmd, elm)	\
	(DMSG_PROTO_LNK | ((cmd) << 8) | DMSG_HDR_ENCODE(elm))

#define DMSG_DBG(cmd, elm)	\
	(DMSG_PROTO_DBG | ((cmd) << 8) | DMSG_HDR_ENCODE(elm))

#define DMSG_DOM(cmd, elm)	\
	(DMSG_PROTO_DOM | ((cmd) << 8) | DMSG_HDR_ENCODE(elm))

#define DMSG_CAC(cmd, elm)	\
	(DMSG_PROTO_CAC | ((cmd) << 8) | DMSG_HDR_ENCODE(elm))

#define DMSG_QRM(cmd, elm)	\
	(DMSG_PROTO_QRM | ((cmd) << 8) | DMSG_HDR_ENCODE(elm))

#define DMSG_BLK(cmd, elm)	\
	(DMSG_PROTO_BLK | ((cmd) << 8) | DMSG_HDR_ENCODE(elm))

#define DMSG_VOP(cmd, elm)	\
	(DMSG_PROTO_VOP | ((cmd) << 8) | DMSG_HDR_ENCODE(elm))
295 
296 /*
297  * Link layer ops basically talk to just the other side of a direct
298  * connection.
299  *
300  * LNK_PAD	- One-way message on circuit 0, ignored by target.  Used to
301  *		  pad message buffers on shared-memory transports.  Not
302  *		  typically used with TCP.
303  *
304  * LNK_PING	- One-way message on circuit-0, keep-alive, run by both sides
305  *		  typically 1/sec on idle link, link is lost after 10 seconds
306  *		  of inactivity.
307  *
308  * LNK_AUTH	- Authenticate the connection, negotiate administrative
309  *		  rights & encryption, protocol class, etc.  Only PAD and
310  *		  AUTH messages (not even PING) are accepted until
311  *		  authentication is complete.  This message also identifies
312  *		  the host.
313  *
314  * LNK_CONN	- Enable the SPAN protocol on circuit-0, possibly also
315  *		  installing a PFS filter (by cluster id, unique id, and/or
316  *		  wildcarded name).
317  *
318  * LNK_SPAN	- A SPAN transaction on circuit-0 enables messages to be
319  *		  relayed to/from a particular cluster node.  SPANs are
320  *		  received, sorted, aggregated, filtered, and retransmitted
321  *		  back out across all applicable connections.
322  *
323  *		  The leaf protocol also uses this to make a PFS available
324  *		  to the cluster (e.g. on-mount).
325  *
326  * LNK_CIRC	- a CIRC transaction establishes a circuit from source to
327  *		  target by creating pairs of open transactions across each
328  *		  hop.
329  *
330  * LNK_VOLCONF	- Volume header configuration change.  All hammer2
331  *		  connections (hammer2 connect ...) stored in the volume
332  *		  header are spammed on circuit 0 to the hammer2
333  *		  service daemon, and any live configuration change
334  *		  thereafter.
335  */
/* Commands carrying only the base header */
#define DMSG_LNK_PAD		DMSG_LNK(0x000, dmsg_hdr)
#define DMSG_LNK_PING		DMSG_LNK(0x001, dmsg_hdr)

/* Commands carrying an extended header */
#define DMSG_LNK_AUTH		DMSG_LNK(0x010, dmsg_lnk_auth)
#define DMSG_LNK_CONN		DMSG_LNK(0x011, dmsg_lnk_conn)
#define DMSG_LNK_SPAN		DMSG_LNK(0x012, dmsg_lnk_span)
#define DMSG_LNK_CIRC		DMSG_LNK(0x013, dmsg_lnk_circ)
#define DMSG_LNK_VOLCONF	DMSG_LNK(0x020, dmsg_lnk_volconf)

/* Highest command number in the LNK protocol; base header only */
#define DMSG_LNK_ERROR		DMSG_LNK(0xFFF, dmsg_hdr)
344 
345 /*
346  * LNK_AUTH - Authentication (often omitted)
347  */
348 struct dmsg_lnk_auth {
349 	dmsg_hdr_t	head;
350 	char		dummy[64];
351 };
352 
353 /*
354  * LNK_CONN - Register connection info for SPAN protocol
355  *	      (transaction, left open, circuit 0 only).
356  *
357  * LNK_CONN identifies a streaming connection into the cluster and serves
358  * to identify, enable, and specify filters for the SPAN protocol.
359  *
360  * peer_mask serves to filter the SPANs we receive by peer_type.  A cluster
361  * controller typically sets this to (uint64_t)-1, indicating that it wants
362  * everything.  A block devfs interface might set it to 1 << DMSG_PEER_BLOCK,
363  * and a hammer2 mount might set it to 1 << DMSG_PEER_HAMMER2.
364  *
365  * mediaid allows multiple (e.g. HAMMER2) connections belonging to the same
366  * media to transmit duplicative LNK_VOLCONF updates without causing
367  * confusion in the cluster controller.
368  *
369  * pfs_clid, pfs_fsid, pfs_type, and label are peer-specific and must be
370  * left empty (zero-fill) if not supported by a particular peer.
371  *
372  * DMSG_PEER_CLUSTER		filter: none
373  * DMSG_PEER_BLOCK		filter: label
374  * DMSG_PEER_HAMMER2		filter: pfs_clid if not empty, and label
375  */
376 struct dmsg_lnk_conn {
377 	dmsg_hdr_t	head;
378 	uuid_t		mediaid;	/* media configuration id */
379 	uuid_t		pfs_clid;	/* rendezvous pfs uuid */
380 	uuid_t		pfs_fsid;	/* unique pfs uuid */
381 	uint64_t	peer_mask;	/* PEER mask for SPAN filtering */
382 	uint8_t		peer_type;	/* see DMSG_PEER_xxx */
383 	uint8_t		pfs_type;	/* pfs type */
384 	uint16_t	proto_version;	/* high level protocol support */
385 	uint32_t	status;		/* status flags */
386 	uint32_t	rnss;		/* node's generated rnss */
387 	uint8_t		reserved02[8];
388 	uint32_t	reserved03[12];
389 	uint64_t	pfs_mask;	/* PFS mask for SPAN filtering */
390 	char		cl_label[128];	/* cluster label (for PEER_BLOCK) */
391 	char		fs_label[128];	/* PFS label (for PEER_HAMMER2) */
392 };
393 
394 typedef struct dmsg_lnk_conn dmsg_lnk_conn_t;
395 
/* PFS types */
#define DMSG_PFSTYPE_NONE		0
#define DMSG_PFSTYPE_ADMIN		1
#define DMSG_PFSTYPE_CLIENT		2
#define DMSG_PFSTYPE_CACHE		3
#define DMSG_PFSTYPE_COPY		4
#define DMSG_PFSTYPE_SLAVE		5
#define DMSG_PFSTYPE_SOFT_SLAVE		6
#define DMSG_PFSTYPE_SOFT_MASTER	7
#define DMSG_PFSTYPE_MASTER		8
#define DMSG_PFSTYPE_SERVER		9
/* one past the last type; valid types are 0-9 */
#define DMSG_PFSTYPE_MAX		(DMSG_PFSTYPE_SERVER + 1)

/* Peer types */
#define DMSG_PEER_NONE			0
#define DMSG_PEER_CLUSTER		1	/* a cluster controller */
#define DMSG_PEER_BLOCK			2	/* block devices */
#define DMSG_PEER_HAMMER2		3	/* hammer2-mounted volumes */
412 
/*
 * Structures embedded in LNK_SPAN
 *
 * dmsg_media_block is a member of the 'media' union in dmsg_lnk_span.
 */
typedef struct dmsg_media_block {
	uint64_t	bytes;		/* media size in bytes */
	uint32_t	blksize;	/* media block size */
} dmsg_media_block_t;
422 
423 /*
424  * LNK_SPAN - Initiate or relay a SPAN
425  *	      (transaction, left open, circuit 0 only)
426  *
427  * This message registers an end-point with the other end of the connection,
428  * telling the other end who we are and what we can provide or intend to
429  * consume.  Multiple registrations can be maintained as open transactions
430  * with each one specifying a unique end-point.
431  *
432  * Registrations are sent from {source}=S {1...n} to {target}=0 and maintained
433  * as open transactions.  Registrations are also received and maintained as
434  * open transactions, creating a matrix of linkid's.
435  *
436  * While these transactions are open additional transactions can be executed
437  * between any two linkid's {source}=S (registrations we sent) to {target}=T
438  * (registrations we received).
439  *
440  * Closure of any registration transaction will automatically abort any open
441  * transactions using the related linkids.  Closure can be initiated
442  * voluntarily from either side with either end issuing a DELETE, or they
443  * can be ABORTed.
444  *
445  * Status updates are performed via the open transaction.
446  *
447  * --
448  *
449  * A registration identifies a node and its various PFS parameters including
450  * the PFS_TYPE.  For example, a diskless HAMMER2 client typically identifies
451  * itself as PFSTYPE_CLIENT.
452  *
453  * Any node may serve as a cluster controller, aggregating and passing
454  * on received registrations, but end-points do not have to implement this
455  * ability.  Most end-points typically implement a single client-style or
456  * server-style PFS_TYPE and rendezvous at a cluster controller.
457  *
458  * The cluster controller does not aggregate/pass-on all received
459  * registrations.  It typically filters what gets passed on based on what it
460  * receives, passing on only the best candidates.
461  *
462  * If a symmetric spanning tree is desired additional candidates whose
463  * {dist, rnss} fields match the last best candidate must also be propagated.
464  * This feature is not currently enabled.
465  *
466  * STATUS UPDATES: Status updates use the same structure but typically
467  *		   only contain incremental changes to e.g. pfs_type, with
468  *		   a text description sent as out-of-band data.
469  */
470 struct dmsg_lnk_span {
471 	dmsg_hdr_t	head;
472 	uuid_t		pfs_clid;	/* rendezvous pfs uuid */
473 	uuid_t		pfs_fsid;	/* unique pfs id (differentiate node) */
474 	uint8_t		pfs_type;	/* PFS type */
475 	uint8_t		peer_type;	/* PEER type */
476 	uint16_t	proto_version;	/* high level protocol support */
477 	uint32_t	status;		/* status flags */
478 	uint8_t		reserved02[8];
479 	uint32_t	dist;		/* span distance */
480 	uint32_t	rnss;		/* random number sub-sort */
481 	union {
482 		uint32_t	reserved03[14];
483 		dmsg_media_block_t block;
484 	} media;
485 
486 	/*
487 	 * NOTE: for PEER_HAMMER2 cl_label is typically empty and fs_label
488 	 *	 is the superroot directory name.
489 	 *
490 	 *	 for PEER_BLOCK cl_label is typically host/device and
491 	 *	 fs_label is typically the serial number string.
492 	 */
493 	char		cl_label[128];	/* cluster label */
494 	char		fs_label[128];	/* PFS label */
495 };
496 
497 typedef struct dmsg_lnk_span dmsg_lnk_span_t;
498 
499 #define DMSG_SPAN_PROTO_1	1
500 
501 /*
502  * LNK_CIRC - Establish a circuit
503  *	      (transaction, left open, circuit 0 only)
504  *
505  * Establish a circuit to the specified target.  The msgid for the open
506  * transaction is used to transit messages in both directions.
507  *
508  * For circuit establishment the receiving entity looks up the outgoing
509  * relayed SPAN on the incoming iocom based on the target field and then
510  * creates peer circuit on the interface the SPAN originally came in on.
511  * Messages received on one side or forwarded to the other side and vise-versa.
512  * Any link state loss causes all related circuits to be lost.
513  */
514 struct dmsg_lnk_circ {
515 	dmsg_hdr_t	head;
516 	uint64_t	reserved01;
517 	uint64_t	target;
518 };
519 
520 typedef struct dmsg_lnk_circ dmsg_lnk_circ_t;
521 
522 /*
523  * LNK_VOLCONF
524  *
525  * All HAMMER2 directories directly under the super-root on your local
526  * media can be mounted separately, even if they share the same physical
527  * device.
528  *
529  * When you do a HAMMER2 mount you are effectively tying into a HAMMER2
530  * cluster via local media.  The local media does not have to participate
531  * in the cluster, other than to provide the dmsg_vol_data[] array and
532  * root inode for the mount.
533  *
534  * This is important: The mount device path you specify serves to bootstrap
535  * your entry into the cluster, but your mount will make active connections
536  * to ALL copy elements in the dmsg_vol_data[] array which match the
537  * PFSID of the directory in the super-root that you specified.  The local
538  * media path does not have to be mentioned in this array but becomes part
539  * of the cluster based on its type and access rights.  ALL ELEMENTS ARE
540  * TREATED ACCORDING TO TYPE NO MATTER WHICH ONE YOU MOUNT FROM.
541  *
542  * The actual cluster may be far larger than the elements you list in the
543  * dmsg_vol_data[] array.  You list only the elements you wish to
544  * directly connect to and you are able to access the rest of the cluster
545  * indirectly through those connections.
546  *
547  * This structure must be exactly 128 bytes long.
548  *
549  * WARNING!  dmsg_vol_data is embedded in the hammer2 media volume header
550  */
551 struct dmsg_vol_data {
552 	uint8_t	copyid;		/* 00	 copyid 0-255 (must match slot) */
553 	uint8_t inprog;		/* 01	 operation in progress, or 0 */
554 	uint8_t chain_to;	/* 02	 operation chaining to, or 0 */
555 	uint8_t chain_from;	/* 03	 operation chaining from, or 0 */
556 	uint16_t flags;		/* 04-05 flags field */
557 	uint8_t error;		/* 06	 last operational error */
558 	uint8_t priority;	/* 07	 priority and round-robin flag */
559 	uint8_t remote_pfs_type;/* 08	 probed direct remote PFS type */
560 	uint8_t reserved08[23];	/* 09-1F */
561 	uuid_t	pfs_clid;	/* 20-2F copy target must match this uuid */
562 	uint8_t label[16];	/* 30-3F import/export label */
563 	uint8_t path[64];	/* 40-7F target specification string or key */
564 };
565 
566 typedef struct dmsg_vol_data dmsg_vol_data_t;
567 
/*
 * NOTE(review): the 16-bit-wide values presumably apply to
 * dmsg_vol_data.flags and the CONN_* values to dmsg_vol_data.priority
 * ("priority and round-robin flag") - confirm against users.
 */
#define DMSG_VOLF_ENABLED	0x0001
#define DMSG_VOLF_INPROG	0x0002

#define DMSG_VOLF_CONN_PRI	0x0F	/* select priority 0-15 (15=best) */
#define DMSG_VOLF_CONN_EF	0x40	/* media errors flagged */
#define DMSG_VOLF_CONN_RR	0x80	/* round-robin at same priority */

/* WARNING! embedded in hammer2 vol */
#define DMSG_COPYID_COUNT	256
575 
576 struct dmsg_lnk_volconf {
577 	dmsg_hdr_t		head;
578 	dmsg_vol_data_t		copy;	/* copy spec */
579 	int32_t			index;
580 	int32_t			unused01;
581 	uuid_t			mediaid;
582 	int64_t			reserved02[32];
583 };
584 
585 typedef struct dmsg_lnk_volconf dmsg_lnk_volconf_t;
586 
587 /*
588  * Debug layer ops operate on any link
589  *
590  * SHELL	- Persist stream, access the debug shell on the target
591  *		  registration.  Multiple shells can be operational.
592  */
593 #define DMSG_DBG_SHELL		DMSG_DBG(0x001, dmsg_dbg_shell)
594 
595 struct dmsg_dbg_shell {
596 	dmsg_hdr_t	head;
597 };
598 typedef struct dmsg_dbg_shell dmsg_dbg_shell_t;
599 
600 /*
601  * Domain layer ops operate on any link, link-0 may be used when the
602  * directly connected target is the desired registration.
603  *
604  * (nothing defined)
605  */
606 
607 /*
608  * Cache layer ops operate on any link, link-0 may be used when the
609  * directly connected target is the desired registration.
610  *
611  * LOCK		- Persist state, blockable, abortable.
612  *
613  *		  Obtain cache state (MODIFIED, EXCLUSIVE, SHARED, or INVAL)
614  *		  in any of four domains (TREE, INUM, ATTR, DIRENT) for a
615  *		  particular key relative to cache state already owned.
616  *
617  *		  TREE - Affects entire sub-tree at the specified element
618  *			 and will cause existing cache state owned by
619  *			 other nodes to be adjusted such that the request
620  *			 can be granted.
621  *
622  *		  INUM - Only affects inode creation/deletion of an existing
623  *			 element or a new element, by inumber and/or name.
624  *			 typically can be held for very long periods of time
625  *			 (think the vnode cache), directly relates to
626  *			 hammer2_chain structures representing inodes.
627  *
628  *		  ATTR - Only affects an inode's attributes, such as
629  *			 ownership, modes, etc.  Used for lookups, chdir,
630  *			 open, etc.  mtime has no effect.
631  *
632  *		  DIRENT - Only affects an inode's attributes plus the
633  *			 attributes or names related to any directory entry
634  *			 directly under this inode (non-recursively).  Can
635  *			 be retained for medium periods of time when doing
636  *			 directory scans.
637  *
638  *		  This function may block and can be aborted.  You may be
639  *		  granted cache state that is more broad than the state you
640  *		  requested (e.g. a different set of domains and/or an element
641  *		  at a higher layer in the tree).  When quorum operations
642  *		  are used you may have to reconcile these grants to the
643  *		  lowest common denominator.
644  *
645  *		  In order to grant your request either you or the target
646  *		  (or both) may have to obtain a quorum agreement.  Deadlock
647  *		  resolution may be required.  When doing it yourself you
648  *		  will typically maintain an active message to each master
649  *		  node in the system.  You can only grant the cache state
650  *		  when a quorum of nodes agree.
651  *
652  *		  The cache state includes transaction id information which
653  *		  can be used to resolve data requests.
654  */
/* NOTE(review): struct dmsg_cac_lock is not defined in the visible part of this header */
#define DMSG_CAC_LOCK	DMSG_CAC(0x001, dmsg_cac_lock)
656 
657 /*
658  * Quorum layer ops operate on any link, link-0 may be used when the
659  * directly connected target is the desired registration.
660  *
661  * COMMIT	- Persist state, blockable, abortable
662  *
663  *		  Issue a COMMIT in two phases.  A quorum must acknowledge
664  *		  the operation to proceed to phase-2.  Message-update to
665  *		  proceed to phase-2.
666  */
/* NOTE(review): struct dmsg_qrm_commit is not defined in the visible part of this header */
#define DMSG_QRM_COMMIT	DMSG_QRM(0x001, dmsg_qrm_commit)
668 
669 /*
670  * DMSG_PROTO_BLK Protocol
671  *
672  * BLK_OPEN	- Open device.  This transaction must be left open for the
673  *		  duration and the returned keyid passed in all associated
674  *		  BLK commands.  Multiple OPENs can be issued within the
675  *		  transaction.
676  *
677  * BLK_CLOSE	- Close device.  This can be used to close one of the opens
678  *		  within a BLK_OPEN transaction.  It may NOT initiate a
679  *		  transaction.  Note that a termination of the transaction
680  *		  (e.g. with LNK_ERROR or BLK_ERROR) closes all active OPENs
681  *		  for that transaction.
682  *
683  * BLK_READ	- Strategy read.  Not typically streaming.
684  *
685  * BLK_WRITE	- Strategy write.  Not typically streaming.
686  *
687  * BLK_FLUSH	- Strategy flush.  Not typically streaming.
688  *
689  * BLK_FREEBLKS	- Strategy freeblks.  Not typically streaming.
690  */
/* BLK_OPEN and BLK_CLOSE are both sized from struct dmsg_blk_open */
#define DMSG_BLK_OPEN		DMSG_BLK(0x001, dmsg_blk_open)
#define DMSG_BLK_CLOSE		DMSG_BLK(0x002, dmsg_blk_open)

/* Strategy commands */
#define DMSG_BLK_READ		DMSG_BLK(0x003, dmsg_blk_read)
#define DMSG_BLK_WRITE		DMSG_BLK(0x004, dmsg_blk_write)
#define DMSG_BLK_FLUSH		DMSG_BLK(0x005, dmsg_blk_flush)
#define DMSG_BLK_FREEBLKS	DMSG_BLK(0x006, dmsg_blk_freeblks)

/* Extended error result */
#define DMSG_BLK_ERROR		DMSG_BLK(0xFFF, dmsg_blk_error)
698 
699 struct dmsg_blk_open {
700 	dmsg_hdr_t	head;
701 	uint32_t	modes;
702 	uint32_t	reserved01;
703 };
704 
/* Bits for dmsg_blk_open.modes */
#define DMSG_BLKOPEN_RD		0x0001	/* bit 0 */
#define DMSG_BLKOPEN_WR		0x0002	/* bit 1 */
707 
708 /*
709  * DMSG_LNK_ERROR is returned for simple results,
710  * DMSG_BLK_ERROR is returned for extended results.
711  */
712 struct dmsg_blk_error {
713 	dmsg_hdr_t	head;
714 	uint64_t	keyid;
715 	uint32_t	resid;
716 	uint32_t	reserved02;
717 	char		buf[64];
718 };
719 
720 struct dmsg_blk_read {
721 	dmsg_hdr_t	head;
722 	uint64_t	keyid;
723 	uint64_t	offset;
724 	uint32_t	bytes;
725 	uint32_t	flags;
726 	uint32_t	reserved01;
727 	uint32_t	reserved02;
728 };
729 
730 struct dmsg_blk_write {
731 	dmsg_hdr_t	head;
732 	uint64_t	keyid;
733 	uint64_t	offset;
734 	uint32_t	bytes;
735 	uint32_t	flags;
736 	uint32_t	reserved01;
737 	uint32_t	reserved02;
738 };
739 
740 struct dmsg_blk_flush {
741 	dmsg_hdr_t	head;
742 	uint64_t	keyid;
743 	uint64_t	offset;
744 	uint32_t	bytes;
745 	uint32_t	flags;
746 	uint32_t	reserved01;
747 	uint32_t	reserved02;
748 };
749 
750 struct dmsg_blk_freeblks {
751 	dmsg_hdr_t	head;
752 	uint64_t	keyid;
753 	uint64_t	offset;
754 	uint32_t	bytes;
755 	uint32_t	flags;
756 	uint32_t	reserved01;
757 	uint32_t	reserved02;
758 };
759 
/* Shorthand type names for the BLK protocol structures */
typedef struct dmsg_blk_open dmsg_blk_open_t;
typedef struct dmsg_blk_error dmsg_blk_error_t;
typedef struct dmsg_blk_read dmsg_blk_read_t;
typedef struct dmsg_blk_write dmsg_blk_write_t;
typedef struct dmsg_blk_flush dmsg_blk_flush_t;
typedef struct dmsg_blk_freeblks dmsg_blk_freeblks_t;
766 
/*
 * NOTE!!!! ALL EXTENDED HEADER STRUCTURES MUST BE 64-BYTE ALIGNED!!!
 *
 * General message errors
 *
 *	0x00 - 0x1F	Local iocom errors
 *	0x20 - 0x2F	Global errors
 *
 * Only codes in the global range are defined here.
 */
#define DMSG_ERR_NOSUPP		0x20
#define DMSG_ERR_LOSTLINK	0x21
#define DMSG_ERR_IO		0x22	/* generic */
#define DMSG_ERR_PARAM		0x23	/* generic */
#define DMSG_ERR_CANTCIRC	0x24	/* (typically means lost span) */
780 
/*
 * Union overlaying the generic header, the LNK extended headers and the
 * BLK extended headers on a raw character buffer of DMSG_HDR_MAX bytes.
 * Every member begins at the same address, so the union is at least
 * DMSG_HDR_MAX bytes in size.
 */
781 union dmsg_any {
782 	char			buf[DMSG_HDR_MAX];
783 	dmsg_hdr_t		head;
784 
785 	dmsg_lnk_conn_t		lnk_conn;
786 	dmsg_lnk_span_t		lnk_span;
787 	dmsg_lnk_circ_t		lnk_circ;
788 	dmsg_lnk_volconf_t	lnk_volconf;
789 
790 	dmsg_blk_open_t		blk_open;
791 	dmsg_blk_error_t	blk_error;
792 	dmsg_blk_read_t		blk_read;
793 	dmsg_blk_write_t	blk_write;
794 	dmsg_blk_flush_t	blk_flush;
795 	dmsg_blk_freeblks_t	blk_freeblks;
796 };
797 
798 typedef union dmsg_any dmsg_any_t;
799 
800 /*
801  * Kernel iocom structures and prototypes for kern/kern_dmsg.c
802  */
/* This conditional section is closed by the first of the two #endif lines at the end of the file. */
803 #if defined(_KERNEL) || defined(_KERNEL_STRUCTURES)
804 
/*
 * Forward declarations so that pointers to these structures can be
 * used below before (or without) their full definitions.
 * struct hammer2_pfsmount is only referenced through a pointer in
 * kdmsg_state's "any" union and is not defined in this section.
 */
805 struct hammer2_pfsmount;
806 struct kdmsg_iocom;
807 struct kdmsg_state;
808 struct kdmsg_msg;
809 
810 /*
811  * msg_ctl flags (atomic)
812  */
/* Bit values for the msg_ctl field of struct kdmsg_iocom. */
813 #define KDMSG_CLUSTERCTL_KILL		0x00000001
814 #define KDMSG_CLUSTERCTL_KILLRX		0x00000002 /* staged helper exit */
815 #define KDMSG_CLUSTERCTL_KILLTX		0x00000004 /* staged helper exit */
816 #define KDMSG_CLUSTERCTL_SLEEPING	0x00000008 /* interlocked w/msglk */
817 
818 /*
819  * When the KDMSG_IOCOMF_AUTOCIRC flag is set the kdmsg code in
820  * the kernel automatically tries to forge a virtual circuit for
821  * any active SPAN state received.
822  *
823  * This is only done when the received SPANs are significantly filtered
824  * by the transmitted LNK_CONN.  That is, it is done only by clients who
825  * connect to specific services over the cluster.
826  */
/*
 * NOTE(review): the flag list later in this header describes
 * KDMSG_IOCOMF_AUTOCIRC as "handle received LNK_CIRC" and
 * KDMSG_IOCOMF_AUTOFORGE as "auto initiate LNK_CIRC"; the paragraph
 * above may therefore be describing AUTOFORGE.  Confirm against
 * kern/kern_dmsg.c.
 *
 * The structure is linked into a red-black tree through rbnode (see the
 * RB_PROTOTYPE for kdmsg_circuit_tree) and into a tail queue through
 * entry.
 */
827 struct kdmsg_circuit {
828 	RB_ENTRY(kdmsg_circuit) rbnode;		/* indexed by msgid */
829 	TAILQ_ENTRY(kdmsg_circuit) entry;	/* written by shim */
830 	struct kdmsg_iocom	*iocom;		/* written by shim */
831 	struct kdmsg_state	*span_state;
832 	struct kdmsg_state	*circ_state;	/* master circuit */
833 	struct kdmsg_state	*rcirc_state;	/* slave circuit */
834 	uint64_t		msgid;
835 	int			weight;
836 	int			recorded;	/* written by shim */
837 	int			lost;		/* written by shim */
838 	int			refs;		/* written by shim */
839 };
840 
841 typedef struct kdmsg_circuit kdmsg_circuit_t;
842 
843 /*
844  * Transactional state structure, representing an open transaction.  The
845  * transaction might represent a cache state (and thus have a chain
846  * association), or a VOP op, LNK_SPAN, or other things.
847  */
/*
 * Linked into a red-black tree through rbnode (see the RB_PROTOTYPE for
 * kdmsg_state_tree).  Note that "circ" names both a direct member and a
 * member of the "any" union; they are distinct fields.
 */
848 struct kdmsg_state {
849 	RB_ENTRY(kdmsg_state) rbnode;		/* indexed by msgid */
850 	struct kdmsg_iocom *iocom;
851 	struct kdmsg_circuit *circ;
852 	uint32_t	icmd;			/* record cmd creating state */
853 	uint32_t	txcmd;			/* mostly for CMDF flags */
854 	uint32_t	rxcmd;			/* mostly for CMDF flags */
855 	uint64_t	msgid;			/* {circuit,msgid} uniq */
856 	int		flags;			/* presumably KDMSG_STATE_*; confirm */
857 	int		error;
858 	void		*chain;			/* (caller's state) */
859 	struct kdmsg_msg *msg;
860 	int (*func)(struct kdmsg_state *, struct kdmsg_msg *);	/* callback taking state and message */
861 	union {					/* one pointer, three typed views */
862 		void *any;
863 		struct hammer2_pfsmount *pmp;
864 		struct kdmsg_circuit *circ;
865 	} any;
866 };
867 
/* Bit flags; presumably stored in kdmsg_state.flags (confirm). */
868 #define KDMSG_STATE_INSERTED	0x0001
869 #define KDMSG_STATE_DYNAMIC	0x0002
870 #define KDMSG_STATE_DELPEND	0x0004		/* transmit delete pending */
871 #define KDMSG_STATE_ABORTING	0x0008		/* avoids recursive abort */
872 
/*
 * In-kernel message container.  qentry links the message into a tail
 * queue (struct kdmsg_iocom declares msgq as a TAILQ_HEAD of kdmsg_msg).
 * The message header itself is embedded as the dmsg_any_t union "any";
 * auxiliary data is referenced through aux_data/aux_size.
 */
873 struct kdmsg_msg {
874 	TAILQ_ENTRY(kdmsg_msg) qentry;		/* serialized queue */
875 	struct kdmsg_iocom *iocom;
876 	struct kdmsg_state *state;
877 	struct kdmsg_circuit *circ;
878 	size_t		hdr_size;
879 	size_t		aux_size;
880 	char		*aux_data;
881 	int		flags;			/* presumably KDMSG_FLAG_*; confirm */
882 	dmsg_any_t	any;
883 };
884 
/* NOTE(review): presumably marks aux_data as allocated for the message; confirm in kern_dmsg.c. */
885 #define KDMSG_FLAG_AUXALLOC	0x0001
886 
/*
 * _t aliases for the kdmsg structures.
 * NOTE(review): struct kdmsg_link is not defined in the visible part of
 * this header, so kdmsg_link_t names an incomplete type here; check
 * whether the typedef is still used.
 */
887 typedef struct kdmsg_link kdmsg_link_t;
888 typedef struct kdmsg_state kdmsg_state_t;
889 typedef struct kdmsg_msg kdmsg_msg_t;
890 
/*
 * Red-black tree of kdmsg_state, linked through the rbnode member and
 * ordered by kdmsg_state_cmp().
 */
891 struct kdmsg_state_tree;
892 int kdmsg_state_cmp(kdmsg_state_t *state1, kdmsg_state_t *state2);
893 RB_HEAD(kdmsg_state_tree, kdmsg_state);
894 RB_PROTOTYPE(kdmsg_state_tree, kdmsg_state, rbnode, kdmsg_state_cmp);
895 
/*
 * Red-black tree of kdmsg_circuit, linked through the rbnode member and
 * ordered by kdmsg_circuit_cmp().
 */
896 struct kdmsg_circuit_tree;
897 int kdmsg_circuit_cmp(kdmsg_circuit_t *circ1, kdmsg_circuit_t *circ2);
898 RB_HEAD(kdmsg_circuit_tree, kdmsg_circuit);
899 RB_PROTOTYPE(kdmsg_circuit_tree, kdmsg_circuit, rbnode, kdmsg_circuit_cmp);
900 
901 /*
902  * Structure embedded in e.g. mount, master control structure for
903  * DMSG stream handling.
904  */
/*
 * Holds two state trees (staterd_tree, statewr_tree), one circuit tree,
 * the transmit queue and the callbacks.  kdmsg_iocom_init() takes
 * handle, flags, mmsg and rcvmsg arguments of matching types,
 * presumably to fill the corresponding fields (confirm in kern_dmsg.c).
 */
905 struct kdmsg_iocom {
906 	struct malloc_type	*mmsg;
907 	struct file		*msg_fp;	/* cluster pipe->userland */
908 	thread_t		msgrd_td;	/* cluster thread */
909 	thread_t		msgwr_td;	/* cluster thread */
910 	int			msg_ctl;	/* wakeup flags */
911 	int			msg_seq;	/* cluster msg sequence id */
912 	uint32_t		flags;		/* presumably KDMSG_IOCOMF_*; confirm */
913 	struct lock		msglk;		/* lockmgr lock */
914 	TAILQ_HEAD(, kdmsg_msg) msgq;		/* transmit queue */
915 	void			*handle;
916 	void			(*auto_callback)(kdmsg_msg_t *);
917 	int			(*rcvmsg)(kdmsg_msg_t *);
918 	void			(*exit_func)(struct kdmsg_iocom *);
919 	struct kdmsg_state	*conn_state;	/* active LNK_CONN state */
920 	struct kdmsg_state	*freerd_state;	/* allocation cache */
921 	struct kdmsg_state	*freewr_state;	/* allocation cache */
922 	struct kdmsg_state_tree staterd_tree;	/* active messages */
923 	struct kdmsg_state_tree statewr_tree;	/* active messages */
924 	struct kdmsg_circuit_tree circ_tree;	/* active circuits */
925 	dmsg_lnk_conn_t		auto_lnk_conn;
926 	dmsg_lnk_span_t		auto_lnk_span;
927 };
928 
929 typedef struct kdmsg_iocom	kdmsg_iocom_t;
930 
/*
 * Bit flags; presumably stored in kdmsg_iocom.flags and passed as the
 * flags argument of kdmsg_iocom_init() (confirm).
 */
931 #define KDMSG_IOCOMF_AUTOCONN	0x0001	/* handle received LNK_CONN */
932 #define KDMSG_IOCOMF_AUTOSPAN	0x0002	/* handle received LNK_SPAN */
933 #define KDMSG_IOCOMF_AUTOCIRC	0x0004	/* handle received LNK_CIRC */
934 #define KDMSG_IOCOMF_AUTOFORGE	0x0008	/* auto initiate LNK_CIRC */
935 #define KDMSG_IOCOMF_EXITNOACC	0x0010	/* cannot accept writes */
936 
/* Mask of the four AUTO* bits above (0x000F); EXITNOACC is not included. */
937 #define KDMSG_IOCOMF_AUTOANY	(KDMSG_IOCOMF_AUTOCONN |	\
938 				 KDMSG_IOCOMF_AUTOSPAN |	\
939 				 KDMSG_IOCOMF_AUTOCIRC |	\
940 				 KDMSG_IOCOMF_AUTOFORGE)
941 
/*
 * CRC helpers over a buffer of the given size, returning a 32-bit value.
 * kdmsg_icrc32c() takes an additional crc argument, presumably a
 * starting/running CRC value (confirm against the implementation).
 */
941 uint32_t kdmsg_icrc32(const void *buf, size_t size);
942 uint32_t kdmsg_icrc32c(const void *buf, size_t size, uint32_t crc);
944 
945 /*
946  * kern_dmsg.c
947  */
948 void kdmsg_iocom_init(kdmsg_iocom_t *iocom, void *handle, u_int32_t flags,
949 			struct malloc_type *mmsg,
950 			int (*rcvmsg)(kdmsg_msg_t *msg));
951 void kdmsg_iocom_reconnect(kdmsg_iocom_t *iocom, struct file *fp,
952 			const char *subsysname);
953 void kdmsg_iocom_autoinitiate(kdmsg_iocom_t *iocom,
954 			void (*conn_callback)(kdmsg_msg_t *msg));
955 void kdmsg_iocom_uninit(kdmsg_iocom_t *iocom);
956 void kdmsg_drain_msgq(kdmsg_iocom_t *iocom);
957 
958 void kdmsg_msg_free(kdmsg_msg_t *msg);
959 kdmsg_msg_t *kdmsg_msg_alloc(kdmsg_iocom_t *iocom, kdmsg_circuit_t *circ,
960 				uint32_t cmd,
961 				int (*func)(kdmsg_state_t *, kdmsg_msg_t *),
962 				void *data);
963 kdmsg_msg_t *kdmsg_msg_alloc_state(kdmsg_state_t *state, uint32_t cmd,
964 				int (*func)(kdmsg_state_t *, kdmsg_msg_t *),
965 				void *data);
966 void kdmsg_msg_write(kdmsg_msg_t *msg);
967 void kdmsg_msg_reply(kdmsg_msg_t *msg, uint32_t error);
968 void kdmsg_msg_result(kdmsg_msg_t *msg, uint32_t error);
969 void kdmsg_state_reply(kdmsg_state_t *state, uint32_t error);
970 void kdmsg_state_result(kdmsg_state_t *state, uint32_t error);
971 
972 void kdmsg_circ_hold(kdmsg_circuit_t *circ);
973 void kdmsg_circ_drop(kdmsg_circuit_t *circ);
974 
975 
976 #endif
977 
978 #endif
979