1 /* 2 * Copyright (c) 2011-2012 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@dragonflybsd.org> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 
33 */ 34 35 #ifndef _SYS_DMSG_H_ 36 #define _SYS_DMSG_H_ 37 38 #ifndef _SYS_MALLOC_H_ 39 #include <sys/malloc.h> 40 #endif 41 #ifndef _SYS_TREE_H_ 42 #include <sys/tree.h> 43 #endif 44 #ifndef _SYS_THREAD_H_ 45 #include <sys/thread.h> 46 #endif 47 #ifndef _SYS_UUID_H_ 48 #include <sys/uuid.h> 49 #endif 50 51 /* 52 * Mesh network protocol structures. 53 * 54 * CONN PROTOCOL 55 * 56 * The mesh is constructed from point-to-point streaming links with varying 57 * levels of interconnectedness, forming a graph. Terminii in the graph 58 * are entities such as a HAMMER2 PFS or a network mount or other types 59 * of nodes. 60 * 61 * Upon connecting and after authentication, a LNK_CONN transaction is opened 62 * on circuit 0 by both ends. This configures and enables the SPAN protocol. 63 * The LNK_CONN transaction remains open for the life of the connection. 64 * 65 * SPAN PROTOCOL 66 * 67 * Once enabled, termini transmits a representitive LNK_SPAN out all 68 * available connections advertising what it is. Nodes maintaing multiple 69 * connections will relay received LNK_SPANs out available connections 70 * with some filtering based on the CONN configuration. A distance metric 71 * and per-node random value (rnss) is aggregated. 72 * 73 * Since LNK_SPANs can rapidly multiply in a complex graph, not all incoming 74 * LNK_SPANs will be relayed. Only the top N over all collect LNK_SPANs for 75 * any given advertisement are relayed. 76 * 77 * It is possible to code the SPANning tree algorithm to guarantee that 78 * symmetrical spans will be generated after stabilization. The RNSS field 79 * is used to help distinguish and reduce paths in complex graphs when 80 * symmetric spans are desired. We always generate RNSS but we currently do 81 * not implement symmetrical SPAN guarantees. 82 * 83 * CIRC PROTOCOL 84 * 85 * We aren't done yet. Before transactions can be relayed, symmetric paths 86 * must be formed via the LNK_CIRC protocol. 
The LNK_CIRC protocol
 * establishes a virtual circuit from any node to any other node, creating
 * a circuit id which is stored in dmsg_hdr.circuit.  Messages received on
 * one side are forwarded to the other.  Forwarded messages bypass normal
 * state tracking.
 *
 * A virtual circuit is forged by working the propagated SPANs backwards.
 * Each node in the graph helps propagate the virtual circuit by attaching
 * the LNK_CIRC transaction it receives to a LNK_CIRC transaction it
 * initiates out the other interface.
 *
 * Since SPANs are link-state transactions any change in related span(s)
 * will also force-terminate VC's using those spans.
 *
 *			MESSAGE TRANSACTIONAL STATES
 *
 * Message state is handled by the CREATE, DELETE, REPLY, and ABORT
 * flags.  Message state is typically recorded at the end points and
 * at each hop until a DELETE is received from both sides.
 *
 * One-way messages such as those used by spanning tree commands are not
 * recorded.  These are sent without the CREATE, DELETE, or ABORT flags set.
 * ABORT is not supported for one-off messages.  The REPLY bit can be used
 * to distinguish between command and status if desired.
 *
 * Persistent-state messages are messages which require a reply to be
 * returned.  These messages can also consist of multiple message elements
 * for the command or reply or both (or neither).  The command message
 * sequence sets CREATE on the first message and DELETE on the last message.
 * A single message command sets both (CREATE|DELETE).  The reply message
 * sequence works the same way but of course also sets the REPLY bit.
 *
 * Persistent-state messages can be aborted by sending a message element
 * with the ABORT flag set.  This flag can be combined with either or both
 * the CREATE and DELETE flags.  When combined with the CREATE flag the
 * command is treated as non-blocking but still executes.
When combined
 * with the DELETE flag no additional message elements are required.
 *
 * ABORT SPECIAL CASE - Mid-stream aborts.  A mid-stream abort can be sent
 * when supported by the sender by sending an ABORT message with neither
 * CREATE nor DELETE set.  This effectively turns the message into a
 * non-blocking message (but depending on what is being represented can also
 * cut short prior data elements in the stream).
 *
 * ABORT SPECIAL CASE - Abort-after-DELETE.  Persistent messages have to be
 * abortable if the stream/pipe/whatever is lost.  In this situation any
 * forwarding relay needs to unconditionally abort commands and replies that
 * are still active.  This is done by sending an ABORT|DELETE even in
 * situations where a DELETE has already been sent in that direction.  This
 * is done, for example, when links are in a half-closed state.  In this
 * situation it is possible for the abort request to race a transition to the
 * fully closed state.  ABORT|DELETE messages which race the fully closed
 * state are expected to be discarded by the other end.
 *
 * --
 *
 * All base and extended message headers are 64-byte aligned, and all
 * transports must support extended message headers up to DMSG_HDR_MAX.
 * Currently we allow extended message headers up to 2048 bytes.  Note
 * that the extended header size is encoded in the 'cmd' field of the header.
 *
 * Any in-band data is padded to a 64-byte alignment and placed directly
 * after the extended header (after the higher-level cmd/rep structure).
 * The actual unaligned size of the in-band data is encoded in the aux_bytes
 * field in this case.  Maximum data sizes are negotiated during registration.
 *
 * Auxiliary data can be in-band or out-of-band.  In-band data sets aux_descr
 * equal to 0.  Any out-of-band data must be negotiated by the SPAN protocol.
 *
 * Auxiliary data, whether in-band or out-of-band, must be at least 64-byte
 * aligned.  The aux_bytes field contains the actual byte-granular length
 * and not the aligned length.  The crc is against the aligned length (so
 * a faster crc algorithm can be used, theoretically).
 *
 * hdr_crc is calculated over the entire, ALIGNED extended header.  For
 * the purposes of calculating the crc, the hdr_crc field is 0.  That is,
 * if calculating the crc in HW a 32-bit '0' must be inserted in place of
 * the hdr_crc field when reading the entire header and compared at the
 * end (but the actual hdr_crc must be left intact in memory).  A simple
 * counter to replace the field going into the CRC generator does the job
 * in HW.  The CRC endian is based on the magic number field and may have
 * to be byte-swapped, too (which is also easy to do in HW).
 *
 * aux_crc is calculated over the entire, ALIGNED auxiliary data.
 *
 *			SHARED MEMORY IMPLEMENTATIONS
 *
 * Shared-memory implementations typically use a pipe to transmit the extended
 * message header and shared memory to store any auxiliary data.  Auxiliary
 * data in one-way (non-transactional) messages is typically required to be
 * inline.  CRCs are still recommended and required at the beginning, but
 * may be negotiated away later.
 */
/*
 * Base message header.  The hex value leading each field comment is the
 * field's byte offset; the last field (hdr_crc, 4 bytes at 0x3C) ends the
 * base header at 0x40 (64) bytes.  The extended message structures in
 * this file embed this header as their first member ('head').
 */
struct dmsg_hdr {
	uint16_t	magic;		/* 00 sanity, synchro, endian */
	uint16_t	reserved02;	/* 02 */
	uint32_t	salt;		/* 04 random salt helps w/crypto */

	uint64_t	msgid;		/* 08 message transaction id */
	uint64_t	circuit;	/* 10 circuit id or 0 */
	uint64_t	reserved18;	/* 18 */

	uint32_t	cmd;		/* 20 flags | cmd | hdr_size / ALIGN */
	uint32_t	aux_crc;	/* 24 auxiliary data crc */
	uint32_t	aux_bytes;	/* 28 auxiliary data length (bytes) */
	uint32_t	error;		/* 2C error code or 0 */
	uint64_t	aux_descr;	/* 30 negotiated OOB data descr */
	uint32_t	reserved38;	/* 38 */
	uint32_t	hdr_crc;	/* 3C (aligned) extended header crc */
};

typedef struct dmsg_hdr dmsg_hdr_t;

/*
 * DMSG_HDR_MAGIC_REV is DMSG_HDR_MAGIC with its two bytes swapped
 * (presumably what a receiver observes when the sender uses the opposite
 * byte order -- confirm against the stream decoder).
 *
 * DMSG_HDR_CRCOFF is the offset of 'salt' and DMSG_HDR_CRCBYTES is the
 * number of base-header bytes from 'salt' to the end of the structure,
 * i.e. everything except 'magic' and 'reserved02'.
 */
#define DMSG_HDR_MAGIC		0x4832
#define DMSG_HDR_MAGIC_REV	0x3248
#define DMSG_HDR_CRCOFF		offsetof(dmsg_hdr_t, salt)
#define DMSG_HDR_CRCBYTES	(sizeof(dmsg_hdr_t) - DMSG_HDR_CRCOFF)

/*
 * Administrative protocol limits.
 *
 * DMSG_BUF_MASK is DMSG_BUF_SIZE - 1, so it only works as an index mask
 * while DMSG_BUF_SIZE (DMSG_HDR_MAX * 4) remains a power of two.
 */
#define DMSG_HDR_MAX		2048	/* <= 65535 */
#define DMSG_AUX_MAX		65536	/* <= 1MB */
#define DMSG_BUF_SIZE		(DMSG_HDR_MAX * 4)
#define DMSG_BUF_MASK		(DMSG_BUF_SIZE - 1)

/*
 * The message (cmd) field also encodes various flags and the total size
 * of the message header.  This allows the protocol processors to validate
 * persistency and structural settings for every command simply by
 * switch()ing on the (cmd) field.
 */
#define DMSGF_CREATE		0x80000000U	/* msg start */
#define DMSGF_DELETE		0x40000000U	/* msg end */
#define DMSGF_REPLY		0x20000000U	/* reply path */
#define DMSGF_ABORT		0x10000000U	/* abort req */
#define DMSGF_AUXOOB		0x08000000U	/* aux-data is OOB */
#define DMSGF_FLAG2		0x04000000U
#define DMSGF_FLAG1		0x02000000U
#define DMSGF_FLAG0		0x01000000U

/*
 * Field masks for the 32-bit cmd word.  The size field holds the header
 * size in units of DMSG_ALIGN (64) bytes, as produced by
 * DMSG_HDR_ENCODE().
 */
#define DMSGF_FLAGS		0xFF000000U	/* all flags */
#define DMSGF_PROTOS		0x00F00000U	/* all protos */
#define DMSGF_CMDS		0x000FFF00U	/* all cmds */
#define DMSGF_SIZE		0x000000FFU	/* N*DMSG_ALIGN (N*64) */

/* Command, size and protocol fields plus the REPLY flag. */
#define DMSGF_CMDSWMASK		(DMSGF_CMDS |	\
				 DMSGF_SIZE |	\
				 DMSGF_PROTOS |	\
				 DMSGF_REPLY)

/* Command, size and protocol fields only (no flags). */
#define DMSGF_BASECMDMASK	(DMSGF_CMDS |	\
				 DMSGF_SIZE |	\
				 DMSGF_PROTOS)

/* DMSGF_CMDSWMASK plus the CREATE and DELETE transaction flags. */
#define DMSGF_TRANSMASK		(DMSGF_CMDS |	\
				 DMSGF_SIZE |	\
				 DMSGF_PROTOS |	\
				 DMSGF_REPLY |	\
				 DMSGF_CREATE |	\
				 DMSGF_DELETE)

/* Protocol identifiers, positioned in the DMSGF_PROTOS field. */
#define DMSG_PROTO_LNK		0x00000000U
#define DMSG_PROTO_DBG		0x00100000U
#define DMSG_PROTO_DOM		0x00200000U
#define DMSG_PROTO_CAC		0x00300000U
#define DMSG_PROTO_QRM		0x00400000U
#define DMSG_PROTO_BLK		0x00500000U
#define DMSG_PROTO_VOP		0x00600000U

/*
 * Message command constructors, sans flags
 *
 * DMSG_DOALIGN() rounds a byte count up to the next DMSG_ALIGN multiple.
 * DMSG_HDR_ENCODE() yields sizeof(struct elm) rounded up and expressed in
 * DMSG_ALIGN units, which is the value stored in the DMSGF_SIZE field.
 * Each DMSG_<PROTO>(cmd, elm) constructor ORs the protocol id, the
 * command number shifted into the DMSGF_CMDS field, and the encoded size.
 */
#define DMSG_ALIGN		64
#define DMSG_ALIGNMASK		(DMSG_ALIGN - 1)
#define DMSG_DOALIGN(bytes)	(((bytes) + DMSG_ALIGNMASK) &		\
				 ~DMSG_ALIGNMASK)

#define DMSG_HDR_ENCODE(elm)	(((uint32_t)sizeof(struct elm) +	\
				  DMSG_ALIGNMASK) /			\
				 DMSG_ALIGN)

#define DMSG_LNK(cmd, elm)	(DMSG_PROTO_LNK |			\
				 ((cmd) << 8) |				\
				 DMSG_HDR_ENCODE(elm))

#define DMSG_DBG(cmd, elm)	(DMSG_PROTO_DBG |			\
				 ((cmd) << 8) |				\
				 DMSG_HDR_ENCODE(elm))

#define DMSG_DOM(cmd, elm)	(DMSG_PROTO_DOM |			\
				 ((cmd) << 8) |				\
				 DMSG_HDR_ENCODE(elm))

#define DMSG_CAC(cmd, elm)	(DMSG_PROTO_CAC |			\
				 ((cmd) << 8) |				\
				 DMSG_HDR_ENCODE(elm))

#define DMSG_QRM(cmd, elm)	(DMSG_PROTO_QRM |			\
				 ((cmd) << 8) |				\
				 DMSG_HDR_ENCODE(elm))

#define DMSG_BLK(cmd, elm)	(DMSG_PROTO_BLK |			\
				 ((cmd) << 8) |				\
				 DMSG_HDR_ENCODE(elm))

#define DMSG_VOP(cmd, elm)	(DMSG_PROTO_VOP |			\
				 ((cmd) << 8) |				\
				 DMSG_HDR_ENCODE(elm))

/*
 * Link layer ops basically talk to just the other side of a direct
 * connection.
 *
 * LNK_PAD	- One-way message on circuit 0, ignored by target.  Used to
 *		  pad message buffers on shared-memory transports.  Not
 *		  typically used with TCP.
 *
 * LNK_PING	- One-way message on circuit-0, keep-alive, run by both sides
 *		  typically 1/sec on idle link, link is lost after 10 seconds
 *		  of inactivity.
 *
 * LNK_AUTH	- Authenticate the connection, negotiate administrative
 *		  rights & encryption, protocol class, etc.  Only PAD and
 *		  AUTH messages (not even PING) are accepted until
 *		  authentication is complete.  This message also identifies
 *		  the host.
 *
 * LNK_CONN	- Enable the SPAN protocol on circuit-0, possibly also
 *		  installing a PFS filter (by cluster id, unique id, and/or
 *		  wildcarded name).
 *
 * LNK_SPAN	- A SPAN transaction on circuit-0 enables messages to be
 *		  relayed to/from a particular cluster node.  SPANs are
 *		  received, sorted, aggregated, filtered, and retransmitted
 *		  back out across all applicable connections.
 *
 *		  The leaf protocol also uses this to make a PFS available
 *		  to the cluster (e.g. on-mount).
 *
 * LNK_CIRC	- a CIRC transaction establishes a circuit from source to
 *		  target by creating pairs of open transactions across each
 *		  hop.
 *
 * LNK_VOLCONF	- Volume header configuration change.  All hammer2
 *		  connections (hammer2 connect ...) stored in the volume
 *		  header are spammed on circuit 0 to the hammer2
 *		  service daemon, and any live configuration change
 *		  thereafter.
 */
/*
 * Link-layer command words.  The second DMSG_LNK() argument is the struct
 * tag whose size is encoded into the command word; PAD, PING and ERROR
 * use the bare base header.
 */
#define DMSG_LNK_PAD		DMSG_LNK(0x000, dmsg_hdr)
#define DMSG_LNK_PING		DMSG_LNK(0x001, dmsg_hdr)
#define DMSG_LNK_AUTH		DMSG_LNK(0x010, dmsg_lnk_auth)
#define DMSG_LNK_CONN		DMSG_LNK(0x011, dmsg_lnk_conn)
#define DMSG_LNK_SPAN		DMSG_LNK(0x012, dmsg_lnk_span)
#define DMSG_LNK_CIRC		DMSG_LNK(0x013, dmsg_lnk_circ)
#define DMSG_LNK_VOLCONF	DMSG_LNK(0x020, dmsg_lnk_volconf)
#define DMSG_LNK_ERROR		DMSG_LNK(0xFFF, dmsg_hdr)

/*
 * LNK_AUTH - Authentication (often omitted)
 *
 * Base header followed by a 64-byte placeholder payload ('dummy').
 */
struct dmsg_lnk_auth {
	dmsg_hdr_t	head;
	char		dummy[64];
};

/*
 * LNK_CONN - Register connection info for SPAN protocol
 *	      (transaction, left open, circuit 0 only).
 *
 * LNK_CONN identifies a streaming connection into the cluster and serves
 * to identify, enable, and specify filters for the SPAN protocol.
 *
 * peer_mask serves to filter the SPANs we receive by peer_type.  A cluster
 * controller typically sets this to (uint64_t)-1, indicating that it wants
 * everything.  A block devfs interface might set it to 1 << DMSG_PEER_BLOCK,
 * and a hammer2 mount might set it to 1 << DMSG_PEER_HAMMER2.
 *
 * mediaid allows multiple (e.g. HAMMER2) connections belonging to the same
 * media to transmit duplicative LNK_VOLCONF updates without causing
 * confusion in the cluster controller.
 *
 * pfs_clid, pfs_fsid, pfs_type, and label are peer-specific and must be
 * left empty (zero-fill) if not supported by a particular peer.
 *
 *	DMSG_PEER_CLUSTER	filter: none
 *	DMSG_PEER_BLOCK		filter: label
 *	DMSG_PEER_HAMMER2	filter: pfs_clid if not empty, and label
 */
struct dmsg_lnk_conn {
	dmsg_hdr_t	head;
	uuid_t		mediaid;	/* media configuration id */
	uuid_t		pfs_clid;	/* rendezvous pfs uuid */
	uuid_t		pfs_fsid;	/* unique pfs uuid */
	uint64_t	peer_mask;	/* PEER mask for SPAN filtering */
	uint8_t		peer_type;	/* see DMSG_PEER_xxx */
	uint8_t		pfs_type;	/* pfs type */
	uint16_t	proto_version;	/* high level protocol support */
	uint32_t	status;		/* status flags */
	uint32_t	rnss;		/* node's generated rnss */
	uint8_t		reserved02[8];
	uint32_t	reserved03[12];
	uint64_t	pfs_mask;	/* PFS mask for SPAN filtering */
	char		cl_label[128];	/* cluster label (for PEER_BLOCK) */
	char		fs_label[128];	/* PFS label (for PEER_HAMMER2) */
};

typedef struct dmsg_lnk_conn dmsg_lnk_conn_t;

/*
 * PFS type codes.  DMSG_PFSTYPE_MAX is one past the highest defined
 * code (valid codes are 0-9).
 */
#define DMSG_PFSTYPE_NONE	0
#define DMSG_PFSTYPE_ADMIN	1
#define DMSG_PFSTYPE_CLIENT	2
#define DMSG_PFSTYPE_CACHE	3
#define DMSG_PFSTYPE_COPY	4
#define DMSG_PFSTYPE_SLAVE	5
#define DMSG_PFSTYPE_SOFT_SLAVE	6
#define DMSG_PFSTYPE_SOFT_MASTER 7
#define DMSG_PFSTYPE_MASTER	8
#define DMSG_PFSTYPE_SERVER	9
#define DMSG_PFSTYPE_MAX	10	/* 0-9 */

/*
 * Peer type codes.  The LNK_CONN peer_mask documentation uses these as
 * bit numbers (1 << DMSG_PEER_xxx).
 */
#define DMSG_PEER_NONE		0
#define DMSG_PEER_CLUSTER	1	/* a cluster controller */
#define DMSG_PEER_BLOCK		2	/* block devices */
#define DMSG_PEER_HAMMER2	3	/* hammer2-mounted volumes */

/*
 * Structures embedded in LNK_SPAN
 */
struct dmsg_media_block {
	uint64_t	bytes;		/* media size in bytes */
	uint32_t	blksize;	/* media block size */
};

typedef struct dmsg_media_block dmsg_media_block_t;

/*
 * LNK_SPAN - Initiate or relay a SPAN
 *	      (transaction, left open, circuit 0 only)
 *
 * This message registers an end-point with the other end of the connection,
 * telling the other end who we are and what we
can provide or intend to
 * consume.  Multiple registrations can be maintained as open transactions
 * with each one specifying a unique end-point.
 *
 * Registrations are sent from {source}=S {1...n} to {target}=0 and maintained
 * as open transactions.  Registrations are also received and maintained as
 * open transactions, creating a matrix of linkid's.
 *
 * While these transactions are open additional transactions can be executed
 * between any two linkid's {source}=S (registrations we sent) to {target}=T
 * (registrations we received).
 *
 * Closure of any registration transaction will automatically abort any open
 * transactions using the related linkids.  Closure can be initiated
 * voluntarily from either side with either end issuing a DELETE, or they
 * can be ABORTed.
 *
 * Status updates are performed via the open transaction.
 *
 * --
 *
 * A registration identifies a node and its various PFS parameters including
 * the PFS_TYPE.  For example, a diskless HAMMER2 client typically identifies
 * itself as PFSTYPE_CLIENT.
 *
 * Any node may serve as a cluster controller, aggregating and passing
 * on received registrations, but end-points do not have to implement this
 * ability.  Most end-points typically implement a single client-style or
 * server-style PFS_TYPE and rendezvous at a cluster controller.
 *
 * The cluster controller does not aggregate/pass-on all received
 * registrations.  It typically filters what gets passed on based on what it
 * receives, passing on only the best candidates.
 *
 * If a symmetric spanning tree is desired additional candidates whose
 * {dist, rnss} fields match the last best candidate must also be propagated.
 * This feature is not currently enabled.
 *
 * STATUS UPDATES: Status updates use the same structure but typically
 *		   only contain incremental changes to e.g.
pfs_type, with
 *		   a text description sent as out-of-band data.
 */
struct dmsg_lnk_span {
	dmsg_hdr_t	head;
	uuid_t		pfs_clid;	/* rendezvous pfs uuid */
	uuid_t		pfs_fsid;	/* unique pfs id (differentiate node) */
	uint8_t		pfs_type;	/* PFS type */
	uint8_t		peer_type;	/* PEER type */
	uint16_t	proto_version;	/* high level protocol support */
	uint32_t	status;		/* status flags */
	uint8_t		reserved02[8];
	uint32_t	dist;		/* span distance */
	uint32_t	rnss;		/* random number sub-sort */
	/*
	 * Peer-specific media description.  reserved03[] fixes the size of
	 * the union; 'block' overlays it with the block-device parameters.
	 */
	union {
		uint32_t	reserved03[14];
		dmsg_media_block_t block;
	} media;

	/*
	 * NOTE: for PEER_HAMMER2 cl_label is typically empty and fs_label
	 *	 is the superroot directory name.
	 *
	 *	 for PEER_BLOCK cl_label is typically host/device and
	 *	 fs_label is typically the serial number string.
	 */
	char		cl_label[128];	/* cluster label */
	char		fs_label[128];	/* PFS label */
};

typedef struct dmsg_lnk_span dmsg_lnk_span_t;

#define DMSG_SPAN_PROTO_1	1

/*
 * LNK_CIRC - Establish a circuit
 *	      (transaction, left open, circuit 0 only)
 *
 * Establish a circuit to the specified target.  The msgid for the open
 * transaction is used to transit messages in both directions.
 *
 * For circuit establishment the receiving entity looks up the outgoing
 * relayed SPAN on the incoming iocom based on the target field and then
 * creates a peer circuit on the interface the SPAN originally came in on.
 * Messages received on one side are forwarded to the other side and
 * vice-versa.
 * Any link state loss causes all related circuits to be lost.
 */
struct dmsg_lnk_circ {
	dmsg_hdr_t	head;
	uint64_t	reserved01;
	uint64_t	target;		/* circuit target (see comment above) */
};

typedef struct dmsg_lnk_circ dmsg_lnk_circ_t;

/*
 * LNK_VOLCONF
 *
 * All HAMMER2 directories directly under the super-root on your local
 * media can be mounted separately, even if they share the same physical
 * device.
 *
 * When you do a HAMMER2 mount you are effectively tying into a HAMMER2
 * cluster via local media.  The local media does not have to participate
 * in the cluster, other than to provide the dmsg_vol_data[] array and
 * root inode for the mount.
 *
 * This is important: The mount device path you specify serves to bootstrap
 * your entry into the cluster, but your mount will make active connections
 * to ALL copy elements in the dmsg_vol_data[] array which match the
 * PFSID of the directory in the super-root that you specified.  The local
 * media path does not have to be mentioned in this array but becomes part
 * of the cluster based on its type and access rights.  ALL ELEMENTS ARE
 * TREATED ACCORDING TO TYPE NO MATTER WHICH ONE YOU MOUNT FROM.
 *
 * The actual cluster may be far larger than the elements you list in the
 * dmsg_vol_data[] array.  You list only the elements you wish to
 * directly connect to and you are able to access the rest of the cluster
 * indirectly through those connections.
 *
 * This structure must be exactly 128 bytes long.
 *
 * WARNING!
dmsg_vol_data is embedded in the hammer2 media volume header
 */
struct dmsg_vol_data {
	uint8_t	copyid;		/* 00	 copyid 0-255 (must match slot) */
	uint8_t inprog;		/* 01	 operation in progress, or 0 */
	uint8_t chain_to;	/* 02	 operation chaining to, or 0 */
	uint8_t chain_from;	/* 03	 operation chaining from, or 0 */
	uint16_t flags;		/* 04-05 flags field */
	uint8_t error;		/* 06	 last operational error */
	uint8_t priority;	/* 07	 priority and round-robin flag */
	uint8_t remote_pfs_type;/* 08	 probed direct remote PFS type */
	uint8_t reserved08[23];	/* 09-1F */
	uuid_t	pfs_clid;	/* 20-2F copy target must match this uuid */
	uint8_t label[16];	/* 30-3F import/export label */
	uint8_t path[64];	/* 40-7F target specification string or key */
};

typedef struct dmsg_vol_data dmsg_vol_data_t;

/*
 * NOTE(review): ENABLED/INPROG look like bits for the 16-bit 'flags'
 * field, while the CONN_* values fit the 8-bit 'priority' field
 * ("priority and round-robin flag") -- confirm against the users.
 */
#define DMSG_VOLF_ENABLED	0x0001
#define DMSG_VOLF_INPROG	0x0002
#define DMSG_VOLF_CONN_RR	0x80	/* round-robin at same priority */
#define DMSG_VOLF_CONN_EF	0x40	/* media errors flagged */
#define DMSG_VOLF_CONN_PRI	0x0F	/* select priority 0-15 (15=best) */

#define DMSG_COPYID_COUNT	256	/* WARNING! embedded in hammer2 vol */

/*
 * LNK_VOLCONF message: base header followed by one dmsg_vol_data copy
 * specification, an index, and the media id.
 */
struct dmsg_lnk_volconf {
	dmsg_hdr_t		head;
	dmsg_vol_data_t		copy;	/* copy spec */
	int32_t			index;
	int32_t			unused01;
	uuid_t			mediaid;
	int64_t			reserved02[32];
};

typedef struct dmsg_lnk_volconf dmsg_lnk_volconf_t;

/*
 * Debug layer ops operate on any link
 *
 * SHELL	- Persist stream, access the debug shell on the target
 *		  registration.  Multiple shells can be operational.
 */
#define DMSG_DBG_SHELL		DMSG_DBG(0x001, dmsg_dbg_shell)

/* DBG_SHELL carries only the base header in its extended structure. */
struct dmsg_dbg_shell {
	dmsg_hdr_t	head;
};
typedef struct dmsg_dbg_shell dmsg_dbg_shell_t;

/*
 * Domain layer ops operate on any link, link-0 may be used when the
 * directly connected target is the desired registration.
 *
 * (nothing defined)
 */

/*
 * Cache layer ops operate on any link, link-0 may be used when the
 * directly connected target is the desired registration.
 *
 * LOCK		- Persist state, blockable, abortable.
 *
 *		  Obtain cache state (MODIFIED, EXCLUSIVE, SHARED, or INVAL)
 *		  in any of four domains (TREE, INUM, ATTR, DIRENT) for a
 *		  particular key relative to cache state already owned.
 *
 *		  TREE - Affects the entire sub-tree at the specified element
 *			 and will cause existing cache state owned by
 *			 other nodes to be adjusted such that the request
 *			 can be granted.
 *
 *		  INUM - Only affects inode creation/deletion of an existing
 *			 element or a new element, by inumber and/or name.
 *			 Typically can be held for very long periods of time
 *			 (think the vnode cache), directly relates to
 *			 hammer2_chain structures representing inodes.
 *
 *		  ATTR - Only affects an inode's attributes, such as
 *			 ownership, modes, etc.  Used for lookups, chdir,
 *			 open, etc.  mtime has no effect.
 *
 *		  DIRENT - Only affects an inode's attributes plus the
 *			 attributes or names related to any directory entry
 *			 directly under this inode (non-recursively).  Can
 *			 be retained for medium periods of time when doing
 *			 directory scans.
 *
 *		  This function may block and can be aborted.  You may be
 *		  granted cache state that is more broad than the state you
 *		  requested (e.g. a different set of domains and/or an element
 *		  at a higher layer in the tree).  When quorum operations
 *		  are used you may have to reconcile these grants to the
 *		  lowest common denominator.
 *
 *		  In order to grant your request either you or the target
 *		  (or both) may have to obtain a quorum agreement.  Deadlock
 *		  resolution may be required.  When doing it yourself you
 *		  will typically maintain an active message to each master
 *		  node in the system.
You can only grant the cache state
 *		  when a quorum of nodes agree.
 *
 *		  The cache state includes transaction id information which
 *		  can be used to resolve data requests.
 */
/*
 * NOTE(review): struct dmsg_cac_lock is not defined in the part of this
 * header visible here; DMSG_CAC_LOCK expands to sizeof(struct
 * dmsg_cac_lock), so the structure must be defined before the macro is
 * used.
 */
#define DMSG_CAC_LOCK		DMSG_CAC(0x001, dmsg_cac_lock)

/*
 * Quorum layer ops operate on any link, link-0 may be used when the
 * directly connected target is the desired registration.
 *
 * COMMIT	- Persist state, blockable, abortable
 *
 *		  Issue a COMMIT in two phases.  A quorum must acknowledge
 *		  the operation to proceed to phase-2.  Message-update to
 *		  proceed to phase-2.
 */
/*
 * NOTE(review): struct dmsg_qrm_commit is likewise not defined in the
 * part of this header visible here.
 */
#define DMSG_QRM_COMMIT		DMSG_QRM(0x001, dmsg_qrm_commit)

/*
 * DMSG_PROTO_BLK Protocol
 *
 * BLK_OPEN	- Open device.  This transaction must be left open for the
 *		  duration and the returned keyid passed in all associated
 *		  BLK commands.  Multiple OPENs can be issued within the
 *		  transaction.
 *
 * BLK_CLOSE	- Close device.  This can be used to close one of the opens
 *		  within a BLK_OPEN transaction.  It may NOT initiate a
 *		  transaction.  Note that a termination of the transaction
 *		  (e.g. with LNK_ERROR or BLK_ERROR) closes all active OPENs
 *		  for that transaction.
 *
 * BLK_READ	- Strategy read.  Not typically streaming.
 *
 * BLK_WRITE	- Strategy write.  Not typically streaming.
 *
 * BLK_FLUSH	- Strategy flush.  Not typically streaming.
 *
 * BLK_FREEBLKS	- Strategy freeblks.  Not typically streaming.
 */
/*
 * Block protocol command words.  BLK_CLOSE is encoded with the size of
 * struct dmsg_blk_open; there is no separate close structure.
 */
#define DMSG_BLK_OPEN		DMSG_BLK(0x001, dmsg_blk_open)
#define DMSG_BLK_CLOSE		DMSG_BLK(0x002, dmsg_blk_open)
#define DMSG_BLK_READ		DMSG_BLK(0x003, dmsg_blk_read)
#define DMSG_BLK_WRITE		DMSG_BLK(0x004, dmsg_blk_write)
#define DMSG_BLK_FLUSH		DMSG_BLK(0x005, dmsg_blk_flush)
#define DMSG_BLK_FREEBLKS	DMSG_BLK(0x006, dmsg_blk_freeblks)
#define DMSG_BLK_ERROR		DMSG_BLK(0xFFF, dmsg_blk_error)

/* BLK_OPEN / BLK_CLOSE payload; 'modes' takes DMSG_BLKOPEN_xx bits. */
struct dmsg_blk_open {
	dmsg_hdr_t	head;
	uint32_t	modes;
	uint32_t	reserved01;
};

#define DMSG_BLKOPEN_RD		0x0001
#define DMSG_BLKOPEN_WR		0x0002

/*
 * DMSG_LNK_ERROR is returned for simple results,
 * DMSG_BLK_ERROR is returned for extended results.
 */
struct dmsg_blk_error {
	dmsg_hdr_t	head;
	uint64_t	keyid;
	uint32_t	resid;
	uint32_t	reserved02;
	char		buf[64];
};

/*
 * The read, write, flush and freeblks requests share one field layout:
 * the keyid from BLK_OPEN, a byte offset, a byte count and flags.
 */
struct dmsg_blk_read {
	dmsg_hdr_t	head;
	uint64_t	keyid;
	uint64_t	offset;
	uint32_t	bytes;
	uint32_t	flags;
	uint32_t	reserved01;
	uint32_t	reserved02;
};

struct dmsg_blk_write {
	dmsg_hdr_t	head;
	uint64_t	keyid;
	uint64_t	offset;
	uint32_t	bytes;
	uint32_t	flags;
	uint32_t	reserved01;
	uint32_t	reserved02;
};

struct dmsg_blk_flush {
	dmsg_hdr_t	head;
	uint64_t	keyid;
	uint64_t	offset;
	uint32_t	bytes;
	uint32_t	flags;
	uint32_t	reserved01;
	uint32_t	reserved02;
};

struct dmsg_blk_freeblks {
	dmsg_hdr_t	head;
	uint64_t	keyid;
	uint64_t	offset;
	uint32_t	bytes;
	uint32_t	flags;
	uint32_t	reserved01;
	uint32_t	reserved02;
};

typedef struct dmsg_blk_open		dmsg_blk_open_t;
typedef struct dmsg_blk_read		dmsg_blk_read_t;
typedef struct dmsg_blk_write		dmsg_blk_write_t;
typedef struct dmsg_blk_flush		dmsg_blk_flush_t;
typedef struct dmsg_blk_freeblks	dmsg_blk_freeblks_t;
typedef struct dmsg_blk_error		dmsg_blk_error_t;

/*
 * NOTE!!!!
ALL EXTENDED HEADER STRUCTURES MUST BE 64-BYTE ALIGNED!!!
 *
 * NOTE(review): nothing in this header checks that at compile time;
 * DMSG_HDR_ENCODE() simply rounds sizeof() up to the next DMSG_ALIGN
 * multiple.
 *
 * General message errors
 *
 *	0x00 - 0x1F	Local iocomm errors
 *	0x20 - 0x2F	Global errors
 */
#define DMSG_ERR_NOSUPP		0x20
#define DMSG_ERR_LOSTLINK	0x21
#define DMSG_ERR_IO		0x22	/* generic */
#define DMSG_ERR_PARAM		0x23	/* generic */
#define DMSG_ERR_CANTCIRC	0x24	/* (typically means lost span) */

/*
 * Overlay of the message structures defined above.  'buf' makes the
 * union at least DMSG_HDR_MAX bytes regardless of which view is used.
 */
union dmsg_any {
	char			buf[DMSG_HDR_MAX];
	dmsg_hdr_t		head;

	dmsg_lnk_conn_t		lnk_conn;
	dmsg_lnk_span_t		lnk_span;
	dmsg_lnk_circ_t		lnk_circ;
	dmsg_lnk_volconf_t	lnk_volconf;

	dmsg_blk_open_t		blk_open;
	dmsg_blk_error_t	blk_error;
	dmsg_blk_read_t		blk_read;
	dmsg_blk_write_t	blk_write;
	dmsg_blk_flush_t	blk_flush;
	dmsg_blk_freeblks_t	blk_freeblks;
};

typedef union dmsg_any dmsg_any_t;

/*
 * Kernel iocom structures and prototypes for kern/kern_dmsg.c
 */
#if defined(_KERNEL) || defined(_KERNEL_STRUCTURES)

/* Forward declarations; only pointers to these are needed below. */
struct hammer2_pfsmount;
struct kdmsg_iocom;
struct kdmsg_state;
struct kdmsg_msg;

/*
 * msg_ctl flags (atomic)
 */
#define KDMSG_CLUSTERCTL_KILL		0x00000001
#define KDMSG_CLUSTERCTL_KILLRX		0x00000002 /* staged helper exit */
#define KDMSG_CLUSTERCTL_KILLTX		0x00000004 /* staged helper exit */
#define KDMSG_CLUSTERCTL_SLEEPING	0x00000008 /* interlocked w/msglk */

/*
 * When the KDMSG_IOCOMF_AUTOCIRC flag is set the kdmsg code in
 * the kernel automatically tries to forge a virtual circuit for
 * any active SPAN state received.
 *
 * NOTE(review): the flag list in this header describes AUTOCIRC as
 * "handle received LNK_CIRC" and AUTOFORGE as "auto initiate LNK_CIRC";
 * confirm which flag actually controls the behavior described here.
 *
 * This is only done when the received SPANs are significantly filtered
 * by the transmitted LNK_CONN.  That is, it is done only by clients who
 * connect to specific services over the cluster.
826 */ 827 struct kdmsg_circuit { 828 RB_ENTRY(kdmsg_circuit) rbnode; /* indexed by msgid */ 829 TAILQ_ENTRY(kdmsg_circuit) entry; /* written by shim */ 830 struct kdmsg_iocom *iocom; /* written by shim */ 831 struct kdmsg_state *span_state; 832 struct kdmsg_state *circ_state; /* master circuit */ 833 struct kdmsg_state *rcirc_state; /* slave circuit */ 834 uint64_t msgid; 835 int weight; 836 int recorded; /* written by shim */ 837 int lost; /* written by shim */ 838 int refs; /* written by shim */ 839 }; 840 841 typedef struct kdmsg_circuit kdmsg_circuit_t; 842 843 /* 844 * Transactional state structure, representing an open transaction. The 845 * transaction might represent a cache state (and thus have a chain 846 * association), or a VOP op, LNK_SPAN, or other things. 847 */ 848 struct kdmsg_state { 849 RB_ENTRY(kdmsg_state) rbnode; /* indexed by msgid */ 850 struct kdmsg_iocom *iocom; 851 struct kdmsg_circuit *circ; 852 uint32_t icmd; /* record cmd creating state */ 853 uint32_t txcmd; /* mostly for CMDF flags */ 854 uint32_t rxcmd; /* mostly for CMDF flags */ 855 uint64_t msgid; /* {circuit,msgid} uniq */ 856 int flags; 857 int error; 858 void *chain; /* (caller's state) */ 859 struct kdmsg_msg *msg; 860 int (*func)(struct kdmsg_state *, struct kdmsg_msg *); 861 union { 862 void *any; 863 struct hammer2_pfsmount *pmp; 864 struct kdmsg_circuit *circ; 865 } any; 866 }; 867 868 #define KDMSG_STATE_INSERTED 0x0001 869 #define KDMSG_STATE_DYNAMIC 0x0002 870 #define KDMSG_STATE_DELPEND 0x0004 /* transmit delete pending */ 871 #define KDMSG_STATE_ABORTING 0x0008 /* avoids recursive abort */ 872 873 struct kdmsg_msg { 874 TAILQ_ENTRY(kdmsg_msg) qentry; /* serialized queue */ 875 struct kdmsg_iocom *iocom; 876 struct kdmsg_state *state; 877 struct kdmsg_circuit *circ; 878 size_t hdr_size; 879 size_t aux_size; 880 char *aux_data; 881 int flags; 882 dmsg_any_t any; 883 }; 884 885 #define KDMSG_FLAG_AUXALLOC 0x0001 886 887 typedef struct kdmsg_link kdmsg_link_t; 888 
typedef struct kdmsg_state kdmsg_state_t; 889 typedef struct kdmsg_msg kdmsg_msg_t; 890 891 struct kdmsg_state_tree; 892 int kdmsg_state_cmp(kdmsg_state_t *state1, kdmsg_state_t *state2); 893 RB_HEAD(kdmsg_state_tree, kdmsg_state); 894 RB_PROTOTYPE(kdmsg_state_tree, kdmsg_state, rbnode, kdmsg_state_cmp); 895 896 struct kdmsg_circuit_tree; 897 int kdmsg_circuit_cmp(kdmsg_circuit_t *circ1, kdmsg_circuit_t *circ2); 898 RB_HEAD(kdmsg_circuit_tree, kdmsg_circuit); 899 RB_PROTOTYPE(kdmsg_circuit_tree, kdmsg_circuit, rbnode, kdmsg_circuit_cmp); 900 901 /* 902 * Structure embedded in e.g. mount, master control structure for 903 * DMSG stream handling. 904 */ 905 struct kdmsg_iocom { 906 struct malloc_type *mmsg; 907 struct file *msg_fp; /* cluster pipe->userland */ 908 thread_t msgrd_td; /* cluster thread */ 909 thread_t msgwr_td; /* cluster thread */ 910 int msg_ctl; /* wakeup flags */ 911 int msg_seq; /* cluster msg sequence id */ 912 uint32_t flags; 913 struct lock msglk; /* lockmgr lock */ 914 TAILQ_HEAD(, kdmsg_msg) msgq; /* transmit queue */ 915 void *handle; 916 void (*auto_callback)(kdmsg_msg_t *); 917 int (*rcvmsg)(kdmsg_msg_t *); 918 void (*exit_func)(struct kdmsg_iocom *); 919 struct kdmsg_state *conn_state; /* active LNK_CONN state */ 920 struct kdmsg_state *freerd_state; /* allocation cache */ 921 struct kdmsg_state *freewr_state; /* allocation cache */ 922 struct kdmsg_state_tree staterd_tree; /* active messages */ 923 struct kdmsg_state_tree statewr_tree; /* active messages */ 924 struct kdmsg_circuit_tree circ_tree; /* active circuits */ 925 dmsg_lnk_conn_t auto_lnk_conn; 926 dmsg_lnk_span_t auto_lnk_span; 927 }; 928 929 typedef struct kdmsg_iocom kdmsg_iocom_t; 930 931 #define KDMSG_IOCOMF_AUTOCONN 0x0001 /* handle received LNK_CONN */ 932 #define KDMSG_IOCOMF_AUTOSPAN 0x0002 /* handle received LNK_SPAN */ 933 #define KDMSG_IOCOMF_AUTOCIRC 0x0004 /* handle received LNK_CIRC */ 934 #define KDMSG_IOCOMF_AUTOFORGE 0x0008 /* auto initiate LNK_CIRC */ 935 
#define KDMSG_IOCOMF_EXITNOACC 0x0010 /* cannot accept writes */ 936 937 #define KDMSG_IOCOMF_AUTOANY (KDMSG_IOCOMF_AUTOCONN | \ 938 KDMSG_IOCOMF_AUTOSPAN | \ 939 KDMSG_IOCOMF_AUTOCIRC | \ 940 KDMSG_IOCOMF_AUTOFORGE) 941 942 uint32_t kdmsg_icrc32(const void *buf, size_t size); 943 uint32_t kdmsg_icrc32c(const void *buf, size_t size, uint32_t crc); 944 945 /* 946 * kern_dmsg.c 947 */ 948 void kdmsg_iocom_init(kdmsg_iocom_t *iocom, void *handle, u_int32_t flags, 949 struct malloc_type *mmsg, 950 int (*rcvmsg)(kdmsg_msg_t *msg)); 951 void kdmsg_iocom_reconnect(kdmsg_iocom_t *iocom, struct file *fp, 952 const char *subsysname); 953 void kdmsg_iocom_autoinitiate(kdmsg_iocom_t *iocom, 954 void (*conn_callback)(kdmsg_msg_t *msg)); 955 void kdmsg_iocom_uninit(kdmsg_iocom_t *iocom); 956 void kdmsg_drain_msgq(kdmsg_iocom_t *iocom); 957 958 void kdmsg_msg_free(kdmsg_msg_t *msg); 959 kdmsg_msg_t *kdmsg_msg_alloc(kdmsg_iocom_t *iocom, kdmsg_circuit_t *circ, 960 uint32_t cmd, 961 int (*func)(kdmsg_state_t *, kdmsg_msg_t *), 962 void *data); 963 kdmsg_msg_t *kdmsg_msg_alloc_state(kdmsg_state_t *state, uint32_t cmd, 964 int (*func)(kdmsg_state_t *, kdmsg_msg_t *), 965 void *data); 966 void kdmsg_msg_write(kdmsg_msg_t *msg); 967 void kdmsg_msg_reply(kdmsg_msg_t *msg, uint32_t error); 968 void kdmsg_msg_result(kdmsg_msg_t *msg, uint32_t error); 969 void kdmsg_state_reply(kdmsg_state_t *state, uint32_t error); 970 void kdmsg_state_result(kdmsg_state_t *state, uint32_t error); 971 972 void kdmsg_circ_hold(kdmsg_circuit_t *circ); 973 void kdmsg_circ_drop(kdmsg_circuit_t *circ); 974 975 976 #endif 977 978 #endif 979