1 /* 2 * Copyright (c) 2011-2012 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@dragonflybsd.org> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 
 */

#ifndef _SYS_DMSG_H_
#define _SYS_DMSG_H_

#ifndef _SYS_MALLOC_H_
#include <sys/malloc.h>
#endif
#ifndef _SYS_TREE_H_
#include <sys/tree.h>
#endif
#ifndef _SYS_THREAD_H_
#include <sys/thread.h>
#endif
#ifndef _SYS_UUID_H_
#include <sys/uuid.h>
#endif

/*
 * Mesh network protocol structures.
 *
 *				CONN PROTOCOL
 *
 * The mesh is constructed from point-to-point streaming links with varying
 * levels of interconnectedness, forming a graph.  Termini in the graph
 * are entities such as a HAMMER2 PFS or a network mount or other types
 * of nodes.
 *
 * Upon connecting and after authentication, a LNK_CONN transaction is opened
 * on circuit 0 by both ends.  This configures and enables the SPAN protocol.
 * The LNK_CONN transaction remains open for the life of the connection.
 *
 *				SPAN PROTOCOL
 *
 * Once enabled, each terminus transmits a representative LNK_SPAN out all
 * available connections advertising what it is.  Nodes maintaining multiple
 * connections will relay received LNK_SPANs out available connections
 * with some filtering based on the CONN configuration.  A distance metric
 * and per-node random value (rnss) is aggregated.
 *
 * Since LNK_SPANs can rapidly multiply in a complex graph, not all incoming
 * LNK_SPANs will be relayed.  Only the top N over all collected LNK_SPANs
 * for any given advertisement are relayed.
 *
 * It is possible to code the SPANning tree algorithm to guarantee that
 * symmetrical spans will be generated after stabilization.  The RNSS field
 * is used to help distinguish and reduce paths in complex graphs when
 * symmetric spans are desired.  We always generate RNSS but we currently do
 * not implement symmetrical SPAN guarantees.
 *
 *				CIRC PROTOCOL
 *
 * We aren't done yet.  Before transactions can be relayed, symmetric paths
 * must be formed via the LNK_CIRC protocol.
 * The LNK_CIRC protocol
 * establishes a virtual circuit from any node to any other node, creating
 * a circuit id which is stored in dmsg_hdr.circuit.  Messages received on
 * one side are forwarded to the other.  Forwarded messages bypass normal
 * state tracking.
 *
 * A virtual circuit is forged by working the propagated SPANs backwards.
 * Each node in the graph helps propagate the virtual circuit by attaching
 * the LNK_CIRC transaction it receives to a LNK_CIRC transaction it
 * initiates out the other interface.
 *
 * Since SPANs are link-state transactions any change in related span(s)
 * will also force-terminate VC's using those spans.
 *
 *			MESSAGE TRANSACTIONAL STATES
 *
 * Message state is handled by the CREATE, DELETE, REPLY, and ABORT
 * flags.  Message state is typically recorded at the end points and
 * at each hop until a DELETE is received from both sides.
 *
 * One-way messages such as those used by spanning tree commands are not
 * recorded.  These are sent without the CREATE, DELETE, or ABORT flags set.
 * ABORT is not supported for one-off messages.  The REPLY bit can be used
 * to distinguish between command and status if desired.
 *
 * Persistent-state messages are messages which require a reply to be
 * returned.  These messages can also consist of multiple message elements
 * for the command or reply or both (or neither).  The command message
 * sequence sets CREATE on the first message and DELETE on the last message.
 * A single message command sets both (CREATE|DELETE).  The reply message
 * sequence works the same way but of course also sets the REPLY bit.
 *
 * Persistent-state messages can be aborted by sending a message element
 * with the ABORT flag set.  This flag can be combined with either or both
 * the CREATE and DELETE flags.  When combined with the CREATE flag the
 * command is treated as non-blocking but still executes.
 * When combined
 * with the DELETE flag no additional message elements are required.
 *
 * ABORT SPECIAL CASE - Mid-stream aborts.  A mid-stream abort can be sent
 * when supported by the sender by sending an ABORT message with neither
 * CREATE nor DELETE set.  This effectively turns the message into a
 * non-blocking message (but depending on what is being represented can also
 * cut short prior data elements in the stream).
 *
 * ABORT SPECIAL CASE - Abort-after-DELETE.  Persistent messages have to be
 * abortable if the stream/pipe/whatever is lost.  In this situation any
 * forwarding relay needs to unconditionally abort commands and replies that
 * are still active.  This is done by sending an ABORT|DELETE even in
 * situations where a DELETE has already been sent in that direction.  This
 * is done, for example, when links are in a half-closed state.  In this
 * situation it is possible for the abort request to race a transition to the
 * fully closed state.  ABORT|DELETE messages which race the fully closed
 * state are expected to be discarded by the other end.
 *
 * --
 *
 * All base and extended message headers are 64-byte aligned, and all
 * transports must support extended message headers up to DMSG_HDR_MAX.
 * Currently we allow extended message headers up to 2048 bytes.  Note
 * that the extended header size is encoded in the 'cmd' field of the header.
 *
 * Any in-band data is padded to a 64-byte alignment and placed directly
 * after the extended header (after the higher-level cmd/rep structure).
 * The actual unaligned size of the in-band data is encoded in the aux_bytes
 * field in this case.  Maximum data sizes are negotiated during registration.
 *
 * Auxiliary data can be in-band or out-of-band.  In-band data sets aux_descr
 * equal to 0.  Any out-of-band data must be negotiated by the SPAN protocol.
 *
 * Auxiliary data, whether in-band or out-of-band, must be at least 64-byte
 * aligned.  The aux_bytes field contains the actual byte-granular length
 * and not the aligned length.  The crc is against the aligned length (so
 * a faster crc algorithm can be used, theoretically).
 *
 * hdr_crc is calculated over the entire, ALIGNED extended header.  For
 * the purposes of calculating the crc, the hdr_crc field is 0.  That is,
 * if calculating the crc in HW a 32-bit '0' must be inserted in place of
 * the hdr_crc field when reading the entire header and compared at the
 * end (but the actual hdr_crc must be left intact in memory).  A simple
 * counter to replace the field going into the CRC generator does the job
 * in HW.  The CRC endian is based on the magic number field and may have
 * to be byte-swapped, too (which is also easy to do in HW).
 *
 * aux_crc is calculated over the entire, ALIGNED auxiliary data.
 *
 *			SHARED MEMORY IMPLEMENTATIONS
 *
 * Shared-memory implementations typically use a pipe to transmit the extended
 * message header and shared memory to store any auxiliary data.  Auxiliary
 * data in one-way (non-transactional) messages is typically required to be
 * inline.  CRCs are still recommended and required at the beginning, but
 * may be negotiated away later.
178 */ 179 struct dmsg_hdr { 180 uint16_t magic; /* 00 sanity, synchro, endian */ 181 uint16_t reserved02; /* 02 */ 182 uint32_t salt; /* 04 random salt helps w/crypto */ 183 184 uint64_t msgid; /* 08 message transaction id */ 185 uint64_t circuit; /* 10 circuit id or 0 */ 186 uint64_t reserved18; /* 18 */ 187 188 uint32_t cmd; /* 20 flags | cmd | hdr_size / ALIGN */ 189 uint32_t aux_crc; /* 24 auxillary data crc */ 190 uint32_t aux_bytes; /* 28 auxillary data length (bytes) */ 191 uint32_t error; /* 2C error code or 0 */ 192 uint64_t aux_descr; /* 30 negotiated OOB data descr */ 193 uint32_t reserved38; /* 38 */ 194 uint32_t hdr_crc; /* 3C (aligned) extended header crc */ 195 }; 196 197 typedef struct dmsg_hdr dmsg_hdr_t; 198 199 #define DMSG_HDR_MAGIC 0x4832 200 #define DMSG_HDR_MAGIC_REV 0x3248 201 #define DMSG_HDR_CRCOFF offsetof(dmsg_hdr_t, salt) 202 #define DMSG_HDR_CRCBYTES (sizeof(dmsg_hdr_t) - DMSG_HDR_CRCOFF) 203 204 /* 205 * Administrative protocol limits. 206 */ 207 #define DMSG_HDR_MAX 2048 /* <= 65535 */ 208 #define DMSG_AUX_MAX 65536 /* <= 1MB */ 209 #define DMSG_BUF_SIZE (DMSG_HDR_MAX * 4) 210 #define DMSG_BUF_MASK (DMSG_BUF_SIZE - 1) 211 212 /* 213 * The message (cmd) field also encodes various flags and the total size 214 * of the message header. This allows the protocol processors to validate 215 * persistency and structural settings for every command simply by 216 * switch()ing on the (cmd) field. 
217 */ 218 #define DMSGF_CREATE 0x80000000U /* msg start */ 219 #define DMSGF_DELETE 0x40000000U /* msg end */ 220 #define DMSGF_REPLY 0x20000000U /* reply path */ 221 #define DMSGF_ABORT 0x10000000U /* abort req */ 222 #define DMSGF_AUXOOB 0x08000000U /* aux-data is OOB */ 223 #define DMSGF_FLAG2 0x04000000U 224 #define DMSGF_FLAG1 0x02000000U 225 #define DMSGF_FLAG0 0x01000000U 226 227 #define DMSGF_FLAGS 0xFF000000U /* all flags */ 228 #define DMSGF_PROTOS 0x00F00000U /* all protos */ 229 #define DMSGF_CMDS 0x000FFF00U /* all cmds */ 230 #define DMSGF_SIZE 0x000000FFU /* N*32 */ 231 232 #define DMSGF_CMDSWMASK (DMSGF_CMDS | \ 233 DMSGF_SIZE | \ 234 DMSGF_PROTOS | \ 235 DMSGF_REPLY) 236 237 #define DMSGF_BASECMDMASK (DMSGF_CMDS | \ 238 DMSGF_SIZE | \ 239 DMSGF_PROTOS) 240 241 #define DMSGF_TRANSMASK (DMSGF_CMDS | \ 242 DMSGF_SIZE | \ 243 DMSGF_PROTOS | \ 244 DMSGF_REPLY | \ 245 DMSGF_CREATE | \ 246 DMSGF_DELETE) 247 248 #define DMSG_PROTO_LNK 0x00000000U 249 #define DMSG_PROTO_DBG 0x00100000U 250 #define DMSG_PROTO_DOM 0x00200000U 251 #define DMSG_PROTO_CAC 0x00300000U 252 #define DMSG_PROTO_QRM 0x00400000U 253 #define DMSG_PROTO_BLK 0x00500000U 254 #define DMSG_PROTO_VOP 0x00600000U 255 256 /* 257 * Message command constructors, sans flags 258 */ 259 #define DMSG_ALIGN 64 260 #define DMSG_ALIGNMASK (DMSG_ALIGN - 1) 261 #define DMSG_DOALIGN(bytes) (((bytes) + DMSG_ALIGNMASK) & \ 262 ~DMSG_ALIGNMASK) 263 264 #define DMSG_HDR_ENCODE(elm) (((uint32_t)sizeof(struct elm) + \ 265 DMSG_ALIGNMASK) / \ 266 DMSG_ALIGN) 267 268 #define DMSG_LNK(cmd, elm) (DMSG_PROTO_LNK | \ 269 ((cmd) << 8) | \ 270 DMSG_HDR_ENCODE(elm)) 271 272 #define DMSG_DBG(cmd, elm) (DMSG_PROTO_DBG | \ 273 ((cmd) << 8) | \ 274 DMSG_HDR_ENCODE(elm)) 275 276 #define DMSG_DOM(cmd, elm) (DMSG_PROTO_DOM | \ 277 ((cmd) << 8) | \ 278 DMSG_HDR_ENCODE(elm)) 279 280 #define DMSG_CAC(cmd, elm) (DMSG_PROTO_CAC | \ 281 ((cmd) << 8) | \ 282 DMSG_HDR_ENCODE(elm)) 283 284 #define DMSG_QRM(cmd, elm) (DMSG_PROTO_QRM | \ 
285 ((cmd) << 8) | \ 286 DMSG_HDR_ENCODE(elm)) 287 288 #define DMSG_BLK(cmd, elm) (DMSG_PROTO_BLK | \ 289 ((cmd) << 8) | \ 290 DMSG_HDR_ENCODE(elm)) 291 292 #define DMSG_VOP(cmd, elm) (DMSG_PROTO_VOP | \ 293 ((cmd) << 8) | \ 294 DMSG_HDR_ENCODE(elm)) 295 296 /* 297 * Link layer ops basically talk to just the other side of a direct 298 * connection. 299 * 300 * LNK_PAD - One-way message on circuit 0, ignored by target. Used to 301 * pad message buffers on shared-memory transports. Not 302 * typically used with TCP. 303 * 304 * LNK_PING - One-way message on circuit-0, keep-alive, run by both sides 305 * typically 1/sec on idle link, link is lost after 10 seconds 306 * of inactivity. 307 * 308 * LNK_AUTH - Authenticate the connection, negotiate administrative 309 * rights & encryption, protocol class, etc. Only PAD and 310 * AUTH messages (not even PING) are accepted until 311 * authentication is complete. This message also identifies 312 * the host. 313 * 314 * LNK_CONN - Enable the SPAN protocol on circuit-0, possibly also 315 * installing a PFS filter (by cluster id, unique id, and/or 316 * wildcarded name). 317 * 318 * LNK_SPAN - A SPAN transaction on circuit-0 enables messages to be 319 * relayed to/from a particular cluster node. SPANs are 320 * received, sorted, aggregated, filtered, and retransmitted 321 * back out across all applicable connections. 322 * 323 * The leaf protocol also uses this to make a PFS available 324 * to the cluster (e.g. on-mount). 325 * 326 * LNK_CIRC - a CIRC transaction establishes a circuit from source to 327 * target by creating pairs of open transactions across each 328 * hop. 329 * 330 * LNK_VOLCONF - Volume header configuration change. All hammer2 331 * connections (hammer2 connect ...) stored in the volume 332 * header are spammed on circuit 0 to the hammer2 333 * service daemon, and any live configuration change 334 * thereafter. 
335 */ 336 #define DMSG_LNK_PAD DMSG_LNK(0x000, dmsg_hdr) 337 #define DMSG_LNK_PING DMSG_LNK(0x001, dmsg_hdr) 338 #define DMSG_LNK_AUTH DMSG_LNK(0x010, dmsg_lnk_auth) 339 #define DMSG_LNK_CONN DMSG_LNK(0x011, dmsg_lnk_conn) 340 #define DMSG_LNK_SPAN DMSG_LNK(0x012, dmsg_lnk_span) 341 #define DMSG_LNK_CIRC DMSG_LNK(0x013, dmsg_lnk_circ) 342 #define DMSG_LNK_VOLCONF DMSG_LNK(0x020, dmsg_lnk_volconf) 343 #define DMSG_LNK_ERROR DMSG_LNK(0xFFF, dmsg_hdr) 344 345 /* 346 * LNK_AUTH - Authentication (often omitted) 347 */ 348 struct dmsg_lnk_auth { 349 dmsg_hdr_t head; 350 char dummy[64]; 351 }; 352 353 /* 354 * LNK_CONN - Register connection info for SPAN protocol 355 * (transaction, left open, circuit 0 only). 356 * 357 * LNK_CONN identifies a streaming connection into the cluster and serves 358 * to identify, enable, and specify filters for the SPAN protocol. 359 * 360 * peer_mask serves to filter the SPANs we receive by peer_type. A cluster 361 * controller typically sets this to (uint64_t)-1, indicating that it wants 362 * everything. A block devfs interface might set it to 1 << DMSG_PEER_DISK, 363 * and a hammer2 mount might set it to 1 << DMSG_PEER_HAMMER2. 364 * 365 * mediaid allows multiple (e.g. HAMMER2) connections belonging to the same 366 * media to transmit duplicative LNK_VOLCONF updates without causing 367 * confusion in the cluster controller. 368 * 369 * pfs_clid, pfs_fsid, pfs_type, and label are peer-specific and must be 370 * left empty (zero-fill) if not supported by a particular peer. 
371 * 372 * DMSG_PEER_CLUSTER filter: none 373 * DMSG_PEER_BLOCK filter: label 374 * DMSG_PEER_HAMMER2 filter: pfs_clid if not empty, and label 375 */ 376 struct dmsg_lnk_conn { 377 dmsg_hdr_t head; 378 uuid_t mediaid; /* media configuration id */ 379 uuid_t pfs_clid; /* rendezvous pfs uuid */ 380 uuid_t pfs_fsid; /* unique pfs uuid */ 381 uint64_t peer_mask; /* PEER mask for SPAN filtering */ 382 uint8_t peer_type; /* see DMSG_PEER_xxx */ 383 uint8_t pfs_type; /* pfs type */ 384 uint16_t proto_version; /* high level protocol support */ 385 uint32_t status; /* status flags */ 386 uint32_t rnss; /* node's generated rnss */ 387 uint8_t reserved02[8]; 388 uint32_t reserved03[12]; 389 uint64_t pfs_mask; /* PFS mask for SPAN filtering */ 390 char cl_label[128]; /* cluster label (for PEER_BLOCK) */ 391 char fs_label[128]; /* PFS label (for PEER_HAMMER2) */ 392 }; 393 394 typedef struct dmsg_lnk_conn dmsg_lnk_conn_t; 395 396 #define DMSG_PFSTYPE_NONE 0 397 #define DMSG_PFSTYPE_ADMIN 1 398 #define DMSG_PFSTYPE_CLIENT 2 399 #define DMSG_PFSTYPE_CACHE 3 400 #define DMSG_PFSTYPE_COPY 4 401 #define DMSG_PFSTYPE_SLAVE 5 402 #define DMSG_PFSTYPE_SOFT_SLAVE 6 403 #define DMSG_PFSTYPE_SOFT_MASTER 7 404 #define DMSG_PFSTYPE_MASTER 8 405 #define DMSG_PFSTYPE_SERVER 9 406 #define DMSG_PFSTYPE_SNAPSHOT 10 407 #define DMSG_PFSTYPE_MAX 11 /* 0-10 */ 408 409 #define DMSG_PEER_NONE 0 410 #define DMSG_PEER_CLUSTER 1 /* a cluster controller */ 411 #define DMSG_PEER_BLOCK 2 /* block devices */ 412 #define DMSG_PEER_HAMMER2 3 /* hammer2-mounted volumes */ 413 414 /* 415 * Structures embedded in LNK_SPAN 416 */ 417 struct dmsg_media_block { 418 uint64_t bytes; /* media size in bytes */ 419 uint32_t blksize; /* media block size */ 420 }; 421 422 typedef struct dmsg_media_block dmsg_media_block_t; 423 424 /* 425 * LNK_SPAN - Initiate or relay a SPAN 426 * (transaction, left open, circuit 0 only) 427 * 428 * This message registers an end-point with the other end of the connection, 429 * telling 
 * the other end who we are and what we can provide or intend to
 * consume.  Multiple registrations can be maintained as open transactions
 * with each one specifying a unique end-point.
 *
 * Registrations are sent from {source}=S {1...n} to {target}=0 and maintained
 * as open transactions.  Registrations are also received and maintained as
 * open transactions, creating a matrix of linkid's.
 *
 * While these transactions are open additional transactions can be executed
 * between any two linkid's {source}=S (registrations we sent) to {target}=T
 * (registrations we received).
 *
 * Closure of any registration transaction will automatically abort any open
 * transactions using the related linkids.  Closure can be initiated
 * voluntarily from either side with either end issuing a DELETE, or they
 * can be ABORTed.
 *
 * Status updates are performed via the open transaction.
 *
 * --
 *
 * A registration identifies a node and its various PFS parameters including
 * the PFS_TYPE.  For example, a diskless HAMMER2 client typically identifies
 * itself as PFSTYPE_CLIENT.
 *
 * Any node may serve as a cluster controller, aggregating and passing
 * on received registrations, but end-points do not have to implement this
 * ability.  Most end-points typically implement a single client-style or
 * server-style PFS_TYPE and rendezvous at a cluster controller.
 *
 * The cluster controller does not aggregate/pass-on all received
 * registrations.  It typically filters what gets passed on based on what it
 * receives, passing on only the best candidates.
 *
 * If a symmetric spanning tree is desired additional candidates whose
 * {dist, rnss} fields match the last best candidate must also be propagated.
 * This feature is not currently enabled.
 *
 * STATUS UPDATES: Status updates use the same structure but typically
 *		   only contain incremental changes to e.g.
pfs_type, with 469 * a text description sent as out-of-band data. 470 */ 471 struct dmsg_lnk_span { 472 dmsg_hdr_t head; 473 uuid_t pfs_clid; /* rendezvous pfs uuid */ 474 uuid_t pfs_fsid; /* unique pfs id (differentiate node) */ 475 uint8_t pfs_type; /* PFS type */ 476 uint8_t peer_type; /* PEER type */ 477 uint16_t proto_version; /* high level protocol support */ 478 uint32_t status; /* status flags */ 479 uint8_t reserved02[8]; 480 uint32_t dist; /* span distance */ 481 uint32_t rnss; /* random number sub-sort */ 482 union { 483 uint32_t reserved03[14]; 484 dmsg_media_block_t block; 485 } media; 486 487 /* 488 * NOTE: for PEER_HAMMER2 cl_label is typically empty and fs_label 489 * is the superroot directory name. 490 * 491 * for PEER_BLOCK cl_label is typically host/device and 492 * fs_label is typically the serial number string. 493 */ 494 char cl_label[128]; /* cluster label */ 495 char fs_label[128]; /* PFS label */ 496 }; 497 498 typedef struct dmsg_lnk_span dmsg_lnk_span_t; 499 500 #define DMSG_SPAN_PROTO_1 1 501 502 /* 503 * LNK_CIRC - Establish a circuit 504 * (transaction, left open, circuit 0 only) 505 * 506 * Establish a circuit to the specified target. The msgid for the open 507 * transaction is used to transit messages in both directions. 508 * 509 * For circuit establishment the receiving entity looks up the outgoing 510 * relayed SPAN on the incoming iocom based on the target field and then 511 * creates peer circuit on the interface the SPAN originally came in on. 512 * Messages received on one side or forwarded to the other side and vise-versa. 513 * Any link state loss causes all related circuits to be lost. 
514 */ 515 struct dmsg_lnk_circ { 516 dmsg_hdr_t head; 517 uint64_t reserved01; 518 uint64_t target; 519 }; 520 521 typedef struct dmsg_lnk_circ dmsg_lnk_circ_t; 522 523 /* 524 * LNK_VOLCONF 525 * 526 * All HAMMER2 directories directly under the super-root on your local 527 * media can be mounted separately, even if they share the same physical 528 * device. 529 * 530 * When you do a HAMMER2 mount you are effectively tying into a HAMMER2 531 * cluster via local media. The local media does not have to participate 532 * in the cluster, other than to provide the dmsg_vol_data[] array and 533 * root inode for the mount. 534 * 535 * This is important: The mount device path you specify serves to bootstrap 536 * your entry into the cluster, but your mount will make active connections 537 * to ALL copy elements in the dmsg_vol_data[] array which match the 538 * PFSID of the directory in the super-root that you specified. The local 539 * media path does not have to be mentioned in this array but becomes part 540 * of the cluster based on its type and access rights. ALL ELEMENTS ARE 541 * TREATED ACCORDING TO TYPE NO MATTER WHICH ONE YOU MOUNT FROM. 542 * 543 * The actual cluster may be far larger than the elements you list in the 544 * dmsg_vol_data[] array. You list only the elements you wish to 545 * directly connect to and you are able to access the rest of the cluster 546 * indirectly through those connections. 547 * 548 * This structure must be exactly 128 bytes long. 549 * 550 * WARNING! 
dmsg_vol_data is embedded in the hammer2 media volume header 551 */ 552 struct dmsg_vol_data { 553 uint8_t copyid; /* 00 copyid 0-255 (must match slot) */ 554 uint8_t inprog; /* 01 operation in progress, or 0 */ 555 uint8_t chain_to; /* 02 operation chaining to, or 0 */ 556 uint8_t chain_from; /* 03 operation chaining from, or 0 */ 557 uint16_t flags; /* 04-05 flags field */ 558 uint8_t error; /* 06 last operational error */ 559 uint8_t priority; /* 07 priority and round-robin flag */ 560 uint8_t remote_pfs_type;/* 08 probed direct remote PFS type */ 561 uint8_t reserved08[23]; /* 09-1F */ 562 uuid_t pfs_clid; /* 20-2F copy target must match this uuid */ 563 uint8_t label[16]; /* 30-3F import/export label */ 564 uint8_t path[64]; /* 40-7F target specification string or key */ 565 }; 566 567 typedef struct dmsg_vol_data dmsg_vol_data_t; 568 569 #define DMSG_VOLF_ENABLED 0x0001 570 #define DMSG_VOLF_INPROG 0x0002 571 #define DMSG_VOLF_CONN_RR 0x80 /* round-robin at same priority */ 572 #define DMSG_VOLF_CONN_EF 0x40 /* media errors flagged */ 573 #define DMSG_VOLF_CONN_PRI 0x0F /* select priority 0-15 (15=best) */ 574 575 #define DMSG_COPYID_COUNT 256 /* WARNING! embedded in hammer2 vol */ 576 577 struct dmsg_lnk_volconf { 578 dmsg_hdr_t head; 579 dmsg_vol_data_t copy; /* copy spec */ 580 int32_t index; 581 int32_t unused01; 582 uuid_t mediaid; 583 int64_t reserved02[32]; 584 }; 585 586 typedef struct dmsg_lnk_volconf dmsg_lnk_volconf_t; 587 588 /* 589 * Debug layer ops operate on any link 590 * 591 * SHELL - Persist stream, access the debug shell on the target 592 * registration. Multiple shells can be operational. 593 */ 594 #define DMSG_DBG_SHELL DMSG_DBG(0x001, dmsg_dbg_shell) 595 596 struct dmsg_dbg_shell { 597 dmsg_hdr_t head; 598 }; 599 typedef struct dmsg_dbg_shell dmsg_dbg_shell_t; 600 601 /* 602 * Domain layer ops operate on any link, link-0 may be used when the 603 * directory connected target is the desired registration. 
 *
 * (nothing defined)
 */

/*
 * Cache layer ops operate on any link, link-0 may be used when the
 * directly connected target is the desired registration.
 *
 * LOCK		- Persist state, blockable, abortable.
 *
 *		  Obtain cache state (MODIFIED, EXCLUSIVE, SHARED, or INVAL)
 *		  in any of four domains (TREE, INUM, ATTR, DIRENT) for a
 *		  particular key relative to cache state already owned.
 *
 *		  TREE - Affects entire sub-tree at the specified element
 *			 and will cause existing cache state owned by
 *			 other nodes to be adjusted such that the request
 *			 can be granted.
 *
 *		  INUM - Only affects inode creation/deletion of an existing
 *			 element or a new element, by inumber and/or name.
 *			 Typically can be held for very long periods of time
 *			 (think the vnode cache), directly relates to
 *			 hammer2_chain structures representing inodes.
 *
 *		  ATTR - Only affects an inode's attributes, such as
 *			 ownership, modes, etc.  Used for lookups, chdir,
 *			 open, etc.  mtime has no effect.
 *
 *		  DIRENT - Only affects an inode's attributes plus the
 *			 attributes or names related to any directory entry
 *			 directly under this inode (non-recursively).  Can
 *			 be retained for medium periods of time when doing
 *			 directory scans.
 *
 *		  This function may block and can be aborted.  You may be
 *		  granted cache state that is more broad than the state you
 *		  requested (e.g. a different set of domains and/or an element
 *		  at a higher layer in the tree).  When quorum operations
 *		  are used you may have to reconcile these grants to the
 *		  lowest common denominator.
 *
 *		  In order to grant your request either you or the target
 *		  (or both) may have to obtain a quorum agreement.  Deadlock
 *		  resolution may be required.  When doing it yourself you
 *		  will typically maintain an active message to each master
 *		  node in the system.
You can only grant the cache state 651 * when a quorum of nodes agree. 652 * 653 * The cache state includes transaction id information which 654 * can be used to resolve data requests. 655 */ 656 #define DMSG_CAC_LOCK DMSG_CAC(0x001, dmsg_cac_lock) 657 658 /* 659 * Quorum layer ops operate on any link, link-0 may be used when the 660 * directly connected target is the desired registration. 661 * 662 * COMMIT - Persist state, blockable, abortable 663 * 664 * Issue a COMMIT in two phases. A quorum must acknowledge 665 * the operation to proceed to phase-2. Message-update to 666 * proceed to phase-2. 667 */ 668 #define DMSG_QRM_COMMIT DMSG_QRM(0x001, dmsg_qrm_commit) 669 670 /* 671 * DMSG_PROTO_BLK Protocol 672 * 673 * BLK_OPEN - Open device. This transaction must be left open for the 674 * duration and the returned keyid passed in all associated 675 * BLK commands. Multiple OPENs can be issued within the 676 * transaction. 677 * 678 * BLK_CLOSE - Close device. This can be used to close one of the opens 679 * within a BLK_OPEN transaction. It may NOT initiate a 680 * transaction. Note that a termination of the transaction 681 * (e.g. with LNK_ERROR or BLK_ERROR) closes all active OPENs 682 * for that transaction. 683 * 684 * BLK_READ - Strategy read. Not typically streaming. 685 * 686 * BLK_WRITE - Strategy write. Not typically streaming. 687 * 688 * BLK_FLUSH - Strategy flush. Not typically streaming. 689 * 690 * BLK_FREEBLKS - Strategy freeblks. Not typically streaming. 
691 */ 692 #define DMSG_BLK_OPEN DMSG_BLK(0x001, dmsg_blk_open) 693 #define DMSG_BLK_CLOSE DMSG_BLK(0x002, dmsg_blk_open) 694 #define DMSG_BLK_READ DMSG_BLK(0x003, dmsg_blk_read) 695 #define DMSG_BLK_WRITE DMSG_BLK(0x004, dmsg_blk_write) 696 #define DMSG_BLK_FLUSH DMSG_BLK(0x005, dmsg_blk_flush) 697 #define DMSG_BLK_FREEBLKS DMSG_BLK(0x006, dmsg_blk_freeblks) 698 #define DMSG_BLK_ERROR DMSG_BLK(0xFFF, dmsg_blk_error) 699 700 struct dmsg_blk_open { 701 dmsg_hdr_t head; 702 uint32_t modes; 703 uint32_t reserved01; 704 }; 705 706 #define DMSG_BLKOPEN_RD 0x0001 707 #define DMSG_BLKOPEN_WR 0x0002 708 709 /* 710 * DMSG_LNK_ERROR is returned for simple results, 711 * DMSG_BLK_ERROR is returned for extended results. 712 */ 713 struct dmsg_blk_error { 714 dmsg_hdr_t head; 715 uint64_t keyid; 716 uint32_t resid; 717 uint32_t reserved02; 718 char buf[64]; 719 }; 720 721 struct dmsg_blk_read { 722 dmsg_hdr_t head; 723 uint64_t keyid; 724 uint64_t offset; 725 uint32_t bytes; 726 uint32_t flags; 727 uint32_t reserved01; 728 uint32_t reserved02; 729 }; 730 731 struct dmsg_blk_write { 732 dmsg_hdr_t head; 733 uint64_t keyid; 734 uint64_t offset; 735 uint32_t bytes; 736 uint32_t flags; 737 uint32_t reserved01; 738 uint32_t reserved02; 739 }; 740 741 struct dmsg_blk_flush { 742 dmsg_hdr_t head; 743 uint64_t keyid; 744 uint64_t offset; 745 uint32_t bytes; 746 uint32_t flags; 747 uint32_t reserved01; 748 uint32_t reserved02; 749 }; 750 751 struct dmsg_blk_freeblks { 752 dmsg_hdr_t head; 753 uint64_t keyid; 754 uint64_t offset; 755 uint32_t bytes; 756 uint32_t flags; 757 uint32_t reserved01; 758 uint32_t reserved02; 759 }; 760 761 typedef struct dmsg_blk_open dmsg_blk_open_t; 762 typedef struct dmsg_blk_read dmsg_blk_read_t; 763 typedef struct dmsg_blk_write dmsg_blk_write_t; 764 typedef struct dmsg_blk_flush dmsg_blk_flush_t; 765 typedef struct dmsg_blk_freeblks dmsg_blk_freeblks_t; 766 typedef struct dmsg_blk_error dmsg_blk_error_t; 767 768 /* 769 * NOTE!!!! 
ALL EXTENDED HEADER STRUCTURES MUST BE 64-BYTE ALIGNED!!! 770 * 771 * General message errors 772 * 773 * 0x00 - 0x1F Local iocomm errors 774 * 0x20 - 0x2F Global errors 775 */ 776 #define DMSG_ERR_NOSUPP 0x20 777 #define DMSG_ERR_LOSTLINK 0x21 778 #define DMSG_ERR_IO 0x22 /* generic */ 779 #define DMSG_ERR_PARAM 0x23 /* generic */ 780 #define DMSG_ERR_CANTCIRC 0x24 /* (typically means lost span) */ 781 782 union dmsg_any { 783 char buf[DMSG_HDR_MAX]; 784 dmsg_hdr_t head; 785 786 dmsg_lnk_conn_t lnk_conn; 787 dmsg_lnk_span_t lnk_span; 788 dmsg_lnk_circ_t lnk_circ; 789 dmsg_lnk_volconf_t lnk_volconf; 790 791 dmsg_blk_open_t blk_open; 792 dmsg_blk_error_t blk_error; 793 dmsg_blk_read_t blk_read; 794 dmsg_blk_write_t blk_write; 795 dmsg_blk_flush_t blk_flush; 796 dmsg_blk_freeblks_t blk_freeblks; 797 }; 798 799 typedef union dmsg_any dmsg_any_t; 800 801 /* 802 * Kernel iocom structures and prototypes for kern/kern_dmsg.c 803 */ 804 #if defined(_KERNEL) || defined(_KERNEL_STRUCTURES) 805 806 struct hammer2_pfsmount; 807 struct kdmsg_iocom; 808 struct kdmsg_state; 809 struct kdmsg_msg; 810 811 /* 812 * msg_ctl flags (atomic) 813 */ 814 #define KDMSG_CLUSTERCTL_KILL 0x00000001 815 #define KDMSG_CLUSTERCTL_KILLRX 0x00000002 /* staged helper exit */ 816 #define KDMSG_CLUSTERCTL_KILLTX 0x00000004 /* staged helper exit */ 817 #define KDMSG_CLUSTERCTL_SLEEPING 0x00000008 /* interlocked w/msglk */ 818 819 /* 820 * When the KDMSG_IOCOMF_AUTOCIRC flag is set the kdmsg code in 821 * the kernel automatically tries to forge a virtual circuit for 822 * any active SPAN state received. 823 * 824 * This is only done when the received SPANs are significantly filtered 825 * by the transmitted LNK_CONN. That is, it is done only by clients who 826 * connect to specific services over the cluster. 
827 */ 828 struct kdmsg_circuit { 829 RB_ENTRY(kdmsg_circuit) rbnode; /* indexed by msgid */ 830 TAILQ_ENTRY(kdmsg_circuit) entry; /* written by shim */ 831 struct kdmsg_iocom *iocom; /* written by shim */ 832 struct kdmsg_state *span_state; 833 struct kdmsg_state *circ_state; /* master circuit */ 834 struct kdmsg_state *rcirc_state; /* slave circuit */ 835 uint64_t msgid; 836 int weight; 837 int recorded; /* written by shim */ 838 int lost; /* written by shim */ 839 int refs; /* written by shim */ 840 }; 841 842 typedef struct kdmsg_circuit kdmsg_circuit_t; 843 844 /* 845 * Transactional state structure, representing an open transaction. The 846 * transaction might represent a cache state (and thus have a chain 847 * association), or a VOP op, LNK_SPAN, or other things. 848 */ 849 struct kdmsg_state { 850 RB_ENTRY(kdmsg_state) rbnode; /* indexed by msgid */ 851 struct kdmsg_iocom *iocom; 852 struct kdmsg_circuit *circ; 853 uint32_t icmd; /* record cmd creating state */ 854 uint32_t txcmd; /* mostly for CMDF flags */ 855 uint32_t rxcmd; /* mostly for CMDF flags */ 856 uint64_t msgid; /* {circuit,msgid} uniq */ 857 int flags; 858 int error; 859 void *chain; /* (caller's state) */ 860 struct kdmsg_msg *msg; 861 int (*func)(struct kdmsg_state *, struct kdmsg_msg *); 862 union { 863 void *any; 864 struct hammer2_pfsmount *pmp; 865 struct kdmsg_circuit *circ; 866 } any; 867 }; 868 869 #define KDMSG_STATE_INSERTED 0x0001 870 #define KDMSG_STATE_DYNAMIC 0x0002 871 #define KDMSG_STATE_DELPEND 0x0004 /* transmit delete pending */ 872 #define KDMSG_STATE_ABORTING 0x0008 /* avoids recursive abort */ 873 874 struct kdmsg_msg { 875 TAILQ_ENTRY(kdmsg_msg) qentry; /* serialized queue */ 876 struct kdmsg_iocom *iocom; 877 struct kdmsg_state *state; 878 struct kdmsg_circuit *circ; 879 size_t hdr_size; 880 size_t aux_size; 881 char *aux_data; 882 int flags; 883 dmsg_any_t any; 884 }; 885 886 #define KDMSG_FLAG_AUXALLOC 0x0001 887 888 typedef struct kdmsg_link kdmsg_link_t; 889 
typedef struct kdmsg_state kdmsg_state_t; 890 typedef struct kdmsg_msg kdmsg_msg_t; 891 892 struct kdmsg_state_tree; 893 int kdmsg_state_cmp(kdmsg_state_t *state1, kdmsg_state_t *state2); 894 RB_HEAD(kdmsg_state_tree, kdmsg_state); 895 RB_PROTOTYPE(kdmsg_state_tree, kdmsg_state, rbnode, kdmsg_state_cmp); 896 897 struct kdmsg_circuit_tree; 898 int kdmsg_circuit_cmp(kdmsg_circuit_t *circ1, kdmsg_circuit_t *circ2); 899 RB_HEAD(kdmsg_circuit_tree, kdmsg_circuit); 900 RB_PROTOTYPE(kdmsg_circuit_tree, kdmsg_circuit, rbnode, kdmsg_circuit_cmp); 901 902 /* 903 * Structure embedded in e.g. mount, master control structure for 904 * DMSG stream handling. 905 */ 906 struct kdmsg_iocom { 907 struct malloc_type *mmsg; 908 struct file *msg_fp; /* cluster pipe->userland */ 909 thread_t msgrd_td; /* cluster thread */ 910 thread_t msgwr_td; /* cluster thread */ 911 int msg_ctl; /* wakeup flags */ 912 int msg_seq; /* cluster msg sequence id */ 913 uint32_t flags; 914 struct lock msglk; /* lockmgr lock */ 915 TAILQ_HEAD(, kdmsg_msg) msgq; /* transmit queue */ 916 void *handle; 917 void (*auto_callback)(kdmsg_msg_t *); 918 int (*rcvmsg)(kdmsg_msg_t *); 919 void (*exit_func)(struct kdmsg_iocom *); 920 struct kdmsg_state *conn_state; /* active LNK_CONN state */ 921 struct kdmsg_state *freerd_state; /* allocation cache */ 922 struct kdmsg_state *freewr_state; /* allocation cache */ 923 struct kdmsg_state_tree staterd_tree; /* active messages */ 924 struct kdmsg_state_tree statewr_tree; /* active messages */ 925 struct kdmsg_circuit_tree circ_tree; /* active circuits */ 926 dmsg_lnk_conn_t auto_lnk_conn; 927 dmsg_lnk_span_t auto_lnk_span; 928 }; 929 930 typedef struct kdmsg_iocom kdmsg_iocom_t; 931 932 #define KDMSG_IOCOMF_AUTOCONN 0x0001 /* handle received LNK_CONN */ 933 #define KDMSG_IOCOMF_AUTOSPAN 0x0002 /* handle received LNK_SPAN */ 934 #define KDMSG_IOCOMF_AUTOCIRC 0x0004 /* handle received LNK_CIRC */ 935 #define KDMSG_IOCOMF_AUTOFORGE 0x0008 /* auto initiate LNK_CIRC */ 936 
#define KDMSG_IOCOMF_EXITNOACC 0x0010 /* cannot accept writes */ 937 938 #define KDMSG_IOCOMF_AUTOANY (KDMSG_IOCOMF_AUTOCONN | \ 939 KDMSG_IOCOMF_AUTOSPAN | \ 940 KDMSG_IOCOMF_AUTOCIRC | \ 941 KDMSG_IOCOMF_AUTOFORGE) 942 943 uint32_t kdmsg_icrc32(const void *buf, size_t size); 944 uint32_t kdmsg_icrc32c(const void *buf, size_t size, uint32_t crc); 945 946 /* 947 * kern_dmsg.c 948 */ 949 void kdmsg_iocom_init(kdmsg_iocom_t *iocom, void *handle, u_int32_t flags, 950 struct malloc_type *mmsg, 951 int (*rcvmsg)(kdmsg_msg_t *msg)); 952 void kdmsg_iocom_reconnect(kdmsg_iocom_t *iocom, struct file *fp, 953 const char *subsysname); 954 void kdmsg_iocom_autoinitiate(kdmsg_iocom_t *iocom, 955 void (*conn_callback)(kdmsg_msg_t *msg)); 956 void kdmsg_iocom_uninit(kdmsg_iocom_t *iocom); 957 void kdmsg_drain_msgq(kdmsg_iocom_t *iocom); 958 959 void kdmsg_msg_free(kdmsg_msg_t *msg); 960 kdmsg_msg_t *kdmsg_msg_alloc(kdmsg_iocom_t *iocom, kdmsg_circuit_t *circ, 961 uint32_t cmd, 962 int (*func)(kdmsg_state_t *, kdmsg_msg_t *), 963 void *data); 964 kdmsg_msg_t *kdmsg_msg_alloc_state(kdmsg_state_t *state, uint32_t cmd, 965 int (*func)(kdmsg_state_t *, kdmsg_msg_t *), 966 void *data); 967 void kdmsg_msg_write(kdmsg_msg_t *msg); 968 void kdmsg_msg_reply(kdmsg_msg_t *msg, uint32_t error); 969 void kdmsg_msg_result(kdmsg_msg_t *msg, uint32_t error); 970 void kdmsg_state_reply(kdmsg_state_t *state, uint32_t error); 971 void kdmsg_state_result(kdmsg_state_t *state, uint32_t error); 972 973 void kdmsg_circ_hold(kdmsg_circuit_t *circ); 974 void kdmsg_circ_drop(kdmsg_circuit_t *circ); 975 976 977 #endif 978 979 #endif 980