1 /* 2 * Copyright (c) 2011-2014 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@dragonflybsd.org> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 
33 */ 34 35 #ifndef _SYS_DMSG_H_ 36 #define _SYS_DMSG_H_ 37 38 #ifndef _SYS_TYPES_H_ 39 #include <sys/types.h> 40 #endif 41 #if defined(_KERNEL) || defined(_KERNEL_STRUCTURES) 42 #ifndef _SYS_MALLOC_H_ 43 #include <sys/malloc.h> 44 #endif 45 #ifndef _SYS_TREE_H_ 46 #include <sys/tree.h> 47 #endif 48 #ifndef _SYS_THREAD_H_ 49 #include <sys/thread.h> 50 #endif 51 #endif 52 #ifndef _SYS_UUID_H_ 53 #include <sys/uuid.h> 54 #endif 55 56 /* 57 * Mesh network protocol structures. 58 * 59 * CONN PROTOCOL 60 * 61 * The mesh is constructed via point-to-point streaming links with varying 62 * levels of interconnectedness, forming a graph. Leafs of the graph are 63 * typically kernel devices (xdisk) or VFSs (HAMMER2). Internal nodes are 64 * usually (user level) hammer2 service demons. 65 * 66 * Upon connecting and after authentication, a LNK_CONN transaction is opened 67 * to configure the link. The SPAN protocol is then typically run over the 68 * open LNK_CONN transaction. 69 * 70 * Terminating the LNK_CONN transaction terminates everything running over it 71 * (typically open LNK_SPAN transactions), which in turn terminates everything 72 * running over the LNK_SPANs. 73 * 74 * SPAN PROTOCOL 75 * 76 * The SPAN protocol runs over an open LNK_CONN transaction and is used to 77 * advertise any number of services. For example, each PFS under a HAMMER2 78 * mount will be advertised as an open LNK_SPAN transaction. 79 * 80 * Any network node on the graph running multiple connections is capable 81 * of relaying LNK_SPANs from any connection to any other connection. This 82 * is typically done by the user-level hammer2 service demon, and typically 83 * not done by kernel devices or VFSs (though these entities must be able 84 * to manage multiple LNK_SPANs since they might advertise or need to talk 85 * to multiple services). 
 *
 * Relaying is not necessarily trivial as it requires internal nodes to
 * track two open transactions (on the two iocom interfaces) and translate
 * the msgid and circuit. In addition, the relay may have to track multiple
 * SPANs from the same iocom or from multiple iocoms which represent the same
 * end-point and must select the best end-point, must send notifications when
 * a better path is available, and must allow (when connectivity is still
 * present) any existing, open, stacked sub-transactions to complete before
 * terminating the less efficient SPAN.
 *
 * Relaying is optional. It is perfectly acceptable for the hammer2 service
 * to plug a received socket descriptor directly into the appropriate kernel
 * device driver.
 *
 * STACKED TRANSACTIONS
 *
 * Message transactions can be stacked. That is, you can initiate a DMSG
 * transaction relative to another open transaction. Sub-transactions can
 * be initiated without waiting for the parent transaction to complete its
 * handshake.
 *
 * This is done by entering the open transaction's msgid as the circuit field
 * in the new transaction (typically by populating msg->parent). The
 * transaction tracking structure will be referenced and will track the
 * sub-transaction. Note that msgids must still be unique on an
 * iocom-by-iocom basis.
 *
 * Messages can race closing circuits. When a circuit is lost,
 * messages are simulated to delete any sub-transactions.
 *
 * MESSAGE TRANSACTIONAL STATES
 *
 * Message transactions are handled by the CREATE, DELETE, REPLY, ABORT, and
 * CREPLY flags. Message state is typically recorded at the end points and
 * will be maintained (preventing reuse of the transaction id) until a DELETE
 * is both sent and received.
 *
 * One-way messages such as those used for debug commands are not recorded
 * and do not require any transactional state.
 * These are sent without
 * the CREATE, DELETE, or ABORT flags set. ABORT is not supported for
 * one-off messages. The REPLY bit can be used to distinguish between
 * command and status if desired.
 *
 * Transactional messages are messages which require a reply to be
 * returned. These messages can also consist of multiple message elements
 * for the command or reply or both (or neither). The command message
 * sequence sets CREATE on the first message and DELETE on the last message.
 * A single message command sets both (CREATE|DELETE). The reply message
 * sequence works the same way but of course also sets the REPLY bit.
 *
 * Transactional messages can be aborted by sending a message element
 * with the ABORT flag set. This flag can be combined with either or both
 * the CREATE and DELETE flags. When combined with the CREATE flag the
 * command is treated as non-blocking but still executes. When combined
 * with the DELETE flag no additional message elements are required.
 *
 * Transactions are terminated by sending a message with DELETE set.
 * Transactions must be CREATEd and DELETEd in both directions. If a
 * transaction is governing stacked sub-transactions the sub-transactions
 * are automatically terminated before the governing transaction is terminated.
 * Terminates are handled by simulating a received DELETE and expecting the
 * normal function callback and state machine to (ultimately) issue a
 * terminating (DELETE) response.
 *
 * Transactions can operate in full-duplex as both sides are fully open
 * (i.e. CREATE sent, CREATE|REPLY returned, DELETE not sent by anyone).
 * Additional commands can be initiated from either side of the transaction.
 *
 * ABORT SPECIAL CASE - Mid-stream aborts. A mid-stream abort can be sent
 * when supported by the sender by sending an ABORT message with neither
 * CREATE nor DELETE set.
 * This effectively turns the message into a
 * non-blocking message (but depending on what is being represented can also
 * cut short prior data elements in the stream).
 *
 * ABORT SPECIAL CASE - Abort-after-DELETE. Transactional messages have to be
 * abortable if the stream/pipe/whatever is lost. In this situation any
 * forwarding relay needs to unconditionally abort commands and replies that
 * are still active. This is done by sending an ABORT|DELETE even in
 * situations where a DELETE has already been sent in that direction. This
 * is done, for example, when links are in a half-closed state. In this
 * situation it is possible for the abort request to race a transition to the
 * fully closed state. ABORT|DELETE messages which race the fully closed
 * state are expected to be discarded by the other end.
 *
 * --
 *
 * All base and extended message headers are 64-byte aligned, and all
 * transports must support extended message headers up to DMSG_HDR_MAX.
 * Currently we allow extended message headers up to 2048 bytes. Note
 * that the extended header size is encoded in the 'cmd' field of the header.
 *
 * Any in-band data is padded to a 64-byte alignment and placed directly
 * after the extended header (after the higher-level cmd/rep structure).
 * The actual unaligned size of the in-band data is encoded in the aux_bytes
 * field in this case. Maximum data sizes are negotiated during registration.
 *
 * Auxiliary data can be in-band or out-of-band. In-band data sets aux_descr
 * equal to 0. Any out-of-band data must be negotiated by the SPAN protocol.
 *
 * Auxiliary data, whether in-band or out-of-band, must be at least 64-byte
 * aligned. The aux_bytes field contains the actual byte-granular length
 * and not the aligned length. The crc is against the aligned length (so
 * a faster crc algorithm can be used, theoretically).
 *
 * hdr_crc is calculated over the entire, ALIGNED extended header. For
 * the purposes of calculating the crc, the hdr_crc field is 0. That is,
 * if calculating the crc in HW a 32-bit '0' must be inserted in place of
 * the hdr_crc field when reading the entire header and compared at the
 * end (but the actual hdr_crc must be left intact in memory). A simple
 * counter to replace the field going into the CRC generator does the job
 * in HW. The CRC endian is based on the magic number field and may have
 * to be byte-swapped, too (which is also easy to do in HW).
 *
 * aux_crc is calculated over the entire, ALIGNED auxiliary data.
 *
 * SHARED MEMORY IMPLEMENTATIONS
 *
 * Shared-memory implementations typically use a pipe to transmit the extended
 * message header and shared memory to store any auxiliary data. Auxiliary
 * data in one-way (non-transactional) messages is typically required to be
 * inline. CRCs are still recommended and required at the beginning, but
 * may be negotiated away later.
 */

/*
 * Force NUL termination of a fixed-size character array by zeroing its
 * last element.  (ary) must be a true array (not a pointer) because the
 * bound is taken from sizeof(ary).
 */
#define DMSG_TERMINATE_STRING(ary)	\
	do { (ary)[sizeof(ary) - 1] = 0; } while (0)

/*
 * dmsg_hdr must be 64 bytes
 *
 * The two-digit value leading each field comment is the field's byte
 * offset within the header, in hex.
 */
struct dmsg_hdr {
	uint16_t	magic;		/* 00 sanity, synchro, endian */
	uint16_t	reserved02;	/* 02 */
	uint32_t	salt;		/* 04 random salt helps w/crypto */

	uint64_t	msgid;		/* 08 message transaction id */
	uint64_t	circuit;	/* 10 circuit id or 0 */
	uint64_t	reserved18;	/* 18 */

	uint32_t	cmd;		/* 20 flags | cmd | hdr_size / ALIGN */
	uint32_t	aux_crc;	/* 24 auxiliary data crc */
	uint32_t	aux_bytes;	/* 28 auxiliary data length (bytes) */
	uint32_t	error;		/* 2C error code or 0 */
	uint64_t	aux_descr;	/* 30 negotiated OOB data descr */
	uint32_t	reserved38;	/* 38 */
	uint32_t	hdr_crc;	/* 3C (aligned) extended header crc */
};

typedef struct dmsg_hdr dmsg_hdr_t;

/*
 * DMSG_HDR_MAGIC_REV is DMSG_HDR_MAGIC with its two bytes swapped.
 *
 * DMSG_HDR_CRCOFF / DMSG_HDR_CRCBYTES describe the span running from the
 * salt field through the end of the base header.
 *
 * NOTE(review): offsetof() is not supplied by any #include visible in this
 *		 header; the includer must have it in scope before expanding
 *		 DMSG_HDR_CRCOFF.
 */
#define DMSG_HDR_MAGIC		0x4832
#define DMSG_HDR_MAGIC_REV	0x3248
#define DMSG_HDR_CRCOFF		offsetof(dmsg_hdr_t, salt)
#define DMSG_HDR_CRCBYTES	(sizeof(dmsg_hdr_t) - DMSG_HDR_CRCOFF)

/*
 * Administrative protocol limits.
 *
 * NOTE: A dmsg header must completely fit in the (fifo) buffer, but
 *	 dmsg aux data does not have to completely fit. The dmsg
 *	 structure allows headers up to 255*64 = 16320 bytes. There
 *	 is no real limit on the aux_data other than what we deem
 *	 reasonable and defensible (i.e. not run processes or the
 *	 kernel out of memory). But it should be able to handle at
 *	 least MAXPHYS bytes which is typically 128KB or 256KB.
 */
#define DMSG_HDR_MAX		2048		/* <= 8192 */
#define DMSG_AUX_MAX		(1024*1024)	/* <= 1MB */
#define DMSG_BUF_SIZE		(DMSG_HDR_MAX * 4)
#define DMSG_BUF_MASK		(DMSG_BUF_SIZE - 1)

/*
 * The message (cmd) field also encodes various flags and the total size
 * of the message header.
 * This allows the protocol processors to validate
 * persistency and structural settings for every command simply by
 * switch()ing on the (cmd) field.
 *
 * Layout of the 32-bit cmd word, as given by the masks below:
 *
 *	0xFF000000	flags			(DMSGF_FLAGS)
 *	0x00F00000	protocol		(DMSGF_PROTOS)
 *	0x000FFF00	command			(DMSGF_CMDS)
 *	0x000000FF	header size / DMSG_ALIGN (DMSGF_SIZE)
 */
#define DMSGF_CREATE		0x80000000U	/* msg start */
#define DMSGF_DELETE		0x40000000U	/* msg end */
#define DMSGF_REPLY		0x20000000U	/* reply path */
#define DMSGF_ABORT		0x10000000U	/* abort req */
#define DMSGF_REVTRANS		0x08000000U	/* opposite direction msgid */
#define DMSGF_REVCIRC		0x04000000U	/* opposite direction circuit */
#define DMSGF_FLAG1		0x02000000U
#define DMSGF_FLAG0		0x01000000U

#define DMSGF_FLAGS		0xFF000000U	/* all flags */
#define DMSGF_PROTOS		0x00F00000U	/* all protos */
#define DMSGF_CMDS		0x000FFF00U	/* all cmds */
#define DMSGF_SIZE		0x000000FFU	/* N*64 (DMSG_ALIGN units) */

/*
 * XXX Future, flag that an in-line (not part of a CREATE/DELETE) command
 *     expects some sort of acknowledgement. Allows protocol mismatches to
 *     be detected.
 *
 * NOTE: This bit lies inside DMSGF_CMDS (it is the top bit of the 12-bit
 *	 command field).
 */
#define DMSGF_CMDF_EXPECT_ACK	0x00080000U	/* in-line command no-ack */

/*
 * Composite masks.
 *
 * CMDSWMASK   - proto | cmd | size | REPLY
 * BASECMDMASK - proto | cmd | size
 * TRANSMASK   - proto | cmd | size | REPLY | CREATE | DELETE
 * BASEFLAGS   - CREATE | DELETE | REPLY
 */
#define DMSGF_CMDSWMASK		(DMSGF_CMDS |	\
				 DMSGF_SIZE |	\
				 DMSGF_PROTOS |	\
				 DMSGF_REPLY)

#define DMSGF_BASECMDMASK	(DMSGF_CMDS |	\
				 DMSGF_SIZE |	\
				 DMSGF_PROTOS)

#define DMSGF_TRANSMASK		(DMSGF_CMDS |	\
				 DMSGF_SIZE |	\
				 DMSGF_PROTOS |	\
				 DMSGF_REPLY |	\
				 DMSGF_CREATE |	\
				 DMSGF_DELETE)

#define DMSGF_BASEFLAGS		(DMSGF_CREATE | DMSGF_DELETE | DMSGF_REPLY)

/*
 * Protocol identifiers, occupying the DMSGF_PROTOS field.
 */
#define DMSG_PROTO_LNK		0x00000000U
#define DMSG_PROTO_DBG		0x00100000U
#define DMSG_PROTO_HM2		0x00200000U
#define DMSG_PROTO_XX3		0x00300000U
#define DMSG_PROTO_XX4		0x00400000U
#define DMSG_PROTO_BLK		0x00500000U
#define DMSG_PROTO_VOP		0x00600000U

/*
 * Message command constructors, sans flags
 *
 * DMSG_DOALIGN(bytes)	rounds bytes up to the next multiple of DMSG_ALIGN.
 * DMSG_HDR_ENCODE(elm)	yields sizeof(struct elm), rounded up, expressed in
 *			DMSG_ALIGN units.  elm is a struct tag, not a typedef.
 * DMSG_xxx(cmd, elm)	combines the protocol id, the command number shifted
 *			left by 8 bits, and the encoded size of struct elm.
 */
#define DMSG_ALIGN		64
#define DMSG_ALIGNMASK		(DMSG_ALIGN - 1)
#define DMSG_DOALIGN(bytes)	(((bytes) + DMSG_ALIGNMASK) &		\
				 ~DMSG_ALIGNMASK)

#define DMSG_HDR_ENCODE(elm)	(((uint32_t)sizeof(struct elm) +	\
				  DMSG_ALIGNMASK) /			\
				 DMSG_ALIGN)

#define DMSG_LNK(cmd, elm)	(DMSG_PROTO_LNK |			\
				 ((cmd) << 8) |				\
				 DMSG_HDR_ENCODE(elm))

#define DMSG_DBG(cmd, elm)	(DMSG_PROTO_DBG |			\
				 ((cmd) << 8) |				\
				 DMSG_HDR_ENCODE(elm))

#define DMSG_HM2(cmd, elm)	(DMSG_PROTO_HM2 |			\
				 ((cmd) << 8) |				\
				 DMSG_HDR_ENCODE(elm))

#define DMSG_BLK(cmd, elm)	(DMSG_PROTO_BLK |			\
				 ((cmd) << 8) |				\
				 DMSG_HDR_ENCODE(elm))

#define DMSG_VOP(cmd, elm)	(DMSG_PROTO_VOP |			\
				 ((cmd) << 8) |				\
				 DMSG_HDR_ENCODE(elm))

/*
 * Link layer ops basically talk to just the other side of a direct
 * connection.
 *
 * LNK_PAD	- One-way message on circuit 0, ignored by target. Used to
 *		  pad message buffers on shared-memory transports. Not
 *		  typically used with TCP.
 *
 * LNK_PING	- One-way message on circuit-0, keep-alive, run by both sides
 *		  typically 1/sec on idle link, link is lost after 10 seconds
 *		  of inactivity.
 *
 * LNK_AUTH	- Authenticate the connection, negotiate administrative
 *		  rights & encryption, protocol class, etc. Only PAD and
 *		  AUTH messages (not even PING) are accepted until
 *		  authentication is complete. This message also identifies
 *		  the host.
 *
 * LNK_CONN	- Enable the SPAN protocol on circuit-0, possibly also
 *		  installing a PFS filter (by cluster id, unique id, and/or
 *		  wildcarded name).
 *
 * LNK_SPAN	- A SPAN transaction typically on iocom->state0 enables
 *		  messages to be relayed to/from a particular cluster node.
 *		  SPANs are received, sorted, aggregated, filtered, and
 *		  retransmitted back out across all applicable connections.
 *
 *		  The leaf protocol also uses this to make a PFS available
 *		  to the cluster (e.g. on-mount).
 */
#define DMSG_LNK_PAD		DMSG_LNK(0x000, dmsg_hdr)
#define DMSG_LNK_PING		DMSG_LNK(0x001, dmsg_hdr)
#define DMSG_LNK_AUTH		DMSG_LNK(0x010, dmsg_lnk_auth)
#define DMSG_LNK_CONN		DMSG_LNK(0x011, dmsg_lnk_conn)
#define DMSG_LNK_SPAN		DMSG_LNK(0x012, dmsg_lnk_span)
#define DMSG_LNK_ERROR		DMSG_LNK(0xFFF, dmsg_hdr)

/*
 * Reserved command codes for third party subsystems. Structure size is
 * not known here so do not try to construct the full DMSG_LNK_ define.
 */
#define DMSG_LNK_CMD_HAMMER2_VOLCONF	0x20

#define DMSG_LABEL_SIZE		128	/* fixed at 128, do not change */

/*
 * LNK_AUTH - Authentication (often omitted)
 *
 * The payload is a 64-byte placeholder following the base header.
 */
struct dmsg_lnk_auth {
	dmsg_hdr_t	head;
	char		dummy[64];
};

/*
 * LNK_CONN - Register connection info for SPAN protocol
 *	      (transaction, left open, iocom->state0 only).
 *
 * LNK_CONN identifies a streaming connection into the cluster.
 *
 * peer_mask serves to filter the SPANs we receive by peer_type. A cluster
 * controller typically sets this to (uint64_t)-1, indicating that it wants
 * everything. A block devfs interface might set it to 1 << DMSG_PEER_BLOCK,
 * and a hammer2 mount might set it to 1 << DMSG_PEER_HAMMER2.
 *
 * media_id allows multiple (e.g. HAMMER2) connections belonging to the same
 * media to transmit duplicative LNK_VOLCONF updates without causing confusion
 * in the cluster controller.
 *
 * pfs_clid, pfs_fsid, pfs_type, and label are peer-specific and must be
 * left empty (zero-fill) if not supported by a particular peer.
 */
struct dmsg_lnk_conn {
	dmsg_hdr_t	head;
	uuid_t		media_id;	/* media configuration id */
	uuid_t		peer_id;	/* unique peer uuid */
	uuid_t		reserved01;
	uint64_t	peer_mask;	/* PEER mask for SPAN filtering */
	uint8_t		peer_type;	/* see DMSG_PEER_xxx */
	uint8_t		reserved02;
	uint16_t	proto_version;	/* high level protocol support */
	uint32_t	status;		/* status flags */
	uint32_t	rnss;		/* node's generated rnss */
	uint8_t		reserved03[8];
	uint32_t	reserved04[14];
	char		peer_label[DMSG_LABEL_SIZE]; /* peer identity string */
};

typedef struct dmsg_lnk_conn dmsg_lnk_conn_t;

/*
 * PEER types 0-63 are defined here. There is a limit of 64 types due to
 * the width of peer_mask.
 *
 * PFS types depend on the peer type. sys/dmsg.h only defines the default.
 * peer-specific headers define PFS types for any given peer.
 */
#define DMSG_PEER_NONE		0
#define DMSG_PEER_ROUTER	1	/* server: cluster controller */
#define DMSG_PEER_BLOCK		2	/* server: block devices */
#define DMSG_PEER_HAMMER2	3	/* server: h2 mounted volume */
#define DMSG_PEER_CLIENT	63	/* a client connection */
#define DMSG_PEER_MAX		64

#define DMSG_PFSTYPE_DEFAULT	0
#define DMSG_PFSTYPE_MASK	0x0F

/*
 * Structures embedded in LNK_SPAN
 *
 * dmsg_media_block describes block media: total size and block size.
 */
struct dmsg_media_block {
	uint64_t	bytes;		/* media size in bytes */
	uint32_t	blksize;	/* media block size */
	uint32_t	reserved01;
};

typedef struct dmsg_media_block dmsg_media_block_t;

/*
 * LNK_SPAN - Initiate or relay a SPAN
 *	      (transaction, left open, typically only on iocom->state0)
 *
 * This message registers an end-point with the other end of the connection,
 * telling the other end who we are and what we can provide or intend to
 * consume. Multiple registrations can be maintained as open transactions
 * with each one specifying a unique end-point.
 *
 * Registrations are sent from {source}=S {1...n} to {target}=0 and maintained
 * as open transactions. Registrations are also received and maintained as
 * open transactions, creating a matrix of linkid's.
 *
 * While these transactions are open additional transactions can be executed
 * between any two linkid's {source}=S (registrations we sent) to {target}=T
 * (registrations we received).
 *
 * Closure of any registration transaction will automatically abort any open
 * transactions using the related linkids. Closure can be initiated
 * voluntarily from either side with either end issuing a DELETE, or they
 * can be ABORTed.
 *
 * Status updates are performed via the open transaction.
 *
 * --
 *
 * A registration identifies a node and its various PFS parameters including
 * the PFS_TYPE. For example, a diskless HAMMER2 client typically identifies
 * itself as PFSTYPE_CLIENT.
 *
 * Any node may serve as a cluster controller, aggregating and passing
 * on received registrations, but end-points do not have to implement this
 * ability. Most end-points typically implement a single client-style or
 * server-style PFS_TYPE and rendezvous at a cluster controller.
 *
 * The cluster controller does not aggregate/pass-on all received
 * registrations. It typically filters what gets passed on based on what it
 * receives, passing on only the best candidates.
 *
 * If a symmetric spanning tree is desired additional candidates whose
 * {dist, rnss} fields match the last best candidate must also be propagated.
 * This feature is not currently enabled.
 *
 * STATUS UPDATES: Status updates use the same structure but typically
 *		   only contain incremental changes to e.g. pfs_type, with
 *		   a text description sent as out-of-band data.
 */
struct dmsg_lnk_span {
	dmsg_hdr_t	head;
	uuid_t		peer_id;
	uuid_t		pfs_id;		/* unique pfs id */
	uint8_t		pfs_type;	/* PFS type */
	uint8_t		peer_type;	/* PEER type */
	uint16_t	proto_version;	/* high level protocol support */
	uint32_t	status;		/* status flags */
	uint8_t		reserved02[8];
	uint32_t	dist;		/* span distance */
	uint32_t	rnss;		/* random number sub-sort */
	/*
	 * Media description overlay; reserved03[] fixes the union at
	 * 14 32-bit words.
	 */
	union {
		uint32_t	reserved03[14];
		dmsg_media_block_t block;
	} media;

	/*
	 * NOTE: for PEER_HAMMER2 cl_label is typically empty and fs_label
	 *	 is the superroot directory name.
	 *
	 *	 for PEER_BLOCK cl_label is typically host/device and
	 *	 fs_label is typically the serial number string.
	 *
	 * NOTE(review): no fields named cl_label/fs_label exist in this
	 *		 structure; presumably they refer to peer_label and
	 *		 pfs_label respectively - TODO confirm.
	 */
	char		peer_label[DMSG_LABEL_SIZE];	/* peer label */
	char		pfs_label[DMSG_LABEL_SIZE];	/* PFS label */
};

typedef struct dmsg_lnk_span dmsg_lnk_span_t;

#define DMSG_SPAN_PROTO_1	1

/*
 * Debug layer ops operate on any link
 *
 * SHELL	- Persist stream, access the debug shell on the target
 *		  registration. Multiple shells can be operational.
 */
#define DMSG_DBG_SHELL		DMSG_DBG(0x001, dmsg_dbg_shell)

/*
 * DBG_SHELL carries only the base header; there are no extended
 * header fields.
 */
struct dmsg_dbg_shell {
	dmsg_hdr_t	head;
};
typedef struct dmsg_dbg_shell dmsg_dbg_shell_t;

/*
 * Hammer2 layer ops (low-level chain manipulation used by cluster code)
 *
 * HM2_OPENPFS	- Attach a PFS
 * HM2_FLUSHPFS	- Flush a PFS
 *
 * HM2_LOOKUP	- Lookup chain (parent-relative transaction)
 *		  (can request multiple chains)
 * HM2_NEXT	- Lookup next chain (parent-relative transaction)
 *		  (can request multiple chains)
 * HM2_LOCK	- [Re]lock a chain (chain-relative) (non-recursive)
 * HM2_UNLOCK	- Unlock a chain (chain-relative) (non-recursive)
 * HM2_RESIZE	- Resize a chain (chain-relative)
 * HM2_MODIFY	- Modify a chain (chain-relative)
 * HM2_CREATE	- Create a chain (parent-relative)
 * HM2_DUPLICATE- Duplicate a chain (target-parent-relative)
 * HM2_DELDUP	- Delete-Duplicate a chain (chain-relative)
 * HM2_DELETE	- Delete a chain (chain-relative)
 * HM2_SNAPSHOT	- Create a snapshot (snapshot-root-relative, w/clid override)
 *
 * NOTE(review): only HM2_OPENPFS has a command define here, and
 *		 struct dmsg_hm2_openpfs is not declared in the visible part
 *		 of this header; the macro only compiles where that
 *		 structure is in scope.
 */
#define DMSG_HM2_OPENPFS	DMSG_HM2(0x001, dmsg_hm2_openpfs)

/*
 * DMSG_PROTO_BLK Protocol
 *
 * BLK_OPEN	- Open device. This transaction must be left open for the
 *		  duration and the returned keyid passed in all associated
 *		  BLK commands. Multiple OPENs can be issued within the
 *		  transaction.
 *
 * BLK_CLOSE	- Close device. This can be used to close one of the opens
 *		  within a BLK_OPEN transaction. It may NOT initiate a
 *		  transaction. Note that a termination of the transaction
 *		  (e.g. with LNK_ERROR or BLK_ERROR) closes all active OPENs
 *		  for that transaction. XXX not well defined atm.
 *
 * BLK_READ	- Strategy read. Not typically streaming.
 *
 * BLK_WRITE	- Strategy write. Not typically streaming.
 *
 * BLK_FLUSH	- Strategy flush. Not typically streaming.
 *
 * BLK_FREEBLKS	- Strategy freeblks.
 *		  Not typically streaming.
 *
 * NOTE: BLK_CLOSE is sized from struct dmsg_blk_open; no separate close
 *	 structure is declared below.
 */
#define DMSG_BLK_OPEN		DMSG_BLK(0x001, dmsg_blk_open)
#define DMSG_BLK_CLOSE		DMSG_BLK(0x002, dmsg_blk_open)
#define DMSG_BLK_READ		DMSG_BLK(0x003, dmsg_blk_read)
#define DMSG_BLK_WRITE		DMSG_BLK(0x004, dmsg_blk_write)
#define DMSG_BLK_FLUSH		DMSG_BLK(0x005, dmsg_blk_flush)
#define DMSG_BLK_FREEBLKS	DMSG_BLK(0x006, dmsg_blk_freeblks)
#define DMSG_BLK_ERROR		DMSG_BLK(0xFFF, dmsg_blk_error)

struct dmsg_blk_open {
	dmsg_hdr_t	head;
	uint32_t	modes;		/* DMSG_BLKOPEN_xxx */
	uint32_t	reserved01;
};

#define DMSG_BLKOPEN_RD		0x0001
#define DMSG_BLKOPEN_WR		0x0002

/*
 * DMSG_LNK_ERROR is returned for simple results,
 * DMSG_BLK_ERROR is returned for extended results.
 */
struct dmsg_blk_error {
	dmsg_hdr_t	head;
	uint64_t	keyid;
	uint32_t	resid;
	uint32_t	reserved02;
	char		buf[64];
};

/*
 * The read, write, flush and freeblks requests below share an identical
 * field layout: keyid, offset, bytes, flags and two reserved words.
 */
struct dmsg_blk_read {
	dmsg_hdr_t	head;
	uint64_t	keyid;
	uint64_t	offset;
	uint32_t	bytes;
	uint32_t	flags;
	uint32_t	reserved01;
	uint32_t	reserved02;
};

struct dmsg_blk_write {
	dmsg_hdr_t	head;
	uint64_t	keyid;
	uint64_t	offset;
	uint32_t	bytes;
	uint32_t	flags;
	uint32_t	reserved01;
	uint32_t	reserved02;
};

struct dmsg_blk_flush {
	dmsg_hdr_t	head;
	uint64_t	keyid;
	uint64_t	offset;
	uint32_t	bytes;
	uint32_t	flags;
	uint32_t	reserved01;
	uint32_t	reserved02;
};

struct dmsg_blk_freeblks {
	dmsg_hdr_t	head;
	uint64_t	keyid;
	uint64_t	offset;
	uint32_t	bytes;
	uint32_t	flags;
	uint32_t	reserved01;
	uint32_t	reserved02;
};

typedef struct dmsg_blk_open		dmsg_blk_open_t;
typedef struct dmsg_blk_read		dmsg_blk_read_t;
typedef struct dmsg_blk_write		dmsg_blk_write_t;
typedef struct dmsg_blk_flush		dmsg_blk_flush_t;
typedef struct dmsg_blk_freeblks	dmsg_blk_freeblks_t;
typedef struct dmsg_blk_error		dmsg_blk_error_t;

/*
 * NOTE!!!!
 * ALL EXTENDED HEADER STRUCTURES MUST BE 64-BYTE ALIGNED!!!
 *
 * General message errors
 *
 *	0x00 - 0x1F	Local iocom errors
 *	0x20 - 0x2F	Global errors
 */
#define DMSG_ERR_NOSUPP		0x20
#define DMSG_ERR_LOSTLINK	0x21
#define DMSG_ERR_IO		0x22	/* generic */
#define DMSG_ERR_PARAM		0x23	/* generic */
#define DMSG_ERR_CANTCIRC	0x24	/* (typically means lost span) */

/*
 * Overlay of the message structures; buf[] sizes the union to
 * DMSG_HDR_MAX bytes.
 */
union dmsg_any {
	char			buf[DMSG_HDR_MAX];
	dmsg_hdr_t		head;

	dmsg_lnk_conn_t		lnk_conn;
	dmsg_lnk_span_t		lnk_span;

	dmsg_blk_open_t		blk_open;
	dmsg_blk_error_t	blk_error;
	dmsg_blk_read_t		blk_read;
	dmsg_blk_write_t	blk_write;
	dmsg_blk_flush_t	blk_flush;
	dmsg_blk_freeblks_t	blk_freeblks;
};

typedef union dmsg_any dmsg_any_t;

/*
 * Kernel iocom structures and prototypes for kern/kern_dmsg.c
 */
#if defined(_KERNEL) || defined(_KERNEL_STRUCTURES)

/* Forward declarations; only pointers to these are used below. */
struct hammer2_mount;
struct xa_softc;
struct kdmsg_iocom;
struct kdmsg_state;
struct kdmsg_msg;
struct kdmsg_data;

/*
 * msg_ctl flags (atomic)
 */
#define KDMSG_CLUSTERCTL_UNUSED01	0x00000001
#define KDMSG_CLUSTERCTL_KILLRX		0x00000002 /* staged helper exit */
#define KDMSG_CLUSTERCTL_KILLTX		0x00000004 /* staged helper exit */
#define KDMSG_CLUSTERCTL_SLEEPING	0x00000008 /* interlocked w/msglk */

/*
 * Transactional state structure, representing an open transaction. The
 * transaction might represent a cache state (and thus have a chain
 * association), or a VOP op, LNK_SPAN, or other things.
 *
 * NOTE: A non-empty subq represents one ref.
 *	 If we are inserted on a parent's subq, that's one ref (SUBINSERTED).
 *	 If we are inserted on a RB tree, that's one ref (RBINSERTED).
 *	 msg->state represents a ref.
 *	 Other code references may hold refs.
 *
 * NOTE: The parent association stays intact as long as a state has a
 *	 non-empty subq.
 *	 Otherwise simulated failures might not be able
 *	 to reach the children.
 */
TAILQ_HEAD(kdmsg_state_list, kdmsg_state);

struct kdmsg_state {
	RB_ENTRY(kdmsg_state) rbnode;		/* indexed by msgid */
	struct kdmsg_state	*scan;		/* scan check */
	struct kdmsg_state_list	subq;		/* active stacked states */
	TAILQ_ENTRY(kdmsg_state) entry;		/* on parent subq */
	TAILQ_ENTRY(kdmsg_state) user_entry;	/* available to devices */
	struct kdmsg_iocom *iocom;
	struct kdmsg_state *parent;
	int		refs;			/* refs */
	uint32_t	icmd;			/* record cmd creating state */
	uint32_t	txcmd;			/* mostly for CMDF flags */
	uint32_t	rxcmd;			/* mostly for CMDF flags */
	uint64_t	msgid;			/* {parent,msgid} uniq */
	int		flags;			/* KDMSG_STATE_xxx */
	int		error;
	void		*chain;			/* (caller's state) */
	int (*func)(struct kdmsg_state *, struct kdmsg_msg *);
	/* Per-state pointer, overlaid as untyped or subsystem-typed. */
	union {
		void *any;
		struct hammer2_mount *hmp;
		struct xa_softc *xa_sc;
	} any;
};

/*
 * kdmsg_state flags.  Bits 0x0100 and 0x0200 are not assigned here.
 */
#define KDMSG_STATE_SUBINSERTED	0x0001
#define KDMSG_STATE_DYNAMIC	0x0002
#define KDMSG_STATE_UNUSED0004	0x0004
#define KDMSG_STATE_ABORTING	0x0008	/* avoids recursive abort */
#define KDMSG_STATE_OPPOSITE	0x0010	/* opposite direction */
#define KDMSG_STATE_DYING	0x0020	/* atomic recursive circ fail */
#define KDMSG_STATE_INTERLOCK	0x0040
#define KDMSG_STATE_RBINSERTED	0x0080
#define KDMSG_STATE_SIGNAL	0x0400
#define KDMSG_STATE_NEW		0x0800	/* defer abort processing */

/*
 * A single message: queue linkage, owning state, header and aux data
 * sizes, and the message header itself in (any).
 */
struct kdmsg_msg {
	TAILQ_ENTRY(kdmsg_msg) qentry;		/* serialized queue */
	struct kdmsg_state *state;
	size_t		hdr_size;
	size_t		aux_size;
	char		*aux_data;
	uint32_t	flags;			/* KDMSG_FLAG_xxx */
	uint32_t	tcmd;			/* outer transaction cmd */
	dmsg_any_t	any;			/* variable sized */
};

/*
 * Aux data buffer description (pointer, size, owning iocom); see
 * kdmsg_detach_aux_data() and kdmsg_free_aux_data() below.
 */
struct kdmsg_data {
	char		*aux_data;
	size_t		aux_size;
	struct kdmsg_iocom *iocom;
};

#define KDMSG_FLAG_AUXALLOC	0x0001

/*
 * NOTE(review): struct kdmsg_link is not defined in the visible part of
 *		 this header; the typedef below only names an incomplete
 *		 type.
 */
typedef struct kdmsg_link kdmsg_link_t;
typedef struct kdmsg_state kdmsg_state_t;
typedef struct kdmsg_msg kdmsg_msg_t;
typedef struct kdmsg_data kdmsg_data_t;

/*
 * Red-black tree of kdmsg_state, linked through rbnode and ordered by
 * kdmsg_state_cmp().
 */
struct kdmsg_state_tree;
int kdmsg_state_cmp(kdmsg_state_t *state1, kdmsg_state_t *state2);
RB_HEAD(kdmsg_state_tree, kdmsg_state);
RB_PROTOTYPE(kdmsg_state_tree, kdmsg_state, rbnode, kdmsg_state_cmp);

/*
 * Structure embedded in e.g. mount, master control structure for
 * DMSG stream handling.
 */
struct kdmsg_iocom {
	struct malloc_type	*mmsg;
	struct file		*msg_fp;	/* cluster pipe->userland */
	thread_t		msgrd_td;	/* cluster thread */
	thread_t		msgwr_td;	/* cluster thread */
	int			msg_ctl;	/* wakeup flags */
	int			msg_seq;	/* cluster msg sequence id */
	uint32_t		flags;		/* KDMSG_IOCOMF_xxx */
	struct lock		msglk;		/* lockmgr lock */
	TAILQ_HEAD(, kdmsg_msg)	msgq;		/* transmit queue */
	void			*handle;
	void			(*auto_callback)(kdmsg_msg_t *);
	int			(*rcvmsg)(kdmsg_msg_t *);
	void			(*exit_func)(struct kdmsg_iocom *);
	struct kdmsg_state	state0;		/* root state for stacking */
	struct kdmsg_state	*conn_state;	/* active LNK_CONN state */
	struct kdmsg_state	*freerd_state;	/* allocation cache */
	struct kdmsg_state	*freewr_state;	/* allocation cache */
	struct kdmsg_state_tree	staterd_tree;	/* active messages */
	struct kdmsg_state_tree	statewr_tree;	/* active messages */
	dmsg_lnk_conn_t		auto_lnk_conn;
	dmsg_lnk_span_t		auto_lnk_span;
};

typedef struct kdmsg_iocom	kdmsg_iocom_t;

/*
 * kdmsg_iocom flags.  Bit 0x0004 is not assigned here.
 */
#define KDMSG_IOCOMF_AUTOCONN	0x0001	/* handle RX/TX LNK_CONN */
#define KDMSG_IOCOMF_AUTORXSPAN	0x0002	/* handle RX LNK_SPAN */
#define KDMSG_IOCOMF_AUTOTXSPAN	0x0008	/* handle TX LNK_SPAN */
#define KDMSG_IOCOMF_EXITNOACC	0x8000	/* cannot accept writes */

#define KDMSG_IOCOMF_AUTOANY	(KDMSG_IOCOMF_AUTOCONN |	\
				 KDMSG_IOCOMF_AUTORXSPAN |	\
				 KDMSG_IOCOMF_AUTOTXSPAN)

/*
 * CRC helpers.  NOTE(review): going by the signature, kdmsg_icrc32c()
 * presumably continues a running crc passed in as its third argument -
 * confirm against the implementation.
 */
uint32_t kdmsg_icrc32(const void *buf, size_t size);
uint32_t kdmsg_icrc32c(const void *buf, size_t size, uint32_t crc);

/*
 * kern_dmsg.c
 */
void kdmsg_iocom_init(kdmsg_iocom_t *iocom, void *handle, u_int32_t flags,
			struct malloc_type *mmsg,
			int (*rcvmsg)(kdmsg_msg_t *msg));
void kdmsg_iocom_reconnect(kdmsg_iocom_t *iocom, struct file *fp,
			const char *subsysname);
void kdmsg_iocom_autoinitiate(kdmsg_iocom_t *iocom,
			void (*conn_callback)(kdmsg_msg_t *msg));
void kdmsg_iocom_uninit(kdmsg_iocom_t *iocom);
void kdmsg_drain_msgq(kdmsg_iocom_t *iocom);

void kdmsg_msg_free(kdmsg_msg_t *msg);
kdmsg_msg_t *kdmsg_msg_alloc(kdmsg_state_t *state, uint32_t cmd,
			int (*func)(kdmsg_state_t *, kdmsg_msg_t *),
			void *data);
void kdmsg_msg_write(kdmsg_msg_t *msg);
void kdmsg_msg_reply(kdmsg_msg_t *msg, uint32_t error);
void kdmsg_msg_result(kdmsg_msg_t *msg, uint32_t error);
void kdmsg_state_reply(kdmsg_state_t *state, uint32_t error);
void kdmsg_state_result(kdmsg_state_t *state, uint32_t error);
void kdmsg_detach_aux_data(kdmsg_msg_t *msg, kdmsg_data_t *data);
void kdmsg_free_aux_data(kdmsg_data_t *data);

#endif	/* _KERNEL || _KERNEL_STRUCTURES */

#endif	/* !_SYS_DMSG_H_ */