1 /* 2 * Copyright (c) 2011-2014 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@dragonflybsd.org> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 
33 */ 34 35 #ifndef _SYS_DMSG_H_ 36 #define _SYS_DMSG_H_ 37 38 #ifndef _SYS_MALLOC_H_ 39 #include <sys/malloc.h> 40 #endif 41 #ifndef _SYS_TREE_H_ 42 #include <sys/tree.h> 43 #endif 44 #ifndef _SYS_THREAD_H_ 45 #include <sys/thread.h> 46 #endif 47 #ifndef _SYS_UUID_H_ 48 #include <sys/uuid.h> 49 #endif 50 51 /* 52 * Mesh network protocol structures. 53 * 54 * CONN PROTOCOL 55 * 56 * The mesh is constructed via point-to-point streaming links with varying 57 * levels of interconnectedness, forming a graph. Leafs of the graph are 58 * typically kernel devices (xdisk) or VFSs (HAMMER2). Internal nodes are 59 * usually (user level) hammer2 service demons. 60 * 61 * Upon connecting and after authentication, a LNK_CONN transaction is opened 62 * to configure the link. The SPAN protocol is then typically run over the 63 * open LNK_CONN transaction. 64 * 65 * Terminating the LNK_CONN transaction terminates everything running over it 66 * (typically open LNK_SPAN transactions), which in turn terminates everything 67 * running over the LNK_SPANs. 68 * 69 * SPAN PROTOCOL 70 * 71 * The SPAN protocol runs over an open LNK_CONN transaction and is used to 72 * advertise any number of services. For example, each PFS under a HAMMER2 73 * mount will be advertised as an open LNK_SPAN transaction. 74 * 75 * Any network node on the graph running multiple connections is capable 76 * of relaying LNK_SPANs from any connection to any other connection. This 77 * is typically done by the user-level hammer2 service demon, and typically 78 * not done by kernel devices or VFSs (though these entities must be able 79 * to manage multiple LNK_SPANs since they might advertise or need to talk 80 * to multiple services). 81 * 82 * Relaying is not necessarily trivial as it requires internal nodes to 83 * track two open transactions (on the two iocom interfaces) and translate 84 * the msgid and circuit. 
In addition, the relay may have to track multiple
 * SPANs from the same iocom or from multiple iocoms which represent the same
 * end-point and must select the best end-point, must send notifications when
 * a better path is available, and must allow (when connectivity is still
 * present) any existing, open, stacked sub-transactions to complete before
 * terminating the less efficient SPAN.
 *
 * Relaying is optional.  It is perfectly acceptable for the hammer2 service
 * to plug a received socket descriptor directly into the appropriate kernel
 * device driver.
 *
 *			STACKED TRANSACTIONS
 *
 * Message transactions can be stacked.  That is, you can initiate a DMSG
 * transaction relative to another open transaction.  Sub-transactions can
 * be initiated without waiting for the parent transaction to complete its
 * handshake.
 *
 * This is done by entering the open transaction's msgid as the circuit field
 * in the new transaction (typically by populating msg->parent).  The
 * transaction tracking structure will be referenced and will track the
 * sub-transaction.  Note that msgids must still be unique on an
 * iocom-by-iocom basis.
 *
 *			MESSAGE TRANSACTIONAL STATES
 *
 * Message transactions are handled by the CREATE, DELETE, REPLY, ABORT, and
 * CREPLY flags.  Message state is typically recorded at the end points and
 * will be maintained (preventing reuse of the transaction id) until a DELETE
 * is both sent and received.
 *
 * One-way messages such as those used for debug commands are not recorded
 * and do not require any transactional state.  These are sent without
 * the CREATE, DELETE, or ABORT flags set.  ABORT is not supported for
 * one-off messages.  The REPLY bit can be used to distinguish between
 * command and status if desired.
 *
 * Transactional messages are messages which require a reply to be
 * returned.
These messages can also consist of multiple message elements
 * for the command or reply or both (or neither).  The command message
 * sequence sets CREATE on the first message and DELETE on the last message.
 * A single message command sets both (CREATE|DELETE).  The reply message
 * sequence works the same way but of course also sets the REPLY bit.
 *
 * Transactional messages can be aborted by sending a message element
 * with the ABORT flag set.  This flag can be combined with either or both
 * the CREATE and DELETE flags.  When combined with the CREATE flag the
 * command is treated as non-blocking but still executes.  When combined
 * with the DELETE flag no additional message elements are required.
 *
 * Transactions are terminated by sending a message with DELETE set.
 * Transactions must be CREATEd and DELETEd in both directions.  If a
 * transaction is governing stacked sub-transactions the sub-transactions
 * are automatically terminated before the governing transaction is terminated.
 * Terminates are handled by simulating a received DELETE and expecting the
 * normal function callback and state machine to (ultimately) issue a
 * terminating (DELETE) response.
 *
 * Transactions can operate in full-duplex as both sides are fully open
 * (i.e. CREATE sent, CREATE|REPLY returned, DELETE not sent by anyone).
 * Additional commands can be initiated from either side of the transaction.
 *
 * ABORT SPECIAL CASE - Mid-stream aborts.  A mid-stream abort can be sent
 * when supported by the sender by sending an ABORT message with neither
 * CREATE nor DELETE set.  This effectively turns the message into a
 * non-blocking message (but depending on what is being represented can also
 * cut short prior data elements in the stream).
 *
 * ABORT SPECIAL CASE - Abort-after-DELETE.  Transactional messages have to be
 * abortable if the stream/pipe/whatever is lost.
In this situation any
 * forwarding relay needs to unconditionally abort commands and replies that
 * are still active.  This is done by sending an ABORT|DELETE even in
 * situations where a DELETE has already been sent in that direction.  This
 * is done, for example, when links are in a half-closed state.  In this
 * situation it is possible for the abort request to race a transition to the
 * fully closed state.  ABORT|DELETE messages which race the fully closed
 * state are expected to be discarded by the other end.
 *
 * --
 *
 * All base and extended message headers are 64-byte aligned, and all
 * transports must support extended message headers up to DMSG_HDR_MAX.
 * Currently we allow extended message headers up to 2048 bytes.  Note
 * that the extended header size is encoded in the 'cmd' field of the header.
 *
 * Any in-band data is padded to a 64-byte alignment and placed directly
 * after the extended header (after the higher-level cmd/rep structure).
 * The actual unaligned size of the in-band data is encoded in the aux_bytes
 * field in this case.  Maximum data sizes are negotiated during registration.
 *
 * Auxiliary data can be in-band or out-of-band.  In-band data sets aux_descr
 * equal to 0.  Any out-of-band data must be negotiated by the SPAN protocol.
 *
 * Auxiliary data, whether in-band or out-of-band, must be at least 64-byte
 * aligned.  The aux_bytes field contains the actual byte-granular length
 * and not the aligned length.  The crc is against the aligned length (so
 * a faster crc algorithm can be used, theoretically).
 *
 * hdr_crc is calculated over the entire, ALIGNED extended header.  For
 * the purposes of calculating the crc, the hdr_crc field is 0.
That is,
 * if calculating the crc in HW a 32-bit '0' must be inserted in place of
 * the hdr_crc field when reading the entire header and compared at the
 * end (but the actual hdr_crc must be left intact in memory).  A simple
 * counter to replace the field going into the CRC generator does the job
 * in HW.  The CRC endian is based on the magic number field and may have
 * to be byte-swapped, too (which is also easy to do in HW).
 *
 * aux_crc is calculated over the entire, ALIGNED auxiliary data.
 *
 *			SHARED MEMORY IMPLEMENTATIONS
 *
 * Shared-memory implementations typically use a pipe to transmit the extended
 * message header and shared memory to store any auxiliary data.  Auxiliary
 * data in one-way (non-transactional) messages is typically required to be
 * inline.  CRCs are still recommended and required at the beginning, but
 * may be negotiated away later.
 */

/*
 * Base message header.  Every extended header structure in this file
 * embeds it as its first member ('head').
 *
 * dmsg_hdr must be 64 bytes.  The leading hex number in each field comment
 * is that field's byte offset within the header.
 */
struct dmsg_hdr {
	uint16_t	magic;		/* 00 sanity, synchro, endian	*/
	uint16_t	reserved02;	/* 02				*/
	uint32_t	salt;		/* 04 random salt helps w/crypto */

	uint64_t	msgid;		/* 08 message transaction id	*/
	uint64_t	circuit;	/* 10 circuit id or 0		*/
	uint64_t	reserved18;	/* 18				*/

	uint32_t	cmd;		/* 20 flags | cmd | hdr_size / ALIGN */
	uint32_t	aux_crc;	/* 24 auxiliary data crc	*/
	uint32_t	aux_bytes;	/* 28 auxiliary data length (bytes) */
	uint32_t	error;		/* 2C error code or 0		*/
	uint64_t	aux_descr;	/* 30 negotiated OOB data descr	*/
	uint32_t	reserved38;	/* 38				*/
	uint32_t	hdr_crc;	/* 3C (aligned) extended header crc */
};

typedef struct dmsg_hdr dmsg_hdr_t;

/*
 * DMSG_HDR_MAGIC_REV is DMSG_HDR_MAGIC with its two bytes swapped.
 *
 * DMSG_HDR_CRCOFF is the byte offset of the salt field within dmsg_hdr_t
 * and DMSG_HDR_CRCBYTES is the number of bytes from that offset to the end
 * of the base header.
 */
#define DMSG_HDR_MAGIC		0x4832
#define DMSG_HDR_MAGIC_REV	0x3248
#define DMSG_HDR_CRCOFF		offsetof(dmsg_hdr_t, salt)
#define DMSG_HDR_CRCBYTES	(sizeof(dmsg_hdr_t) - DMSG_HDR_CRCOFF)

/*
 * Administrative protocol limits.
 */
#define DMSG_HDR_MAX		2048	/* <= 65535 */
#define DMSG_AUX_MAX		65536	/* <= 1MB */
#define DMSG_BUF_SIZE		(DMSG_HDR_MAX * 4)
/* Usable as a bit mask only while DMSG_BUF_SIZE is a power of two. */
#define DMSG_BUF_MASK		(DMSG_BUF_SIZE - 1)

/*
 * The message (cmd) field also encodes various flags and the total size
 * of the message header.  This allows the protocol processors to validate
 * persistency and structural settings for every command simply by
 * switch()ing on the (cmd) field.
 *
 * Bit layout of the 32-bit cmd field, as given by the masks below:
 *
 *	31-24	flags		(DMSGF_FLAGS)
 *	23-20	protocol	(DMSGF_PROTOS)
 *	19-8	command		(DMSGF_CMDS)
 *	 7-0	header size	(DMSGF_SIZE)
 */
#define DMSGF_CREATE		0x80000000U	/* msg start */
#define DMSGF_DELETE		0x40000000U	/* msg end */
#define DMSGF_REPLY		0x20000000U	/* reply path */
#define DMSGF_ABORT		0x10000000U	/* abort req */
#define DMSGF_REVTRANS		0x08000000U	/* opposite direction msgid */
#define DMSGF_REVCIRC		0x04000000U	/* opposite direction circuit */
#define DMSGF_FLAG1		0x02000000U
#define DMSGF_FLAG0		0x01000000U

#define DMSGF_FLAGS		0xFF000000U	/* all flags */
#define DMSGF_PROTOS		0x00F00000U	/* all protos */
#define DMSGF_CMDS		0x000FFF00U	/* all cmds */
#define DMSGF_SIZE		0x000000FFU	/* hdr size, DMSG_ALIGN units */

/*
 * XXX Future, flag that an in-line (not part of a CREATE/DELETE) command
 * expects some sort of acknowledgement.  Allows protocol mismatches to
 * be detected.
 */
/* NOTE: this bit lies inside the DMSGF_CMDS field (its top bit). */
#define DMSGF_CMDF_EXPECT_ACK	0x00080000U	/* in-line command expects ack */

/* Command, header size and protocol bits plus the REPLY flag. */
#define DMSGF_CMDSWMASK		(DMSGF_CMDS |	\
					 DMSGF_SIZE |	\
					 DMSGF_PROTOS |	\
					 DMSGF_REPLY)

/* Command, header size and protocol bits with every flag excluded. */
#define DMSGF_BASECMDMASK	(DMSGF_CMDS |	\
					 DMSGF_SIZE |	\
					 DMSGF_PROTOS)

/* DMSGF_CMDSWMASK plus the CREATE and DELETE flags. */
#define DMSGF_TRANSMASK		(DMSGF_CMDS |	\
					 DMSGF_SIZE |	\
					 DMSGF_PROTOS |	\
					 DMSGF_REPLY |	\
					 DMSGF_CREATE |	\
					 DMSGF_DELETE)

#define DMSGF_BASEFLAGS		(DMSGF_CREATE | DMSGF_DELETE | DMSGF_REPLY)

/* Protocol ids; each value sits within the DMSGF_PROTOS bits. */
#define DMSG_PROTO_LNK		0x00000000U
#define DMSG_PROTO_DBG		0x00100000U
#define DMSG_PROTO_HM2		0x00200000U
#define DMSG_PROTO_XX3		0x00300000U
#define DMSG_PROTO_XX4		0x00400000U
#define DMSG_PROTO_BLK		0x00500000U
#define DMSG_PROTO_VOP		0x00600000U

/*
 * Message command constructors, sans flags
 *
 * DMSG_DOALIGN() rounds a byte count up to the next multiple of DMSG_ALIGN.
 *
 * DMSG_HDR_ENCODE() takes a struct tag (without the 'struct' keyword) and
 * yields sizeof(struct elm), rounded up, in units of DMSG_ALIGN bytes.
 *
 * DMSG_LNK(), DMSG_DBG(), DMSG_HM2(), DMSG_BLK() and DMSG_VOP() each OR
 * together a protocol id, the command number shifted left by 8 bits, and
 * the encoded header size in the low bits.
 */
#define DMSG_ALIGN		64
#define DMSG_ALIGNMASK		(DMSG_ALIGN - 1)
#define DMSG_DOALIGN(bytes)	(((bytes) + DMSG_ALIGNMASK) &		\
				 ~DMSG_ALIGNMASK)

#define DMSG_HDR_ENCODE(elm)	(((uint32_t)sizeof(struct elm) +	\
				  DMSG_ALIGNMASK) /			\
				 DMSG_ALIGN)

#define DMSG_LNK(cmd, elm)	(DMSG_PROTO_LNK |			\
					 ((cmd) << 8) |			\
					 DMSG_HDR_ENCODE(elm))

#define DMSG_DBG(cmd, elm)	(DMSG_PROTO_DBG |			\
					 ((cmd) << 8) |			\
					 DMSG_HDR_ENCODE(elm))

#define DMSG_HM2(cmd, elm)	(DMSG_PROTO_HM2 |			\
					 ((cmd) << 8) |			\
					 DMSG_HDR_ENCODE(elm))

#define DMSG_BLK(cmd, elm)	(DMSG_PROTO_BLK |			\
					 ((cmd) << 8) |			\
					 DMSG_HDR_ENCODE(elm))

#define DMSG_VOP(cmd, elm)	(DMSG_PROTO_VOP |			\
					 ((cmd) << 8) |			\
					 DMSG_HDR_ENCODE(elm))

/*
 * Link layer ops basically talk to just the other side of a direct
 * connection.
 *
 * LNK_PAD	- One-way message on circuit 0, ignored by target.  Used to
 *		  pad message buffers on shared-memory transports.  Not
 *		  typically used with TCP.
 *
 * LNK_PING	- One-way message on circuit-0, keep-alive, run by both sides
 *		  typically 1/sec on idle link, link is lost after 10 seconds
 *		  of inactivity.
 *
 * LNK_AUTH	- Authenticate the connection, negotiate administrative
 *		  rights & encryption, protocol class, etc.  Only PAD and
 *		  AUTH messages (not even PING) are accepted until
 *		  authentication is complete.  This message also identifies
 *		  the host.
 *
 * LNK_CONN	- Enable the SPAN protocol on circuit-0, possibly also
 *		  installing a PFS filter (by cluster id, unique id, and/or
 *		  wildcarded name).
 *
 * LNK_SPAN	- A SPAN transaction typically on iocom->state0 enables
 *		  messages to be relayed to/from a particular cluster node.
 *		  SPANs are received, sorted, aggregated, filtered, and
 *		  retransmitted back out across all applicable connections.
 *
 *		  The leaf protocol also uses this to make a PFS available
 *		  to the cluster (e.g. on-mount).
 *
 * PAD, PING and ERROR carry only the base header (dmsg_hdr); AUTH, CONN
 * and SPAN use their own extended header structures.
 */
#define DMSG_LNK_PAD		DMSG_LNK(0x000, dmsg_hdr)
#define DMSG_LNK_PING		DMSG_LNK(0x001, dmsg_hdr)
#define DMSG_LNK_AUTH		DMSG_LNK(0x010, dmsg_lnk_auth)
#define DMSG_LNK_CONN		DMSG_LNK(0x011, dmsg_lnk_conn)
#define DMSG_LNK_SPAN		DMSG_LNK(0x012, dmsg_lnk_span)
#define DMSG_LNK_ERROR		DMSG_LNK(0xFFF, dmsg_hdr)

/*
 * Reserved command codes for third party subsystems.  Structure size is
 * not known here so do not try to construct the full DMSG_LNK_ define.
 */
#define DMSG_LNK_CMD_HAMMER2_VOLCONF	0x20

#define DMSG_LABEL_SIZE		128	/* fixed at 128, do not change */

/*
 * LNK_AUTH - Authentication (often omitted)
 *
 * Base header followed by a 64-byte placeholder member; no authentication
 * fields are defined in this structure.
 */
struct dmsg_lnk_auth {
	dmsg_hdr_t	head;
	char		dummy[64];
};

/*
 * LNK_CONN - Register connection info for SPAN protocol
 *	      (transaction, left open, iocom->state0 only).
 *
 * LNK_CONN identifies a streaming connection into the cluster and serves
 * to identify, enable, and specify filters for the SPAN protocol.
 *
 * peer_mask serves to filter the SPANs we receive by peer_type.  A cluster
 * controller typically sets this to (uint64_t)-1, indicating that it wants
 * everything.  A block devfs interface might set it to 1 << DMSG_PEER_BLOCK,
 * and a hammer2 mount might set it to 1 << DMSG_PEER_HAMMER2.
 *
 * mediaid allows multiple (e.g. HAMMER2) connections belonging to the same
 * media to transmit duplicative LNK_VOLCONF updates without causing
 * confusion in the cluster controller.
 *
 * pfs_clid, pfs_fsid, pfs_type, and label are peer-specific and must be
 * left empty (zero-fill) if not supported by a particular peer.
 *
 * DMSG_PEER_CLUSTER		filter: none
 * DMSG_PEER_BLOCK		filter: label
 * DMSG_PEER_HAMMER2		filter: pfs_clid if not empty, and label
 */
struct dmsg_lnk_conn {
	dmsg_hdr_t	head;
	uuid_t		mediaid;	/* media configuration id */
	uuid_t		pfs_clid;	/* rendezvous pfs uuid */
	uuid_t		pfs_fsid;	/* unique pfs uuid */
	uint64_t	peer_mask;	/* PEER mask for SPAN filtering */
	uint8_t		peer_type;	/* see DMSG_PEER_xxx */
	uint8_t		pfs_type;	/* pfs type */
	uint16_t	proto_version;	/* high level protocol support */
	uint32_t	status;		/* status flags */
	uint32_t	rnss;		/* node's generated rnss */
	uint8_t		reserved02[8];
	uint32_t	reserved03[12];
	uint64_t	pfs_mask;	/* PFS mask for SPAN filtering */
	char		cl_label[DMSG_LABEL_SIZE]; /* cluster label */
	char		fs_label[DMSG_LABEL_SIZE]; /* PFS label */
};

typedef struct dmsg_lnk_conn dmsg_lnk_conn_t;

/*
 * PFSTYPEs 0-15 used by sys/dmsg.h 16-31 reserved by hammer2.
 */
#define DMSG_PFSTYPE_NONE	0
#define DMSG_PFSTYPE_ADMIN	1
#define DMSG_PFSTYPE_CLIENT	2
#define DMSG_PFSTYPE_SERVER	3
#define DMSG_PFSTYPE_MAX	32

/*
 * Peer types, carried in the peer_type field of LNK_CONN and LNK_SPAN.
 */
#define DMSG_PEER_NONE		0
#define DMSG_PEER_CLUSTER	1	/* a cluster controller */
#define DMSG_PEER_BLOCK		2	/* block devices */
#define DMSG_PEER_HAMMER2	3	/* hammer2-mounted volumes */

/*
 * Structures embedded in LNK_SPAN
 *
 * dmsg_media_block is one member of the 'media' union in dmsg_lnk_span.
 */
struct dmsg_media_block {
	uint64_t	bytes;		/* media size in bytes */
	uint32_t	blksize;	/* media block size */
};

typedef struct dmsg_media_block dmsg_media_block_t;

/*
 * LNK_SPAN - Initiate or relay a SPAN
 *	      (transaction, left open, typically only on iocom->state0)
 *
 * This message registers an end-point with the other end of the connection,
 * telling the other end who we are and what we can provide or intend to
 * consume.  Multiple registrations can be maintained as open transactions
 * with each one specifying a unique end-point.
 *
 * Registrations are sent from {source}=S {1...n} to {target}=0 and maintained
 * as open transactions.  Registrations are also received and maintained as
 * open transactions, creating a matrix of linkid's.
 *
 * While these transactions are open additional transactions can be executed
 * between any two linkid's {source}=S (registrations we sent) to {target}=T
 * (registrations we received).
 *
 * Closure of any registration transaction will automatically abort any open
 * transactions using the related linkids.  Closure can be initiated
 * voluntarily from either side with either end issuing a DELETE, or they
 * can be ABORTed.
 *
 * Status updates are performed via the open transaction.
 *
 * --
 *
 * A registration identifies a node and its various PFS parameters including
 * the PFS_TYPE.  For example, a diskless HAMMER2 client typically identifies
 * itself as PFSTYPE_CLIENT.
 *
 * Any node may serve as a cluster controller, aggregating and passing
 * on received registrations, but end-points do not have to implement this
 * ability.  Most end-points typically implement a single client-style or
 * server-style PFS_TYPE and rendezvous at a cluster controller.
 *
 * The cluster controller does not aggregate/pass-on all received
 * registrations.  It typically filters what gets passed on based on what it
 * receives, passing on only the best candidates.
 *
 * If a symmetric spanning tree is desired additional candidates whose
 * {dist, rnss} fields match the last best candidate must also be propagated.
 * This feature is not currently enabled.
 *
 * STATUS UPDATES: Status updates use the same structure but typically
 *		   only contain incremental changes to e.g. pfs_type, with
 *		   a text description sent as out-of-band data.
 */
struct dmsg_lnk_span {
	dmsg_hdr_t	head;
	uuid_t		pfs_clid;	/* rendezvous pfs uuid */
	uuid_t		pfs_fsid;	/* unique pfs id (differentiate node) */
	uint8_t		pfs_type;	/* PFS type */
	uint8_t		peer_type;	/* PEER type */
	uint16_t	proto_version;	/* high level protocol support */
	uint32_t	status;		/* status flags */
	uint8_t		reserved02[8];
	uint32_t	dist;		/* span distance */
	uint32_t	rnss;		/* random number sub-sort */
	/* Media description; reserved03 fixes the union's size at 56 bytes. */
	union {
		uint32_t	reserved03[14];
		dmsg_media_block_t block;
	} media;

	/*
	 * NOTE: for PEER_HAMMER2 cl_label is typically empty and fs_label
	 *	 is the superroot directory name.
	 *
	 *	 for PEER_BLOCK cl_label is typically host/device and
	 *	 fs_label is typically the serial number string.
	 */
	char		cl_label[DMSG_LABEL_SIZE]; /* cluster label */
	char		fs_label[DMSG_LABEL_SIZE]; /* PFS label */
};

typedef struct dmsg_lnk_span dmsg_lnk_span_t;

#define DMSG_SPAN_PROTO_1	1

/*
 * Debug layer ops operate on any link
 *
 * SHELL	- Persist stream, access the debug shell on the target
 *		  registration.  Multiple shells can be operational.
 */
#define DMSG_DBG_SHELL		DMSG_DBG(0x001, dmsg_dbg_shell)

/* DBG_SHELL extended header: the base header with no additional fields. */
struct dmsg_dbg_shell {
	dmsg_hdr_t	head;
};
typedef struct dmsg_dbg_shell dmsg_dbg_shell_t;

/*
 * Hammer2 layer ops (low-level chain manipulation used by cluster code)
 *
 * HM2_OPENPFS	- Attach a PFS
 * HM2_FLUSHPFS	- Flush a PFS
 *
 * HM2_LOOKUP	- Lookup chain (parent-relative transaction)
 *		  (can request multiple chains)
 * HM2_NEXT	- Lookup next chain (parent-relative transaction)
 *		  (can request multiple chains)
 * HM2_LOCK	- [Re]lock a chain (chain-relative) (non-recursive)
 * HM2_UNLOCK	- Unlock a chain (chain-relative) (non-recursive)
 * HM2_RESIZE	- Resize a chain (chain-relative)
 * HM2_MODIFY	- Modify a chain (chain-relative)
 * HM2_CREATE	- Create a chain (parent-relative)
 * HM2_DUPLICATE- Duplicate a chain (target-parent-relative)
 * HM2_DELDUP	- Delete-Duplicate a chain (chain-relative)
 * HM2_DELETE	- Delete a chain (chain-relative)
 * HM2_SNAPSHOT	- Create a snapshot (snapshot-root-relative, w/clid override)
 *
 * NOTE(review): struct dmsg_hm2_openpfs is not defined in this header, so
 * DMSG_HM2_OPENPFS only expands where the user supplies that structure.
 */
#define DMSG_HM2_OPENPFS	DMSG_HM2(0x001, dmsg_hm2_openpfs)

/*
 * DMSG_PROTO_BLK Protocol
 *
 * BLK_OPEN	- Open device.  This transaction must be left open for the
 *		  duration and the returned keyid passed in all associated
 *		  BLK commands.  Multiple OPENs can be issued within the
 *		  transaction.
 *
 * BLK_CLOSE	- Close device.  This can be used to close one of the opens
 *		  within a BLK_OPEN transaction.  It may NOT initiate a
 *		  transaction.
Note that a termination of the transaction
 *		  (e.g. with LNK_ERROR or BLK_ERROR) closes all active OPENs
 *		  for that transaction.  XXX not well defined atm.
 *
 * BLK_READ	- Strategy read.  Not typically streaming.
 *
 * BLK_WRITE	- Strategy write.  Not typically streaming.
 *
 * BLK_FLUSH	- Strategy flush.  Not typically streaming.
 *
 * BLK_FREEBLKS	- Strategy freeblks.  Not typically streaming.
 *
 * BLK_CLOSE is encoded with the dmsg_blk_open structure; no separate
 * close structure is defined.
 */
#define DMSG_BLK_OPEN		DMSG_BLK(0x001, dmsg_blk_open)
#define DMSG_BLK_CLOSE		DMSG_BLK(0x002, dmsg_blk_open)
#define DMSG_BLK_READ		DMSG_BLK(0x003, dmsg_blk_read)
#define DMSG_BLK_WRITE		DMSG_BLK(0x004, dmsg_blk_write)
#define DMSG_BLK_FLUSH		DMSG_BLK(0x005, dmsg_blk_flush)
#define DMSG_BLK_FREEBLKS	DMSG_BLK(0x006, dmsg_blk_freeblks)
#define DMSG_BLK_ERROR		DMSG_BLK(0xFFF, dmsg_blk_error)

/*
 * BLK_OPEN / BLK_CLOSE extended header.
 */
struct dmsg_blk_open {
	dmsg_hdr_t	head;
	uint32_t	modes;		/* presumably DMSG_BLKOPEN_xxx bits */
	uint32_t	reserved01;
};

#define DMSG_BLKOPEN_RD		0x0001
#define DMSG_BLKOPEN_WR		0x0002

/*
 * DMSG_LNK_ERROR is returned for simple results,
 * DMSG_BLK_ERROR is returned for extended results.
 */
struct dmsg_blk_error {
	dmsg_hdr_t	head;
	uint64_t	keyid;
	uint32_t	resid;
	uint32_t	reserved02;
	char		buf[64];
};

/*
 * BLK_READ, BLK_WRITE, BLK_FLUSH and BLK_FREEBLKS extended headers.  The
 * four structures below have identical members.
 */
struct dmsg_blk_read {
	dmsg_hdr_t	head;
	uint64_t	keyid;
	uint64_t	offset;
	uint32_t	bytes;
	uint32_t	flags;
	uint32_t	reserved01;
	uint32_t	reserved02;
};

struct dmsg_blk_write {
	dmsg_hdr_t	head;
	uint64_t	keyid;
	uint64_t	offset;
	uint32_t	bytes;
	uint32_t	flags;
	uint32_t	reserved01;
	uint32_t	reserved02;
};

struct dmsg_blk_flush {
	dmsg_hdr_t	head;
	uint64_t	keyid;
	uint64_t	offset;
	uint32_t	bytes;
	uint32_t	flags;
	uint32_t	reserved01;
	uint32_t	reserved02;
};

struct dmsg_blk_freeblks {
	dmsg_hdr_t	head;
	uint64_t	keyid;
	uint64_t	offset;
	uint32_t	bytes;
	uint32_t	flags;
	uint32_t	reserved01;
	uint32_t	reserved02;
};

typedef struct dmsg_blk_open		dmsg_blk_open_t;
typedef struct dmsg_blk_read		dmsg_blk_read_t;
typedef struct dmsg_blk_write		dmsg_blk_write_t;
typedef struct dmsg_blk_flush		dmsg_blk_flush_t;
typedef struct dmsg_blk_freeblks	dmsg_blk_freeblks_t;
typedef struct dmsg_blk_error		dmsg_blk_error_t;

/*
 * NOTE!!!! ALL EXTENDED HEADER STRUCTURES MUST BE 64-BYTE ALIGNED!!!
 *
 * General message errors
 *
 *	0x00 - 0x1F	Local iocom errors
 *	0x20 - 0x2F	Global errors
 */
#define DMSG_ERR_NOSUPP		0x20
#define DMSG_ERR_LOSTLINK	0x21
#define DMSG_ERR_IO		0x22	/* generic */
#define DMSG_ERR_PARAM		0x23	/* generic */
#define DMSG_ERR_CANTCIRC	0x24	/* (typically means lost span) */

/*
 * Overlay of the base header and the extended headers listed below.  The
 * buf member makes the union DMSG_HDR_MAX bytes in size.
 */
union dmsg_any {
	char			buf[DMSG_HDR_MAX];
	dmsg_hdr_t		head;

	dmsg_lnk_conn_t		lnk_conn;
	dmsg_lnk_span_t		lnk_span;

	dmsg_blk_open_t		blk_open;
	dmsg_blk_error_t	blk_error;
	dmsg_blk_read_t		blk_read;
	dmsg_blk_write_t	blk_write;
	dmsg_blk_flush_t	blk_flush;
	dmsg_blk_freeblks_t	blk_freeblks;
};

typedef union dmsg_any dmsg_any_t;

/*
 * Kernel iocom structures and prototypes for kern/kern_dmsg.c
 */
#if defined(_KERNEL) || defined(_KERNEL_STRUCTURES)

/* Forward declarations; only pointers to these are needed below. */
struct hammer2_mount;
struct xa_softc;
struct kdmsg_iocom;
struct kdmsg_state;
struct kdmsg_msg;

/*
 * msg_ctl flags (atomic)
 */
#define KDMSG_CLUSTERCTL_KILL		0x00000001
#define KDMSG_CLUSTERCTL_KILLRX		0x00000002 /* staged helper exit */
#define KDMSG_CLUSTERCTL_KILLTX		0x00000004 /* staged helper exit */
#define KDMSG_CLUSTERCTL_SLEEPING	0x00000008 /* interlocked w/msglk */

/*
 * Transactional state structure, representing an open transaction.  The
 * transaction might represent a cache state (and thus have a chain
 * association), or a VOP op, LNK_SPAN, or other things.
 */
TAILQ_HEAD(kdmsg_state_list, kdmsg_state);

struct kdmsg_state {
	RB_ENTRY(kdmsg_state) rbnode;		/* indexed by msgid */
	struct kdmsg_state_list	subq;		/* active stacked states */
	TAILQ_ENTRY(kdmsg_state) entry;		/* on parent subq */
	TAILQ_ENTRY(kdmsg_state) user_entry;	/* available to devices */
	struct kdmsg_iocom *iocom;
	struct kdmsg_state *parent;
	uint32_t	icmd;			/* record cmd creating state */
	uint32_t	txcmd;			/* mostly for CMDF flags */
	uint32_t	rxcmd;			/* mostly for CMDF flags */
	uint64_t	msgid;			/* {parent,msgid} uniq */
	int		flags;			/* presumably KDMSG_STATE_xxx */
	int		error;
	void		*chain;			/* (caller's state) */
	int (*func)(struct kdmsg_state *, struct kdmsg_msg *);
	/* Owner back-pointer; all three members alias the same storage. */
	union {
		void *any;
		struct hammer2_mount *hmp;
		struct xa_softc *xa_sc;
	} any;
};

#define KDMSG_STATE_INSERTED	0x0001
#define KDMSG_STATE_DYNAMIC	0x0002
#define KDMSG_STATE_DELPEND	0x0004		/* transmit delete pending */
#define KDMSG_STATE_ABORTING	0x0008		/* avoids recursive abort */
#define KDMSG_STATE_OPPOSITE	0x0010		/* opposite direction */
#define KDMSG_STATE_DYING	0x0020		/* indicates circuit failure */

/*
 * Kernel-side message: queue linkage and bookkeeping followed by the
 * message header itself in 'any'.
 */
struct kdmsg_msg {
	TAILQ_ENTRY(kdmsg_msg) qentry;		/* serialized queue */
	struct kdmsg_state *state;
	size_t		hdr_size;
	size_t		aux_size;
	char		*aux_data;
	uint32_t	flags;			/* presumably KDMSG_FLAG_xxx */
	uint32_t	tcmd;			/* outer transaction cmd */
	dmsg_any_t	any;			/* variable sized */
};

#define KDMSG_FLAG_AUXALLOC	0x0001

/* NOTE(review): struct kdmsg_link is not defined anywhere in this header. */
typedef struct kdmsg_link kdmsg_link_t;
typedef struct kdmsg_state kdmsg_state_t;
typedef struct kdmsg_msg kdmsg_msg_t;

/*
 * Red-black tree of kdmsg_state, linked through rbnode and ordered by
 * kdmsg_state_cmp().
 */
struct kdmsg_state_tree;
int kdmsg_state_cmp(kdmsg_state_t *state1, kdmsg_state_t *state2);
RB_HEAD(kdmsg_state_tree, kdmsg_state);
RB_PROTOTYPE(kdmsg_state_tree, kdmsg_state, rbnode, kdmsg_state_cmp);

/*
 * Structure embedded in e.g. mount, master control structure for
 * DMSG stream handling.
 */
struct kdmsg_iocom {
	struct malloc_type	*mmsg;
	struct file		*msg_fp;	/* cluster pipe->userland */
	thread_t		msgrd_td;	/* cluster thread */
	thread_t		msgwr_td;	/* cluster thread */
	int			msg_ctl;	/* wakeup flags */
	int			msg_seq;	/* cluster msg sequence id */
	uint32_t		flags;		/* presumably KDMSG_IOCOMF_xxx */
	struct lock		msglk;		/* lockmgr lock */
	TAILQ_HEAD(, kdmsg_msg) msgq;		/* transmit queue */
	void			*handle;
	void			(*auto_callback)(kdmsg_msg_t *);
	int			(*rcvmsg)(kdmsg_msg_t *);
	void			(*exit_func)(struct kdmsg_iocom *);
	struct kdmsg_state	state0;		/* root state for stacking */
	struct kdmsg_state	*conn_state;	/* active LNK_CONN state */
	struct kdmsg_state	*freerd_state;	/* allocation cache */
	struct kdmsg_state	*freewr_state;	/* allocation cache */
	struct kdmsg_state_tree	staterd_tree;	/* active messages */
	struct kdmsg_state_tree	statewr_tree;	/* active messages */
	dmsg_lnk_conn_t		auto_lnk_conn;
	dmsg_lnk_span_t		auto_lnk_span;
};

typedef struct kdmsg_iocom	kdmsg_iocom_t;

#define KDMSG_IOCOMF_AUTOCONN	0x0001	/* handle RX/TX LNK_CONN */
#define KDMSG_IOCOMF_AUTORXSPAN	0x0002	/* handle RX LNK_SPAN */
#define KDMSG_IOCOMF_AUTOTXSPAN	0x0008	/* handle TX LNK_SPAN */
#define KDMSG_IOCOMF_EXITNOACC	0x8000	/* cannot accept writes */

/* All three automatic-handling flags combined. */
#define KDMSG_IOCOMF_AUTOANY	(KDMSG_IOCOMF_AUTOCONN |	\
				 KDMSG_IOCOMF_AUTORXSPAN |	\
				 KDMSG_IOCOMF_AUTOTXSPAN)

/*
 * CRC helpers.  Only the prototypes are visible here; the three-argument
 * form takes a crc argument in addition to the buffer and size.
 */
uint32_t kdmsg_icrc32(const void *buf, size_t size);
uint32_t kdmsg_icrc32c(const void *buf, size_t size, uint32_t crc);

/*
 * kern_dmsg.c
 *
 * Only the prototypes appear in this header; see kern/kern_dmsg.c for the
 * behavior of each function.
 */
void kdmsg_iocom_init(kdmsg_iocom_t *iocom, void *handle, u_int32_t flags,
			struct malloc_type *mmsg,
			int (*rcvmsg)(kdmsg_msg_t *msg));
void kdmsg_iocom_reconnect(kdmsg_iocom_t *iocom, struct file *fp,
			const char *subsysname);
void kdmsg_iocom_autoinitiate(kdmsg_iocom_t *iocom,
			void (*conn_callback)(kdmsg_msg_t *msg));
void kdmsg_iocom_uninit(kdmsg_iocom_t *iocom);
void kdmsg_drain_msgq(kdmsg_iocom_t *iocom);

void kdmsg_msg_free(kdmsg_msg_t *msg);
kdmsg_msg_t *kdmsg_msg_alloc(kdmsg_state_t *state, uint32_t cmd,
				int (*func)(kdmsg_state_t *, kdmsg_msg_t *),
				void *data);
void kdmsg_msg_write(kdmsg_msg_t *msg);
void kdmsg_msg_reply(kdmsg_msg_t *msg, uint32_t error);
void kdmsg_msg_result(kdmsg_msg_t *msg, uint32_t error);
void kdmsg_state_reply(kdmsg_state_t *state, uint32_t error);
void kdmsg_state_result(kdmsg_state_t *state, uint32_t error);

#endif /* _KERNEL || _KERNEL_STRUCTURES */

#endif /* !_SYS_DMSG_H_ */