1 /* 2 * Copyright (c) 2011-2014 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@dragonflybsd.org> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 35 #ifndef _SYS_DMSG_H_ 36 #define _SYS_DMSG_H_ 37 38 #ifndef _SYS_TYPES_H_ 39 #include <sys/types.h> 40 #endif 41 #if defined(_KERNEL) || defined(_KERNEL_STRUCTURES) 42 #ifndef _SYS_TREE_H_ 43 #include <sys/tree.h> 44 #endif 45 #ifndef _SYS_THREAD_H_ 46 #include <sys/thread.h> 47 #endif 48 #endif 49 #ifndef _SYS_UUID_H_ 50 #include <sys/uuid.h> 51 #endif 52 53 /* 54 * Mesh network protocol structures. 55 * 56 * CONN PROTOCOL 57 * 58 * The mesh is constructed via point-to-point streaming links with varying 59 * levels of interconnectedness, forming a graph. Leafs of the graph are 60 * typically kernel devices (xdisk) or VFSs (HAMMER2). Internal nodes are 61 * usually (user level) hammer2 service demons. 62 * 63 * Upon connecting and after authentication, a LNK_CONN transaction is opened 64 * to configure the link. The SPAN protocol is then typically run over the 65 * open LNK_CONN transaction. 66 * 67 * Terminating the LNK_CONN transaction terminates everything running over it 68 * (typically open LNK_SPAN transactions), which in turn terminates everything 69 * running over the LNK_SPANs. 70 * 71 * SPAN PROTOCOL 72 * 73 * The SPAN protocol runs over an open LNK_CONN transaction and is used to 74 * advertise any number of services. For example, each PFS under a HAMMER2 75 * mount will be advertised as an open LNK_SPAN transaction. 76 * 77 * Any network node on the graph running multiple connections is capable 78 * of relaying LNK_SPANs from any connection to any other connection. This 79 * is typically done by the user-level hammer2 service demon, and typically 80 * not done by kernel devices or VFSs (though these entities must be able 81 * to manage multiple LNK_SPANs since they might advertise or need to talk 82 * to multiple services). 83 * 84 * Relaying is not necessarily trivial as it requires internal nodes to 85 * track two open transactions (on the two iocom interfaces) and translate 86 * the msgid and circuit. In addition, the relay may have to track multiple 87 * SPANs from the same iocom or from multiple iocoms which represent the same 88 * end-point and must select the best end-point, must send notifications when 89 * a better path is available, and must allow (when connectivity is still 90 * present) any existing, open, stacked sub-transactions to complete before 91 * terminating the less efficient SPAN. 92 * 93 * Relaying is optional. It is perfectly acceptable for the hammer2 service 94 * to plug a received socket descriptor directly into the appropriate kernel 95 * device driver. 96 * 97 * STACKED TRANSACTIONS 98 * 99 * Message transactions can be stacked. That is, you can initiate a DMSG 100 * transaction relative to another open transaction. sub-transactions can 101 * be initiate without waiting for the parent transaction to complete its 102 * handshake. 103 * 104 * This is done by entering the open transaction's msgid as the circuit field 105 * in the new transaction (typically by populating msg->parent). The 106 * transaction tracking structure will be referenced and will track the 107 * sub-transaction. Note that msgids must still be unique on an 108 * iocom-by-iocom basis. 109 * 110 * Messages can race closing circuits. When a circuit is lost, 111 * messages are simulated to delete any sub-transactions. 112 * 113 * MESSAGE TRANSACTIONAL STATES 114 * 115 * Message transactions are handled by the CREATE, DELETE, REPLY, ABORT, and 116 * CREPLY flags. Message state is typically recorded at the end points and 117 * will be maintained (preventing reuse of the transaction id) until a DELETE 118 * is both sent and received. 119 * 120 * One-way messages such as those used for debug commands are not recorded 121 * and do not require any transactional state. These are sent without 122 * the CREATE, DELETE, or ABORT flags set. ABORT is not supported for 123 * one-off messages. The REPLY bit can be used to distinguish between 124 * command and status if desired. 125 * 126 * Transactional messages are messages which require a reply to be 127 * returned. These messages can also consist of multiple message elements 128 * for the command or reply or both (or neither). The command message 129 * sequence sets CREATE on the first message and DELETE on the last message. 130 * A single message command sets both (CREATE|DELETE). The reply message 131 * sequence works the same way but of course also sets the REPLY bit. 132 * 133 * Tansactional messages can be aborted by sending a message element 134 * with the ABORT flag set. This flag can be combined with either or both 135 * the CREATE and DELETE flags. When combined with the CREATE flag the 136 * command is treated as non-blocking but still executes. Whem combined 137 * with the DELETE flag no additional message elements are required. 138 * 139 * Transactions are terminated by sending a message with DELETE set. 140 * Transactions must be CREATEd and DELETEd in both directions. If a 141 * transaction is governing stacked sub-transactions the sub-transactions 142 * are automatically terminated before the governing transaction is terminated. 143 * Terminates are handled by simulating a received DELETE and expecting the 144 * normal function callback and state machine to (ultimately) issue a 145 * terminating (DELETE) response. 146 * 147 * Transactions can operate in full-duplex as both sides are fully open 148 * (i.e. CREATE sent, CREATE|REPLY returned, DELETE not sent by anyone). 149 * Additional commands can be initiated from either side of the transaction. 150 * 151 * ABORT SPECIAL CASE - Mid-stream aborts. A mid-stream abort can be sent 152 * when supported by the sender by sending an ABORT message with neither 153 * CREATE or DELETE set. This effectively turns the message into a 154 * non-blocking message (but depending on what is being represented can also 155 * cut short prior data elements in the stream). 156 * 157 * ABORT SPECIAL CASE - Abort-after-DELETE. Transactional messages have to be 158 * abortable if the stream/pipe/whatever is lost. In this situation any 159 * forwarding relay needs to unconditionally abort commands and replies that 160 * are still active. This is done by sending an ABORT|DELETE even in 161 * situations where a DELETE has already been sent in that direction. This 162 * is done, for example, when links are in a half-closed state. In this 163 * situation it is possible for the abort request to race a transition to the 164 * fully closed state. ABORT|DELETE messages which race the fully closed 165 * state are expected to be discarded by the other end. 166 * 167 * -- 168 * 169 * All base and extended message headers are 64-byte aligned, and all 170 * transports must support extended message headers up to DMSG_HDR_MAX. 171 * Currently we allow extended message headers up to 2048 bytes. Note 172 * that the extended header size is encoded in the 'cmd' field of the header. 173 * 174 * Any in-band data is padded to a 64-byte alignment and placed directly 175 * after the extended header (after the higher-level cmd/rep structure). 176 * The actual unaligned size of the in-band data is encoded in the aux_bytes 177 * field in this case. Maximum data sizes are negotiated during registration. 178 * 179 * Auxillary data can be in-band or out-of-band. In-band data sets aux_descr 180 * equal to 0. Any out-of-band data must be negotiated by the SPAN protocol. 181 * 182 * Auxillary data, whether in-band or out-of-band, must be at-least 64-byte 183 * aligned. The aux_bytes field contains the actual byte-granular length 184 * and not the aligned length. The crc is against the aligned length (so 185 * a faster crc algorithm can be used, theoretically). 186 * 187 * hdr_crc is calculated over the entire, ALIGNED extended header. For 188 * the purposes of calculating the crc, the hdr_crc field is 0. That is, 189 * if calculating the crc in HW a 32-bit '0' must be inserted in place of 190 * the hdr_crc field when reading the entire header and compared at the 191 * end (but the actual hdr_crc must be left intact in memory). A simple 192 * counter to replace the field going into the CRC generator does the job 193 * in HW. The CRC endian is based on the magic number field and may have 194 * to be byte-swapped, too (which is also easy to do in HW). 195 * 196 * aux_crc is calculated over the entire, ALIGNED auxillary data. 197 * 198 * SHARED MEMORY IMPLEMENTATIONS 199 * 200 * Shared-memory implementations typically use a pipe to transmit the extended 201 * message header and shared memory to store any auxilary data. Auxillary 202 * data in one-way (non-transactional) messages is typically required to be 203 * inline. CRCs are still recommended and required at the beginning, but 204 * may be negotiated away later. 205 */ 206 207 #define DMSG_TERMINATE_STRING(ary) \ 208 do { (ary)[sizeof(ary) - 1] = 0; } while (0) 209 210 /* 211 * dmsg_hdr must be 64 bytes 212 */ 213 struct dmsg_hdr { 214 uint16_t magic; /* 00 sanity, synchro, endian */ 215 uint16_t reserved02; /* 02 */ 216 uint32_t salt; /* 04 random salt helps w/crypto */ 217 218 uint64_t msgid; /* 08 message transaction id */ 219 uint64_t circuit; /* 10 circuit id or 0 */ 220 uint64_t link_verifier; /* 18 link verifier */ 221 222 uint32_t cmd; /* 20 flags | cmd | hdr_size / ALIGN */ 223 uint32_t aux_crc; /* 24 auxillary data crc */ 224 uint32_t aux_bytes; /* 28 auxillary data length (bytes) */ 225 uint32_t error; /* 2C error code or 0 */ 226 uint64_t aux_descr; /* 30 negotiated OOB data descr */ 227 uint32_t reserved38; /* 38 */ 228 uint32_t hdr_crc; /* 3C (aligned) extended header crc */ 229 }; 230 231 typedef struct dmsg_hdr dmsg_hdr_t; 232 233 #define DMSG_HDR_MAGIC 0x4832 234 #define DMSG_HDR_MAGIC_REV 0x3248 235 #define DMSG_HDR_CRCOFF offsetof(dmsg_hdr_t, salt) 236 #define DMSG_HDR_CRCBYTES (sizeof(dmsg_hdr_t) - DMSG_HDR_CRCOFF) 237 238 /* 239 * Administrative protocol limits. 240 * 241 * NOTE: A dmsg header must completely fit in the (fifo) buffer, but 242 * dmsg aux data does not have to completely fit. The dmsg 243 * structure allows headers up to 255*64 = 16320 bytes. There 244 * is no real limit on the aux_data other than what we deem 245 * reasonable and defenseable (i.e. not run processes or the 246 * kernel out of memory). But it should be able to handle at 247 * least MAXPHYS bytes which is typically 128KB or 256KB. 248 */ 249 #define DMSG_HDR_MAX 2048 /* <= 8192 */ 250 #define DMSG_AUX_MAX (1024*1024) /* <= 1MB */ 251 #define DMSG_BUF_SIZE (DMSG_HDR_MAX * 4) 252 #define DMSG_BUF_MASK (DMSG_BUF_SIZE - 1) 253 254 /* 255 * The message (cmd) field also encodes various flags and the total size 256 * of the message header. This allows the protocol processors to validate 257 * persistency and structural settings for every command simply by 258 * switch()ing on the (cmd) field. 259 */ 260 #define DMSGF_CREATE 0x80000000U /* msg start */ 261 #define DMSGF_DELETE 0x40000000U /* msg end */ 262 #define DMSGF_REPLY 0x20000000U /* reply path */ 263 #define DMSGF_ABORT 0x10000000U /* abort req */ 264 #define DMSGF_REVTRANS 0x08000000U /* opposite direction msgid */ 265 #define DMSGF_REVCIRC 0x04000000U /* opposite direction circuit */ 266 #define DMSGF_FLAG1 0x02000000U 267 #define DMSGF_FLAG0 0x01000000U 268 269 #define DMSGF_FLAGS 0xFF000000U /* all flags */ 270 #define DMSGF_PROTOS 0x00F00000U /* all protos */ 271 #define DMSGF_CMDS 0x000FFF00U /* all cmds */ 272 #define DMSGF_SIZE 0x000000FFU /* N*32 */ 273 274 /* 275 * XXX Future, flag that an in-line (not part of a CREATE/DELETE) command 276 * expects some sort of acknowledgement. Allows protocol mismatches to 277 * be detected. 278 */ 279 #define DMSGF_CMDF_EXPECT_ACK 0x00080000U /* in-line command no-ack */ 280 281 #define DMSGF_CMDSWMASK (DMSGF_CMDS | \ 282 DMSGF_SIZE | \ 283 DMSGF_PROTOS | \ 284 DMSGF_REPLY) 285 286 #define DMSGF_BASECMDMASK (DMSGF_CMDS | \ 287 DMSGF_SIZE | \ 288 DMSGF_PROTOS) 289 290 #define DMSGF_TRANSMASK (DMSGF_CMDS | \ 291 DMSGF_SIZE | \ 292 DMSGF_PROTOS | \ 293 DMSGF_REPLY | \ 294 DMSGF_CREATE | \ 295 DMSGF_DELETE) 296 297 #define DMSGF_BASEFLAGS (DMSGF_CREATE | DMSGF_DELETE | DMSGF_REPLY) 298 299 #define DMSG_PROTO_LNK 0x00000000U 300 #define DMSG_PROTO_DBG 0x00100000U 301 #define DMSG_PROTO_HM2 0x00200000U 302 #define DMSG_PROTO_XX3 0x00300000U 303 #define DMSG_PROTO_XX4 0x00400000U 304 #define DMSG_PROTO_BLK 0x00500000U 305 #define DMSG_PROTO_VOP 0x00600000U 306 307 /* 308 * Message command constructors, sans flags 309 */ 310 #define DMSG_ALIGN 64 311 #define DMSG_ALIGNMASK (DMSG_ALIGN - 1) 312 #define DMSG_DOALIGN(bytes) (((bytes) + DMSG_ALIGNMASK) & \ 313 ~DMSG_ALIGNMASK) 314 315 #define DMSG_HDR_ENCODE(elm) (((uint32_t)sizeof(struct elm) + \ 316 DMSG_ALIGNMASK) / \ 317 DMSG_ALIGN) 318 319 #define DMSG_LNK(cmd, elm) (DMSG_PROTO_LNK | \ 320 ((cmd) << 8) | \ 321 DMSG_HDR_ENCODE(elm)) 322 323 #define DMSG_DBG(cmd, elm) (DMSG_PROTO_DBG | \ 324 ((cmd) << 8) | \ 325 DMSG_HDR_ENCODE(elm)) 326 327 #define DMSG_HM2(cmd, elm) (DMSG_PROTO_HM2 | \ 328 ((cmd) << 8) | \ 329 DMSG_HDR_ENCODE(elm)) 330 331 #define DMSG_BLK(cmd, elm) (DMSG_PROTO_BLK | \ 332 ((cmd) << 8) | \ 333 DMSG_HDR_ENCODE(elm)) 334 335 #define DMSG_VOP(cmd, elm) (DMSG_PROTO_VOP | \ 336 ((cmd) << 8) | \ 337 DMSG_HDR_ENCODE(elm)) 338 339 /* 340 * Link layer ops basically talk to just the other side of a direct 341 * connection. 342 * 343 * LNK_PAD - One-way message on circuit 0, ignored by target. Used to 344 * pad message buffers on shared-memory transports. Not 345 * typically used with TCP. 346 * 347 * LNK_PING - One-way message on circuit-0, keep-alive, run by both sides 348 * typically 1/sec on idle link, link is lost after 10 seconds 349 * of inactivity. 350 * 351 * LNK_AUTH - Authenticate the connection, negotiate administrative 352 * rights & encryption, protocol class, etc. Only PAD and 353 * AUTH messages (not even PING) are accepted until 354 * authentication is complete. This message also identifies 355 * the host. 356 * 357 * LNK_CONN - Enable the SPAN protocol on circuit-0, possibly also 358 * installing a PFS filter (by cluster id, unique id, and/or 359 * wildcarded name). 360 * 361 * LNK_SPAN - A SPAN transaction typically on iocom->state0 enables 362 * messages to be relayed to/from a particular cluster node. 363 * SPANs are received, sorted, aggregated, filtered, and 364 * retransmitted back out across all applicable connections. 365 * 366 * The leaf protocol also uses this to make a PFS available 367 * to the cluster (e.g. on-mount). 368 */ 369 #define DMSG_LNK_PAD DMSG_LNK(0x000, dmsg_hdr) 370 #define DMSG_LNK_PING DMSG_LNK(0x001, dmsg_hdr) 371 #define DMSG_LNK_AUTH DMSG_LNK(0x010, dmsg_lnk_auth) 372 #define DMSG_LNK_CONN DMSG_LNK(0x011, dmsg_lnk_conn) 373 #define DMSG_LNK_SPAN DMSG_LNK(0x012, dmsg_lnk_span) 374 #define DMSG_LNK_ERROR DMSG_LNK(0xFFF, dmsg_hdr) 375 376 /* 377 * Reserved command codes for third party subsystems. Structure size is 378 * not known here so do not try to construct the full DMSG_LNK_ define. 379 */ 380 #define DMSG_LNK_CMD_HAMMER2_VOLCONF 0x20 381 382 #define DMSG_LABEL_SIZE 128 /* fixed at 128, do not change */ 383 384 /* 385 * LNK_AUTH - Authentication (often omitted) 386 */ 387 struct dmsg_lnk_auth { 388 dmsg_hdr_t head; 389 char dummy[64]; 390 }; 391 392 /* 393 * LNK_CONN - Register connection info for SPAN protocol 394 * (transaction, left open, iocom->state0 only). 395 * 396 * LNK_CONN identifies a streaming connection into the cluster. 397 * 398 * peer_mask serves to filter the SPANs we receive by peer_type. A cluster 399 * controller typically sets this to (uint64_t)-1, indicating that it wants 400 * everything. A block devfs interface might set it to 1 << DMSG_PEER_DISK, 401 * and a hammer2 mount might set it to 1 << DMSG_PEER_HAMMER2. 402 * 403 * media_iud allows multiple (e.g. HAMMER2) connections belonging to the same 404 * media to transmit duplicative LNK_VOLCONF updates without causing confusion 405 * in the cluster controller. 406 * 407 * pfs_clid, pfs_fsid, pfs_type, and label are peer-specific and must be 408 * left empty (zero-fill) if not supported by a particular peer. 409 */ 410 struct dmsg_lnk_conn { 411 dmsg_hdr_t head; 412 uuid_t media_id; /* media configuration id */ 413 uuid_t peer_id; /* unique peer uuid */ 414 uuid_t reserved01; 415 uint64_t peer_mask; /* PEER mask for SPAN filtering */ 416 uint8_t peer_type; /* see DMSG_PEER_xxx */ 417 uint8_t reserved02; 418 uint16_t proto_version; /* high level protocol support */ 419 uint32_t status; /* status flags */ 420 uint32_t rnss; /* node's generated rnss */ 421 uint8_t reserved03[8]; 422 uint32_t reserved04[14]; 423 char peer_label[DMSG_LABEL_SIZE]; /* peer identity string */ 424 }; 425 426 typedef struct dmsg_lnk_conn dmsg_lnk_conn_t; 427 428 /* 429 * PEER types 0-63 are defined here. There is a limit of 64 types due to 430 * the width of peer_mask. 431 * 432 * PFS types depend on the peer type. sys/dmsg.h only defines the default. 433 * peer-specific headers define PFS types for any given peer. 434 */ 435 #define DMSG_PEER_NONE 0 436 #define DMSG_PEER_ROUTER 1 /* server: cluster controller */ 437 #define DMSG_PEER_BLOCK 2 /* server: block devices */ 438 #define DMSG_PEER_HAMMER2 3 /* server: h2 mounted volume */ 439 #define DMSG_PEER_CLIENT 63 /* a client connection */ 440 #define DMSG_PEER_MAX 64 441 442 #define DMSG_PFSTYPE_DEFAULT 0 443 #define DMSG_PFSTYPE_MASK 0x0F 444 445 /* 446 * Structures embedded in LNK_SPAN 447 */ 448 struct dmsg_media_block { 449 uint64_t bytes; /* media size in bytes */ 450 uint32_t blksize; /* media block size */ 451 uint32_t reserved01; 452 }; 453 454 typedef struct dmsg_media_block dmsg_media_block_t; 455 456 /* 457 * LNK_SPAN - Initiate or relay a SPAN 458 * (transaction, left open, typically only on iocom->state0) 459 * 460 * This message registers an end-point with the other end of the connection, 461 * telling the other end who we are and what we can provide or intend to 462 * consume. Multiple registrations can be maintained as open transactions 463 * with each one specifying a unique end-point. 464 * 465 * Registrations are sent from {source}=S {1...n} to {target}=0 and maintained 466 * as open transactions. Registrations are also received and maintains as 467 * open transactions, creating a matrix of linkid's. 468 * 469 * While these transactions are open additional transactions can be executed 470 * between any two linkid's {source}=S (registrations we sent) to {target}=T 471 * (registrations we received). 472 * 473 * Closure of any registration transaction will automatically abort any open 474 * transactions using the related linkids. Closure can be initiated 475 * voluntarily from either side with either end issuing a DELETE, or they 476 * can be ABORTed. 477 * 478 * Status updates are performed via the open transaction. 479 * 480 * -- 481 * 482 * A registration identifies a node and its various PFS parameters including 483 * the PFS_TYPE. For example, a diskless HAMMER2 client typically identifies 484 * itself as PFSTYPE_CLIENT. 485 * 486 * Any node may serve as a cluster controller, aggregating and passing 487 * on received registrations, but end-points do not have to implement this 488 * ability. Most end-points typically implement a single client-style or 489 * server-style PFS_TYPE and rendezvous at a cluster controller. 490 * 491 * The cluster controller does not aggregate/pass-on all received 492 * registrations. It typically filters what gets passed on based on what it 493 * receives, passing on only the best candidates. 494 * 495 * If a symmetric spanning tree is desired additional candidates whos 496 * {dist, rnss} fields match the last best candidate must also be propagated. 497 * This feature is not currently enabled. 498 * 499 * STATUS UPDATES: Status updates use the same structure but typically 500 * only contain incremental changes to e.g. pfs_type, with 501 * a text description sent as out-of-band data. 502 */ 503 struct dmsg_lnk_span { 504 dmsg_hdr_t head; 505 uuid_t peer_id; 506 uuid_t pfs_id; /* unique pfs id */ 507 uint8_t pfs_type; /* PFS type */ 508 uint8_t peer_type; /* PEER type */ 509 uint16_t proto_version; /* high level protocol support */ 510 uint32_t status; /* status flags */ 511 uint8_t reserved02[8]; 512 uint32_t dist; /* span distance */ 513 uint32_t rnss; /* random number sub-sort */ 514 union { 515 uint32_t reserved03[14]; 516 dmsg_media_block_t block; 517 } media; 518 519 /* 520 * NOTE: for PEER_HAMMER2 cl_label is typically empty and fs_label 521 * is the superroot directory name. 522 * 523 * for PEER_BLOCK cl_label is typically host/device and 524 * fs_label is typically the serial number string. 525 */ 526 char peer_label[DMSG_LABEL_SIZE]; /* peer label */ 527 char pfs_label[DMSG_LABEL_SIZE]; /* PFS label */ 528 }; 529 530 typedef struct dmsg_lnk_span dmsg_lnk_span_t; 531 532 #define DMSG_SPAN_PROTO_1 1 533 534 /* 535 * Debug layer ops operate on any link 536 * 537 * SHELL - Persist stream, access the debug shell on the target 538 * registration. Multiple shells can be operational. 539 */ 540 #define DMSG_DBG_SHELL DMSG_DBG(0x001, dmsg_dbg_shell) 541 542 struct dmsg_dbg_shell { 543 dmsg_hdr_t head; 544 }; 545 typedef struct dmsg_dbg_shell dmsg_dbg_shell_t; 546 547 /* 548 * Hammer2 layer ops (low-level chain manipulation used by cluster code) 549 * 550 * HM2_OPENPFS - Attach a PFS 551 * HM2_FLUSHPFS - Flush a PFS 552 * 553 * HM2_LOOKUP - Lookup chain (parent-relative transaction) 554 * (can request multiple chains) 555 * HM2_NEXT - Lookup next chain (parent-relative transaction) 556 * (can request multiple chains) 557 * HM2_LOCK - [Re]lock a chain (chain-relative) (non-recursive) 558 * HM2_UNLOCK - Unlock a chain (chain-relative) (non-recursive) 559 * HM2_RESIZE - Resize a chain (chain-relative) 560 * HM2_MODIFY - Modify a chain (chain-relative) 561 * HM2_CREATE - Create a chain (parent-relative) 562 * HM2_DUPLICATE- Duplicate a chain (target-parent-relative) 563 * HM2_DELDUP - Delete-Duplicate a chain (chain-relative) 564 * HM2_DELETE - Delete a chain (chain-relative) 565 * HM2_SNAPSHOT - Create a snapshot (snapshot-root-relative, w/clid override) 566 */ 567 #define DMSG_HM2_OPENPFS DMSG_HM2(0x001, dmsg_hm2_openpfs) 568 569 /* 570 * DMSG_PROTO_BLK Protocol 571 * 572 * BLK_OPEN - Open device. This transaction must be left open for the 573 * duration and the returned keyid passed in all associated 574 * BLK commands. Multiple OPENs can be issued within the 575 * transaction. 576 * 577 * BLK_CLOSE - Close device. This can be used to close one of the opens 578 * within a BLK_OPEN transaction. It may NOT initiate a 579 * transaction. Note that a termination of the transaction 580 * (e.g. with LNK_ERROR or BLK_ERROR) closes all active OPENs 581 * for that transaction. XXX not well defined atm. 582 * 583 * BLK_READ - Strategy read. Not typically streaming. 584 * 585 * BLK_WRITE - Strategy write. Not typically streaming. 586 * 587 * BLK_FLUSH - Strategy flush. Not typically streaming. 588 * 589 * BLK_FREEBLKS - Strategy freeblks. Not typically streaming. 590 */ 591 #define DMSG_BLK_OPEN DMSG_BLK(0x001, dmsg_blk_open) 592 #define DMSG_BLK_CLOSE DMSG_BLK(0x002, dmsg_blk_open) 593 #define DMSG_BLK_READ DMSG_BLK(0x003, dmsg_blk_read) 594 #define DMSG_BLK_WRITE DMSG_BLK(0x004, dmsg_blk_write) 595 #define DMSG_BLK_FLUSH DMSG_BLK(0x005, dmsg_blk_flush) 596 #define DMSG_BLK_FREEBLKS DMSG_BLK(0x006, dmsg_blk_freeblks) 597 #define DMSG_BLK_ERROR DMSG_BLK(0xFFF, dmsg_blk_error) 598 599 struct dmsg_blk_open { 600 dmsg_hdr_t head; 601 uint32_t modes; 602 uint32_t reserved01; 603 }; 604 605 #define DMSG_BLKOPEN_RD 0x0001 606 #define DMSG_BLKOPEN_WR 0x0002 607 608 /* 609 * DMSG_LNK_ERROR is returned for simple results, 610 * DMSG_BLK_ERROR is returned for extended results. 611 */ 612 struct dmsg_blk_error { 613 dmsg_hdr_t head; 614 uint64_t keyid; 615 uint32_t resid; 616 uint32_t reserved02; 617 char buf[64]; 618 }; 619 620 struct dmsg_blk_read { 621 dmsg_hdr_t head; 622 uint64_t keyid; 623 uint64_t offset; 624 uint32_t bytes; 625 uint32_t flags; 626 uint32_t reserved01; 627 uint32_t reserved02; 628 }; 629 630 struct dmsg_blk_write { 631 dmsg_hdr_t head; 632 uint64_t keyid; 633 uint64_t offset; 634 uint32_t bytes; 635 uint32_t flags; 636 uint32_t reserved01; 637 uint32_t reserved02; 638 }; 639 640 struct dmsg_blk_flush { 641 dmsg_hdr_t head; 642 uint64_t keyid; 643 uint64_t offset; 644 uint32_t bytes; 645 uint32_t flags; 646 uint32_t reserved01; 647 uint32_t reserved02; 648 }; 649 650 struct dmsg_blk_freeblks { 651 dmsg_hdr_t head; 652 uint64_t keyid; 653 uint64_t offset; 654 uint32_t bytes; 655 uint32_t flags; 656 uint32_t reserved01; 657 uint32_t reserved02; 658 }; 659 660 typedef struct dmsg_blk_open dmsg_blk_open_t; 661 typedef struct dmsg_blk_read dmsg_blk_read_t; 662 typedef struct dmsg_blk_write dmsg_blk_write_t; 663 typedef struct dmsg_blk_flush dmsg_blk_flush_t; 664 typedef struct dmsg_blk_freeblks dmsg_blk_freeblks_t; 665 typedef struct dmsg_blk_error dmsg_blk_error_t; 666 667 /* 668 * NOTE!!!! ALL EXTENDED HEADER STRUCTURES MUST BE 64-BYTE ALIGNED!!! 669 * 670 * General message errors 671 * 672 * 0x00 - 0x1F Local iocomm errors 673 * 0x20 - 0x2F Global errors 674 */ 675 #define DMSG_ERR_NOSUPP 0x20 676 #define DMSG_ERR_LOSTLINK 0x21 677 #define DMSG_ERR_IO 0x22 /* generic */ 678 #define DMSG_ERR_PARAM 0x23 /* generic */ 679 #define DMSG_ERR_CANTCIRC 0x24 /* (typically means lost span) */ 680 681 union dmsg_any { 682 char buf[DMSG_HDR_MAX]; 683 dmsg_hdr_t head; 684 685 dmsg_lnk_conn_t lnk_conn; 686 dmsg_lnk_span_t lnk_span; 687 688 dmsg_blk_open_t blk_open; 689 dmsg_blk_error_t blk_error; 690 dmsg_blk_read_t blk_read; 691 dmsg_blk_write_t blk_write; 692 dmsg_blk_flush_t blk_flush; 693 dmsg_blk_freeblks_t blk_freeblks; 694 }; 695 696 typedef union dmsg_any dmsg_any_t; 697 698 /* 699 * Kernel iocom structures and prototypes for kern/kern_dmsg.c 700 */ 701 #if defined(_KERNEL) || defined(_KERNEL_STRUCTURES) 702 703 struct hammer2_mount; 704 struct xa_softc; 705 struct kdmsg_iocom; 706 struct kdmsg_state; 707 struct kdmsg_msg; 708 struct kdmsg_data; 709 710 /* 711 * msg_ctl flags (atomic) 712 */ 713 #define KDMSG_CLUSTERCTL_UNUSED01 0x00000001 714 #define KDMSG_CLUSTERCTL_KILLRX 0x00000002 /* staged helper exit */ 715 #define KDMSG_CLUSTERCTL_KILLTX 0x00000004 /* staged helper exit */ 716 #define KDMSG_CLUSTERCTL_SLEEPING 0x00000008 /* interlocked w/msglk */ 717 718 /* 719 * Transactional state structure, representing an open transaction. The 720 * transaction might represent a cache state (and thus have a chain 721 * association), or a VOP op, LNK_SPAN, or other things. 722 * 723 * NOTE: A non-empty subq represents one ref. 724 * If we are inserted on a parent's subq, that's one ref (SUBINSERTED). 725 * If we are inserted on a RB tree, that's one ref (RBINSERTED). 726 * msg->state represents a ref. 727 * Other code references may hold refs. 728 * 729 * NOTE: The parent association stays intact as long as a state has a 730 * non-empty subq. Otherwise simulated failures might not be able 731 * to reach the children. 732 */ 733 TAILQ_HEAD(kdmsg_state_list, kdmsg_state); 734 735 struct kdmsg_state { 736 RB_ENTRY(kdmsg_state) rbnode; /* indexed by msgid */ 737 struct kdmsg_state *scan; /* scan check */ 738 struct kdmsg_state_list subq; /* active stacked states */ 739 TAILQ_ENTRY(kdmsg_state) entry; /* on parent subq */ 740 TAILQ_ENTRY(kdmsg_state) user_entry; /* available to devices */ 741 struct kdmsg_iocom *iocom; 742 struct kdmsg_state *parent; 743 int refs; /* refs */ 744 uint32_t icmd; /* record cmd creating state */ 745 uint32_t txcmd; /* mostly for CMDF flags */ 746 uint32_t rxcmd; /* mostly for CMDF flags */ 747 uint64_t msgid; /* {parent,msgid} uniq */ 748 int flags; 749 int error; 750 void *chain; /* (caller's state) */ 751 int (*func)(struct kdmsg_state *, struct kdmsg_msg *); 752 union { 753 void *any; 754 struct hammer2_mount *hmp; 755 struct xa_softc *xa_sc; 756 } any; 757 }; 758 759 #define KDMSG_STATE_SUBINSERTED 0x0001 760 #define KDMSG_STATE_DYNAMIC 0x0002 761 #define KDMSG_STATE_UNUSED0004 0x0004 762 #define KDMSG_STATE_ABORTING 0x0008 /* avoids recursive abort */ 763 #define KDMSG_STATE_OPPOSITE 0x0010 /* opposite direction */ 764 #define KDMSG_STATE_DYING 0x0020 /* atomic recursive circ fail */ 765 #define KDMSG_STATE_INTERLOCK 0x0040 766 #define KDMSG_STATE_RBINSERTED 0x0080 767 #define KDMSG_STATE_SIGNAL 0x0400 768 #define KDMSG_STATE_NEW 0x0800 /* defer abort processing */ 769 770 struct kdmsg_msg { 771 TAILQ_ENTRY(kdmsg_msg) qentry; /* serialized queue */ 772 struct kdmsg_state *state; 773 size_t hdr_size; 774 size_t aux_size; 775 char *aux_data; 776 uint32_t flags; 777 uint32_t tcmd; /* outer transaction cmd */ 778 dmsg_any_t any; /* variable sized */ 779 }; 780 781 struct kdmsg_data { 782 char *aux_data; 783 size_t aux_size; 784 struct kdmsg_iocom *iocom; 785 }; 786 787 #define KDMSG_FLAG_AUXALLOC 0x0001 788 789 typedef struct kdmsg_link kdmsg_link_t; 790 typedef struct kdmsg_state kdmsg_state_t; 791 typedef struct kdmsg_msg kdmsg_msg_t; 792 typedef struct kdmsg_data kdmsg_data_t; 793 794 struct kdmsg_state_tree; 795 int kdmsg_state_cmp(kdmsg_state_t *state1, kdmsg_state_t *state2); 796 RB_HEAD(kdmsg_state_tree, kdmsg_state); 797 RB_PROTOTYPE(kdmsg_state_tree, kdmsg_state, rbnode, kdmsg_state_cmp); 798 799 struct file; /* forward decl */ 800 struct malloc_type; 801 802 /* 803 * Structure embedded in e.g. mount, master control structure for 804 * DMSG stream handling. 805 */ 806 struct kdmsg_iocom { 807 struct malloc_type *mmsg; 808 struct file *msg_fp; /* cluster pipe->userland */ 809 thread_t msgrd_td; /* cluster thread */ 810 thread_t msgwr_td; /* cluster thread */ 811 int msg_ctl; /* wakeup flags */ 812 int msg_seq; /* cluster msg sequence id */ 813 uint32_t flags; 814 struct lock msglk; /* lockmgr lock */ 815 TAILQ_HEAD(, kdmsg_msg) msgq; /* transmit queue */ 816 void *handle; 817 void (*auto_callback)(kdmsg_msg_t *); 818 int (*rcvmsg)(kdmsg_msg_t *); 819 void (*exit_func)(struct kdmsg_iocom *); 820 struct kdmsg_state state0; /* root state for stacking */ 821 struct kdmsg_state *conn_state; /* active LNK_CONN state */ 822 struct kdmsg_state *freerd_state; /* allocation cache */ 823 struct kdmsg_state *freewr_state; /* allocation cache */ 824 struct kdmsg_state_tree staterd_tree; /* active messages */ 825 struct kdmsg_state_tree statewr_tree; /* active messages */ 826 dmsg_lnk_conn_t auto_lnk_conn; 827 dmsg_lnk_span_t auto_lnk_span; 828 }; 829 830 typedef struct kdmsg_iocom kdmsg_iocom_t; 831 832 #define KDMSG_IOCOMF_AUTOCONN 0x0001 /* handle RX/TX LNK_CONN */ 833 #define KDMSG_IOCOMF_AUTORXSPAN 0x0002 /* handle RX LNK_SPAN */ 834 #define KDMSG_IOCOMF_AUTOTXSPAN 0x0008 /* handle TX LNK_SPAN */ 835 #define KDMSG_IOCOMF_EXITNOACC 0x8000 /* cannot accept writes */ 836 837 #define KDMSG_IOCOMF_AUTOANY (KDMSG_IOCOMF_AUTOCONN | \ 838 KDMSG_IOCOMF_AUTORXSPAN | \ 839 KDMSG_IOCOMF_AUTOTXSPAN) 840 841 #endif /* _KERNEL || _KERNEL_STRUCTURES */ 842 843 #ifdef _KERNEL 844 845 /* 846 * kern_dmsg.c 847 */ 848 void kdmsg_iocom_init(kdmsg_iocom_t *iocom, void *handle, u_int32_t flags, 849 struct malloc_type *mmsg, 850 int (*rcvmsg)(kdmsg_msg_t *msg)); 851 void kdmsg_iocom_reconnect(kdmsg_iocom_t *iocom, struct file *fp, 852 const char *subsysname); 853 void kdmsg_iocom_autoinitiate(kdmsg_iocom_t *iocom, 854 void (*conn_callback)(kdmsg_msg_t *msg)); 855 void kdmsg_iocom_uninit(kdmsg_iocom_t *iocom); 856 void kdmsg_drain_msgq(kdmsg_iocom_t *iocom); 857 858 void kdmsg_msg_free(kdmsg_msg_t *msg); 859 kdmsg_msg_t *kdmsg_msg_alloc(kdmsg_state_t *state, uint32_t cmd, 860 int (*func)(kdmsg_state_t *, kdmsg_msg_t *), 861 void *data); 862 void kdmsg_msg_write(kdmsg_msg_t *msg); 863 void kdmsg_msg_reply(kdmsg_msg_t *msg, uint32_t error); 864 void kdmsg_msg_result(kdmsg_msg_t *msg, uint32_t error); 865 void kdmsg_state_reply(kdmsg_state_t *state, uint32_t error); 866 void kdmsg_state_result(kdmsg_state_t *state, uint32_t error); 867 void kdmsg_detach_aux_data(kdmsg_msg_t *msg, kdmsg_data_t *data); 868 void kdmsg_free_aux_data(kdmsg_data_t *data); 869 870 #endif /* _KERNEL */ 871 872 #endif /* !_SYS_DMSG_H_ */ 873