1 /* 2 * Copyright (c) 2011-2014 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@dragonflybsd.org> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 35 #ifndef _SYS_DMSG_H_ 36 #define _SYS_DMSG_H_ 37 38 #ifndef _SYS_MALLOC_H_ 39 #include <sys/malloc.h> 40 #endif 41 #ifndef _SYS_TREE_H_ 42 #include <sys/tree.h> 43 #endif 44 #ifndef _SYS_THREAD_H_ 45 #include <sys/thread.h> 46 #endif 47 #ifndef _SYS_UUID_H_ 48 #include <sys/uuid.h> 49 #endif 50 51 /* 52 * Mesh network protocol structures. 53 * 54 * CONN PROTOCOL 55 * 56 * The mesh is constructed via point-to-point streaming links with varying 57 * levels of interconnectedness, forming a graph. Leafs of the graph are 58 * typically kernel devices (xdisk) or VFSs (HAMMER2). Internal nodes are 59 * usually (user level) hammer2 service demons. 60 * 61 * Upon connecting and after authentication, a LNK_CONN transaction is opened 62 * to configure the link. The SPAN protocol is then typically run over the 63 * open LNK_CONN transaction. 64 * 65 * Terminating the LNK_CONN transaction terminates everything running over it 66 * (typically open LNK_SPAN transactions), which in turn terminates everything 67 * running over the LNK_SPANs. 68 * 69 * SPAN PROTOCOL 70 * 71 * The SPAN protocol runs over an open LNK_CONN transaction and is used to 72 * advertise any number of services. For example, each PFS under a HAMMER2 73 * mount will be advertised as an open LNK_SPAN transaction. 74 * 75 * Any network node on the graph running multiple connections is capable 76 * of relaying LNK_SPANs from any connection to any other connection. This 77 * is typically done by the user-level hammer2 service demon, and typically 78 * not done by kernel devices or VFSs (though these entities must be able 79 * to manage multiple LNK_SPANs since they might advertise or need to talk 80 * to multiple services). 81 * 82 * Relaying is not necessarily trivial as it requires internal nodes to 83 * track two open transactions (on the two iocom interfaces) and translate 84 * the msgid and circuit. In addition, the relay may have to track multiple 85 * SPANs from the same iocom or from multiple iocoms which represent the same 86 * end-point and must select the best end-point, must send notifications when 87 * a better path is available, and must allow (when connectivity is still 88 * present) any existing, open, stacked sub-transactions to complete before 89 * terminating the less efficient SPAN. 90 * 91 * Relaying is optional. It is perfectly acceptable for the hammer2 service 92 * to plug a received socket descriptor directly into the appropriate kernel 93 * device driver. 94 * 95 * STACKED TRANSACTIONS 96 * 97 * Message transactions can be stacked. That is, you can initiate a DMSG 98 * transaction relative to another open transaction. sub-transactions can 99 * be initiate without waiting for the parent transaction to complete its 100 * handshake. 101 * 102 * This is done by entering the open transaction's msgid as the circuit field 103 * in the new transaction (typically by populating msg->parent). The 104 * transaction tracking structure will be referenced and will track the 105 * sub-transaction. Note that msgids must still be unique on an 106 * iocom-by-iocom basis. 107 * 108 * Messages can race closing circuits. When a circuit is lost, 109 * messages are simulated to delete any sub-transactions. 110 * 111 * MESSAGE TRANSACTIONAL STATES 112 * 113 * Message transactions are handled by the CREATE, DELETE, REPLY, ABORT, and 114 * CREPLY flags. Message state is typically recorded at the end points and 115 * will be maintained (preventing reuse of the transaction id) until a DELETE 116 * is both sent and received. 117 * 118 * One-way messages such as those used for debug commands are not recorded 119 * and do not require any transactional state. These are sent without 120 * the CREATE, DELETE, or ABORT flags set. ABORT is not supported for 121 * one-off messages. The REPLY bit can be used to distinguish between 122 * command and status if desired. 123 * 124 * Transactional messages are messages which require a reply to be 125 * returned. These messages can also consist of multiple message elements 126 * for the command or reply or both (or neither). The command message 127 * sequence sets CREATE on the first message and DELETE on the last message. 128 * A single message command sets both (CREATE|DELETE). The reply message 129 * sequence works the same way but of course also sets the REPLY bit. 130 * 131 * Tansactional messages can be aborted by sending a message element 132 * with the ABORT flag set. This flag can be combined with either or both 133 * the CREATE and DELETE flags. When combined with the CREATE flag the 134 * command is treated as non-blocking but still executes. Whem combined 135 * with the DELETE flag no additional message elements are required. 136 * 137 * Transactions are terminated by sending a message with DELETE set. 138 * Transactions must be CREATEd and DELETEd in both directions. If a 139 * transaction is governing stacked sub-transactions the sub-transactions 140 * are automatically terminated before the governing transaction is terminated. 141 * Terminates are handled by simulating a received DELETE and expecting the 142 * normal function callback and state machine to (ultimately) issue a 143 * terminating (DELETE) response. 144 * 145 * Transactions can operate in full-duplex as both sides are fully open 146 * (i.e. CREATE sent, CREATE|REPLY returned, DELETE not sent by anyone). 147 * Additional commands can be initiated from either side of the transaction. 148 * 149 * ABORT SPECIAL CASE - Mid-stream aborts. A mid-stream abort can be sent 150 * when supported by the sender by sending an ABORT message with neither 151 * CREATE or DELETE set. This effectively turns the message into a 152 * non-blocking message (but depending on what is being represented can also 153 * cut short prior data elements in the stream). 154 * 155 * ABORT SPECIAL CASE - Abort-after-DELETE. Transactional messages have to be 156 * abortable if the stream/pipe/whatever is lost. In this situation any 157 * forwarding relay needs to unconditionally abort commands and replies that 158 * are still active. This is done by sending an ABORT|DELETE even in 159 * situations where a DELETE has already been sent in that direction. This 160 * is done, for example, when links are in a half-closed state. In this 161 * situation it is possible for the abort request to race a transition to the 162 * fully closed state. ABORT|DELETE messages which race the fully closed 163 * state are expected to be discarded by the other end. 164 * 165 * -- 166 * 167 * All base and extended message headers are 64-byte aligned, and all 168 * transports must support extended message headers up to DMSG_HDR_MAX. 169 * Currently we allow extended message headers up to 2048 bytes. Note 170 * that the extended header size is encoded in the 'cmd' field of the header. 171 * 172 * Any in-band data is padded to a 64-byte alignment and placed directly 173 * after the extended header (after the higher-level cmd/rep structure). 174 * The actual unaligned size of the in-band data is encoded in the aux_bytes 175 * field in this case. Maximum data sizes are negotiated during registration. 176 * 177 * Auxillary data can be in-band or out-of-band. In-band data sets aux_descr 178 * equal to 0. Any out-of-band data must be negotiated by the SPAN protocol. 179 * 180 * Auxillary data, whether in-band or out-of-band, must be at-least 64-byte 181 * aligned. The aux_bytes field contains the actual byte-granular length 182 * and not the aligned length. The crc is against the aligned length (so 183 * a faster crc algorithm can be used, theoretically). 184 * 185 * hdr_crc is calculated over the entire, ALIGNED extended header. For 186 * the purposes of calculating the crc, the hdr_crc field is 0. That is, 187 * if calculating the crc in HW a 32-bit '0' must be inserted in place of 188 * the hdr_crc field when reading the entire header and compared at the 189 * end (but the actual hdr_crc must be left intact in memory). A simple 190 * counter to replace the field going into the CRC generator does the job 191 * in HW. The CRC endian is based on the magic number field and may have 192 * to be byte-swapped, too (which is also easy to do in HW). 193 * 194 * aux_crc is calculated over the entire, ALIGNED auxillary data. 195 * 196 * SHARED MEMORY IMPLEMENTATIONS 197 * 198 * Shared-memory implementations typically use a pipe to transmit the extended 199 * message header and shared memory to store any auxilary data. Auxillary 200 * data in one-way (non-transactional) messages is typically required to be 201 * inline. CRCs are still recommended and required at the beginning, but 202 * may be negotiated away later. 203 */ 204 205 #define DMSG_TERMINATE_STRING(ary) \ 206 do { (ary)[sizeof(ary) - 1] = 0; } while (0) 207 208 /* 209 * dmsg_hdr must be 64 bytes 210 */ 211 struct dmsg_hdr { 212 uint16_t magic; /* 00 sanity, synchro, endian */ 213 uint16_t reserved02; /* 02 */ 214 uint32_t salt; /* 04 random salt helps w/crypto */ 215 216 uint64_t msgid; /* 08 message transaction id */ 217 uint64_t circuit; /* 10 circuit id or 0 */ 218 uint64_t reserved18; /* 18 */ 219 220 uint32_t cmd; /* 20 flags | cmd | hdr_size / ALIGN */ 221 uint32_t aux_crc; /* 24 auxillary data crc */ 222 uint32_t aux_bytes; /* 28 auxillary data length (bytes) */ 223 uint32_t error; /* 2C error code or 0 */ 224 uint64_t aux_descr; /* 30 negotiated OOB data descr */ 225 uint32_t reserved38; /* 38 */ 226 uint32_t hdr_crc; /* 3C (aligned) extended header crc */ 227 }; 228 229 typedef struct dmsg_hdr dmsg_hdr_t; 230 231 #define DMSG_HDR_MAGIC 0x4832 232 #define DMSG_HDR_MAGIC_REV 0x3248 233 #define DMSG_HDR_CRCOFF offsetof(dmsg_hdr_t, salt) 234 #define DMSG_HDR_CRCBYTES (sizeof(dmsg_hdr_t) - DMSG_HDR_CRCOFF) 235 236 /* 237 * Administrative protocol limits. 238 * 239 * NOTE: A dmsg header must completely fit in the (fifo) buffer, but 240 * dmsg aux data does not have to completely fit. The dmsg 241 * structure allows headers up to 255*64 = 16320 bytes. There 242 * is no real limit on the aux_data other than what we deem 243 * reasonable and defenseable (i.e. not run processes or the 244 * kernel out of memory). But it should be able to handle at 245 * least MAXPHYS bytes which is typically 128KB or 256KB. 246 */ 247 #define DMSG_HDR_MAX 2048 /* <= 8192 */ 248 #define DMSG_AUX_MAX (1024*1024) /* <= 1MB */ 249 #define DMSG_BUF_SIZE (DMSG_HDR_MAX * 4) 250 #define DMSG_BUF_MASK (DMSG_BUF_SIZE - 1) 251 252 /* 253 * The message (cmd) field also encodes various flags and the total size 254 * of the message header. This allows the protocol processors to validate 255 * persistency and structural settings for every command simply by 256 * switch()ing on the (cmd) field. 257 */ 258 #define DMSGF_CREATE 0x80000000U /* msg start */ 259 #define DMSGF_DELETE 0x40000000U /* msg end */ 260 #define DMSGF_REPLY 0x20000000U /* reply path */ 261 #define DMSGF_ABORT 0x10000000U /* abort req */ 262 #define DMSGF_REVTRANS 0x08000000U /* opposite direction msgid */ 263 #define DMSGF_REVCIRC 0x04000000U /* opposite direction circuit */ 264 #define DMSGF_FLAG1 0x02000000U 265 #define DMSGF_FLAG0 0x01000000U 266 267 #define DMSGF_FLAGS 0xFF000000U /* all flags */ 268 #define DMSGF_PROTOS 0x00F00000U /* all protos */ 269 #define DMSGF_CMDS 0x000FFF00U /* all cmds */ 270 #define DMSGF_SIZE 0x000000FFU /* N*32 */ 271 272 /* 273 * XXX Future, flag that an in-line (not part of a CREATE/DELETE) command 274 * expects some sort of acknowledgement. Allows protocol mismatches to 275 * be detected. 276 */ 277 #define DMSGF_CMDF_EXPECT_ACK 0x00080000U /* in-line command no-ack */ 278 279 #define DMSGF_CMDSWMASK (DMSGF_CMDS | \ 280 DMSGF_SIZE | \ 281 DMSGF_PROTOS | \ 282 DMSGF_REPLY) 283 284 #define DMSGF_BASECMDMASK (DMSGF_CMDS | \ 285 DMSGF_SIZE | \ 286 DMSGF_PROTOS) 287 288 #define DMSGF_TRANSMASK (DMSGF_CMDS | \ 289 DMSGF_SIZE | \ 290 DMSGF_PROTOS | \ 291 DMSGF_REPLY | \ 292 DMSGF_CREATE | \ 293 DMSGF_DELETE) 294 295 #define DMSGF_BASEFLAGS (DMSGF_CREATE | DMSGF_DELETE | DMSGF_REPLY) 296 297 #define DMSG_PROTO_LNK 0x00000000U 298 #define DMSG_PROTO_DBG 0x00100000U 299 #define DMSG_PROTO_HM2 0x00200000U 300 #define DMSG_PROTO_XX3 0x00300000U 301 #define DMSG_PROTO_XX4 0x00400000U 302 #define DMSG_PROTO_BLK 0x00500000U 303 #define DMSG_PROTO_VOP 0x00600000U 304 305 /* 306 * Message command constructors, sans flags 307 */ 308 #define DMSG_ALIGN 64 309 #define DMSG_ALIGNMASK (DMSG_ALIGN - 1) 310 #define DMSG_DOALIGN(bytes) (((bytes) + DMSG_ALIGNMASK) & \ 311 ~DMSG_ALIGNMASK) 312 313 #define DMSG_HDR_ENCODE(elm) (((uint32_t)sizeof(struct elm) + \ 314 DMSG_ALIGNMASK) / \ 315 DMSG_ALIGN) 316 317 #define DMSG_LNK(cmd, elm) (DMSG_PROTO_LNK | \ 318 ((cmd) << 8) | \ 319 DMSG_HDR_ENCODE(elm)) 320 321 #define DMSG_DBG(cmd, elm) (DMSG_PROTO_DBG | \ 322 ((cmd) << 8) | \ 323 DMSG_HDR_ENCODE(elm)) 324 325 #define DMSG_HM2(cmd, elm) (DMSG_PROTO_HM2 | \ 326 ((cmd) << 8) | \ 327 DMSG_HDR_ENCODE(elm)) 328 329 #define DMSG_BLK(cmd, elm) (DMSG_PROTO_BLK | \ 330 ((cmd) << 8) | \ 331 DMSG_HDR_ENCODE(elm)) 332 333 #define DMSG_VOP(cmd, elm) (DMSG_PROTO_VOP | \ 334 ((cmd) << 8) | \ 335 DMSG_HDR_ENCODE(elm)) 336 337 /* 338 * Link layer ops basically talk to just the other side of a direct 339 * connection. 340 * 341 * LNK_PAD - One-way message on circuit 0, ignored by target. Used to 342 * pad message buffers on shared-memory transports. Not 343 * typically used with TCP. 344 * 345 * LNK_PING - One-way message on circuit-0, keep-alive, run by both sides 346 * typically 1/sec on idle link, link is lost after 10 seconds 347 * of inactivity. 348 * 349 * LNK_AUTH - Authenticate the connection, negotiate administrative 350 * rights & encryption, protocol class, etc. Only PAD and 351 * AUTH messages (not even PING) are accepted until 352 * authentication is complete. This message also identifies 353 * the host. 354 * 355 * LNK_CONN - Enable the SPAN protocol on circuit-0, possibly also 356 * installing a PFS filter (by cluster id, unique id, and/or 357 * wildcarded name). 358 * 359 * LNK_SPAN - A SPAN transaction typically on iocom->state0 enables 360 * messages to be relayed to/from a particular cluster node. 361 * SPANs are received, sorted, aggregated, filtered, and 362 * retransmitted back out across all applicable connections. 363 * 364 * The leaf protocol also uses this to make a PFS available 365 * to the cluster (e.g. on-mount). 366 */ 367 #define DMSG_LNK_PAD DMSG_LNK(0x000, dmsg_hdr) 368 #define DMSG_LNK_PING DMSG_LNK(0x001, dmsg_hdr) 369 #define DMSG_LNK_AUTH DMSG_LNK(0x010, dmsg_lnk_auth) 370 #define DMSG_LNK_CONN DMSG_LNK(0x011, dmsg_lnk_conn) 371 #define DMSG_LNK_SPAN DMSG_LNK(0x012, dmsg_lnk_span) 372 #define DMSG_LNK_ERROR DMSG_LNK(0xFFF, dmsg_hdr) 373 374 /* 375 * Reserved command codes for third party subsystems. Structure size is 376 * not known here so do not try to construct the full DMSG_LNK_ define. 377 */ 378 #define DMSG_LNK_CMD_HAMMER2_VOLCONF 0x20 379 380 #define DMSG_LABEL_SIZE 128 /* fixed at 128, do not change */ 381 382 /* 383 * LNK_AUTH - Authentication (often omitted) 384 */ 385 struct dmsg_lnk_auth { 386 dmsg_hdr_t head; 387 char dummy[64]; 388 }; 389 390 /* 391 * LNK_CONN - Register connection info for SPAN protocol 392 * (transaction, left open, iocom->state0 only). 393 * 394 * LNK_CONN identifies a streaming connection into the cluster. 395 * 396 * peer_mask serves to filter the SPANs we receive by peer_type. A cluster 397 * controller typically sets this to (uint64_t)-1, indicating that it wants 398 * everything. A block devfs interface might set it to 1 << DMSG_PEER_DISK, 399 * and a hammer2 mount might set it to 1 << DMSG_PEER_HAMMER2. 400 * 401 * media_iud allows multiple (e.g. HAMMER2) connections belonging to the same 402 * media to transmit duplicative LNK_VOLCONF updates without causing confusion 403 * in the cluster controller. 404 * 405 * pfs_clid, pfs_fsid, pfs_type, and label are peer-specific and must be 406 * left empty (zero-fill) if not supported by a particular peer. 407 */ 408 struct dmsg_lnk_conn { 409 dmsg_hdr_t head; 410 uuid_t media_id; /* media configuration id */ 411 uuid_t peer_id; /* unique peer uuid */ 412 uuid_t reserved01; 413 uint64_t peer_mask; /* PEER mask for SPAN filtering */ 414 uint8_t peer_type; /* see DMSG_PEER_xxx */ 415 uint8_t reserved02; 416 uint16_t proto_version; /* high level protocol support */ 417 uint32_t status; /* status flags */ 418 uint32_t rnss; /* node's generated rnss */ 419 uint8_t reserved03[8]; 420 uint32_t reserved04[14]; 421 char peer_label[DMSG_LABEL_SIZE]; /* peer identity string */ 422 }; 423 424 typedef struct dmsg_lnk_conn dmsg_lnk_conn_t; 425 426 /* 427 * PEER types 0-63 are defined here. There is a limit of 64 types due to 428 * the width of peer_mask. 429 * 430 * PFS types depend on the peer type. sys/dmsg.h only defines the default. 431 * peer-specific headers define PFS types for any given peer. 432 */ 433 #define DMSG_PEER_NONE 0 434 #define DMSG_PEER_ROUTER 1 /* server: cluster controller */ 435 #define DMSG_PEER_BLOCK 2 /* server: block devices */ 436 #define DMSG_PEER_HAMMER2 3 /* server: h2 mounted volume */ 437 #define DMSG_PEER_CLIENT 63 /* a client connection */ 438 #define DMSG_PEER_MAX 64 439 440 #define DMSG_PFSTYPE_DEFAULT 0 441 #define DMSG_PFSTYPE_MASK 0x0F 442 443 /* 444 * Structures embedded in LNK_SPAN 445 */ 446 struct dmsg_media_block { 447 uint64_t bytes; /* media size in bytes */ 448 uint32_t blksize; /* media block size */ 449 uint32_t reserved01; 450 }; 451 452 typedef struct dmsg_media_block dmsg_media_block_t; 453 454 /* 455 * LNK_SPAN - Initiate or relay a SPAN 456 * (transaction, left open, typically only on iocom->state0) 457 * 458 * This message registers an end-point with the other end of the connection, 459 * telling the other end who we are and what we can provide or intend to 460 * consume. Multiple registrations can be maintained as open transactions 461 * with each one specifying a unique end-point. 462 * 463 * Registrations are sent from {source}=S {1...n} to {target}=0 and maintained 464 * as open transactions. Registrations are also received and maintains as 465 * open transactions, creating a matrix of linkid's. 466 * 467 * While these transactions are open additional transactions can be executed 468 * between any two linkid's {source}=S (registrations we sent) to {target}=T 469 * (registrations we received). 470 * 471 * Closure of any registration transaction will automatically abort any open 472 * transactions using the related linkids. Closure can be initiated 473 * voluntarily from either side with either end issuing a DELETE, or they 474 * can be ABORTed. 475 * 476 * Status updates are performed via the open transaction. 477 * 478 * -- 479 * 480 * A registration identifies a node and its various PFS parameters including 481 * the PFS_TYPE. For example, a diskless HAMMER2 client typically identifies 482 * itself as PFSTYPE_CLIENT. 483 * 484 * Any node may serve as a cluster controller, aggregating and passing 485 * on received registrations, but end-points do not have to implement this 486 * ability. Most end-points typically implement a single client-style or 487 * server-style PFS_TYPE and rendezvous at a cluster controller. 488 * 489 * The cluster controller does not aggregate/pass-on all received 490 * registrations. It typically filters what gets passed on based on what it 491 * receives, passing on only the best candidates. 492 * 493 * If a symmetric spanning tree is desired additional candidates whos 494 * {dist, rnss} fields match the last best candidate must also be propagated. 495 * This feature is not currently enabled. 496 * 497 * STATUS UPDATES: Status updates use the same structure but typically 498 * only contain incremental changes to e.g. pfs_type, with 499 * a text description sent as out-of-band data. 500 */ 501 struct dmsg_lnk_span { 502 dmsg_hdr_t head; 503 uuid_t peer_id; 504 uuid_t pfs_id; /* unique pfs id */ 505 uint8_t pfs_type; /* PFS type */ 506 uint8_t peer_type; /* PEER type */ 507 uint16_t proto_version; /* high level protocol support */ 508 uint32_t status; /* status flags */ 509 uint8_t reserved02[8]; 510 uint32_t dist; /* span distance */ 511 uint32_t rnss; /* random number sub-sort */ 512 union { 513 uint32_t reserved03[14]; 514 dmsg_media_block_t block; 515 } media; 516 517 /* 518 * NOTE: for PEER_HAMMER2 cl_label is typically empty and fs_label 519 * is the superroot directory name. 520 * 521 * for PEER_BLOCK cl_label is typically host/device and 522 * fs_label is typically the serial number string. 523 */ 524 char peer_label[DMSG_LABEL_SIZE]; /* peer label */ 525 char pfs_label[DMSG_LABEL_SIZE]; /* PFS label */ 526 }; 527 528 typedef struct dmsg_lnk_span dmsg_lnk_span_t; 529 530 #define DMSG_SPAN_PROTO_1 1 531 532 /* 533 * Debug layer ops operate on any link 534 * 535 * SHELL - Persist stream, access the debug shell on the target 536 * registration. Multiple shells can be operational. 537 */ 538 #define DMSG_DBG_SHELL DMSG_DBG(0x001, dmsg_dbg_shell) 539 540 struct dmsg_dbg_shell { 541 dmsg_hdr_t head; 542 }; 543 typedef struct dmsg_dbg_shell dmsg_dbg_shell_t; 544 545 /* 546 * Hammer2 layer ops (low-level chain manipulation used by cluster code) 547 * 548 * HM2_OPENPFS - Attach a PFS 549 * HM2_FLUSHPFS - Flush a PFS 550 * 551 * HM2_LOOKUP - Lookup chain (parent-relative transaction) 552 * (can request multiple chains) 553 * HM2_NEXT - Lookup next chain (parent-relative transaction) 554 * (can request multiple chains) 555 * HM2_LOCK - [Re]lock a chain (chain-relative) (non-recursive) 556 * HM2_UNLOCK - Unlock a chain (chain-relative) (non-recursive) 557 * HM2_RESIZE - Resize a chain (chain-relative) 558 * HM2_MODIFY - Modify a chain (chain-relative) 559 * HM2_CREATE - Create a chain (parent-relative) 560 * HM2_DUPLICATE- Duplicate a chain (target-parent-relative) 561 * HM2_DELDUP - Delete-Duplicate a chain (chain-relative) 562 * HM2_DELETE - Delete a chain (chain-relative) 563 * HM2_SNAPSHOT - Create a snapshot (snapshot-root-relative, w/clid override) 564 */ 565 #define DMSG_HM2_OPENPFS DMSG_HM2(0x001, dmsg_hm2_openpfs) 566 567 /* 568 * DMSG_PROTO_BLK Protocol 569 * 570 * BLK_OPEN - Open device. This transaction must be left open for the 571 * duration and the returned keyid passed in all associated 572 * BLK commands. Multiple OPENs can be issued within the 573 * transaction. 574 * 575 * BLK_CLOSE - Close device. This can be used to close one of the opens 576 * within a BLK_OPEN transaction. It may NOT initiate a 577 * transaction. Note that a termination of the transaction 578 * (e.g. with LNK_ERROR or BLK_ERROR) closes all active OPENs 579 * for that transaction. XXX not well defined atm. 580 * 581 * BLK_READ - Strategy read. Not typically streaming. 582 * 583 * BLK_WRITE - Strategy write. Not typically streaming. 584 * 585 * BLK_FLUSH - Strategy flush. Not typically streaming. 586 * 587 * BLK_FREEBLKS - Strategy freeblks. Not typically streaming. 588 */ 589 #define DMSG_BLK_OPEN DMSG_BLK(0x001, dmsg_blk_open) 590 #define DMSG_BLK_CLOSE DMSG_BLK(0x002, dmsg_blk_open) 591 #define DMSG_BLK_READ DMSG_BLK(0x003, dmsg_blk_read) 592 #define DMSG_BLK_WRITE DMSG_BLK(0x004, dmsg_blk_write) 593 #define DMSG_BLK_FLUSH DMSG_BLK(0x005, dmsg_blk_flush) 594 #define DMSG_BLK_FREEBLKS DMSG_BLK(0x006, dmsg_blk_freeblks) 595 #define DMSG_BLK_ERROR DMSG_BLK(0xFFF, dmsg_blk_error) 596 597 struct dmsg_blk_open { 598 dmsg_hdr_t head; 599 uint32_t modes; 600 uint32_t reserved01; 601 }; 602 603 #define DMSG_BLKOPEN_RD 0x0001 604 #define DMSG_BLKOPEN_WR 0x0002 605 606 /* 607 * DMSG_LNK_ERROR is returned for simple results, 608 * DMSG_BLK_ERROR is returned for extended results. 609 */ 610 struct dmsg_blk_error { 611 dmsg_hdr_t head; 612 uint64_t keyid; 613 uint32_t resid; 614 uint32_t reserved02; 615 char buf[64]; 616 }; 617 618 struct dmsg_blk_read { 619 dmsg_hdr_t head; 620 uint64_t keyid; 621 uint64_t offset; 622 uint32_t bytes; 623 uint32_t flags; 624 uint32_t reserved01; 625 uint32_t reserved02; 626 }; 627 628 struct dmsg_blk_write { 629 dmsg_hdr_t head; 630 uint64_t keyid; 631 uint64_t offset; 632 uint32_t bytes; 633 uint32_t flags; 634 uint32_t reserved01; 635 uint32_t reserved02; 636 }; 637 638 struct dmsg_blk_flush { 639 dmsg_hdr_t head; 640 uint64_t keyid; 641 uint64_t offset; 642 uint32_t bytes; 643 uint32_t flags; 644 uint32_t reserved01; 645 uint32_t reserved02; 646 }; 647 648 struct dmsg_blk_freeblks { 649 dmsg_hdr_t head; 650 uint64_t keyid; 651 uint64_t offset; 652 uint32_t bytes; 653 uint32_t flags; 654 uint32_t reserved01; 655 uint32_t reserved02; 656 }; 657 658 typedef struct dmsg_blk_open dmsg_blk_open_t; 659 typedef struct dmsg_blk_read dmsg_blk_read_t; 660 typedef struct dmsg_blk_write dmsg_blk_write_t; 661 typedef struct dmsg_blk_flush dmsg_blk_flush_t; 662 typedef struct dmsg_blk_freeblks dmsg_blk_freeblks_t; 663 typedef struct dmsg_blk_error dmsg_blk_error_t; 664 665 /* 666 * NOTE!!!! ALL EXTENDED HEADER STRUCTURES MUST BE 64-BYTE ALIGNED!!! 667 * 668 * General message errors 669 * 670 * 0x00 - 0x1F Local iocomm errors 671 * 0x20 - 0x2F Global errors 672 */ 673 #define DMSG_ERR_NOSUPP 0x20 674 #define DMSG_ERR_LOSTLINK 0x21 675 #define DMSG_ERR_IO 0x22 /* generic */ 676 #define DMSG_ERR_PARAM 0x23 /* generic */ 677 #define DMSG_ERR_CANTCIRC 0x24 /* (typically means lost span) */ 678 679 union dmsg_any { 680 char buf[DMSG_HDR_MAX]; 681 dmsg_hdr_t head; 682 683 dmsg_lnk_conn_t lnk_conn; 684 dmsg_lnk_span_t lnk_span; 685 686 dmsg_blk_open_t blk_open; 687 dmsg_blk_error_t blk_error; 688 dmsg_blk_read_t blk_read; 689 dmsg_blk_write_t blk_write; 690 dmsg_blk_flush_t blk_flush; 691 dmsg_blk_freeblks_t blk_freeblks; 692 }; 693 694 typedef union dmsg_any dmsg_any_t; 695 696 /* 697 * Kernel iocom structures and prototypes for kern/kern_dmsg.c 698 */ 699 #if defined(_KERNEL) || defined(_KERNEL_STRUCTURES) 700 701 struct hammer2_mount; 702 struct xa_softc; 703 struct kdmsg_iocom; 704 struct kdmsg_state; 705 struct kdmsg_msg; 706 struct kdmsg_data; 707 708 /* 709 * msg_ctl flags (atomic) 710 */ 711 #define KDMSG_CLUSTERCTL_UNUSED01 0x00000001 712 #define KDMSG_CLUSTERCTL_KILLRX 0x00000002 /* staged helper exit */ 713 #define KDMSG_CLUSTERCTL_KILLTX 0x00000004 /* staged helper exit */ 714 #define KDMSG_CLUSTERCTL_SLEEPING 0x00000008 /* interlocked w/msglk */ 715 716 /* 717 * Transactional state structure, representing an open transaction. The 718 * transaction might represent a cache state (and thus have a chain 719 * association), or a VOP op, LNK_SPAN, or other things. 720 * 721 * NOTE: A non-empty subq represents one ref. 722 * If we are inserted on a parent's subq, that's one ref (SUBINSERTED). 723 * If we are inserted on a RB tree, that's one ref (RBINSERTED). 724 * msg->state represents a ref. 725 * Other code references may hold refs. 726 * 727 * NOTE: The parent association stays intact as long as a state has a 728 * non-empty subq. Otherwise simulated failures might not be able 729 * to reach the children. 730 */ 731 TAILQ_HEAD(kdmsg_state_list, kdmsg_state); 732 733 struct kdmsg_state { 734 RB_ENTRY(kdmsg_state) rbnode; /* indexed by msgid */ 735 struct kdmsg_state *scan; /* scan check */ 736 struct kdmsg_state_list subq; /* active stacked states */ 737 TAILQ_ENTRY(kdmsg_state) entry; /* on parent subq */ 738 TAILQ_ENTRY(kdmsg_state) user_entry; /* available to devices */ 739 struct kdmsg_iocom *iocom; 740 struct kdmsg_state *parent; 741 int refs; /* refs */ 742 uint32_t icmd; /* record cmd creating state */ 743 uint32_t txcmd; /* mostly for CMDF flags */ 744 uint32_t rxcmd; /* mostly for CMDF flags */ 745 uint64_t msgid; /* {parent,msgid} uniq */ 746 int flags; 747 int error; 748 void *chain; /* (caller's state) */ 749 int (*func)(struct kdmsg_state *, struct kdmsg_msg *); 750 union { 751 void *any; 752 struct hammer2_mount *hmp; 753 struct xa_softc *xa_sc; 754 } any; 755 }; 756 757 #define KDMSG_STATE_SUBINSERTED 0x0001 758 #define KDMSG_STATE_DYNAMIC 0x0002 759 #define KDMSG_STATE_UNUSED0004 0x0004 760 #define KDMSG_STATE_ABORTING 0x0008 /* avoids recursive abort */ 761 #define KDMSG_STATE_OPPOSITE 0x0010 /* opposite direction */ 762 #define KDMSG_STATE_DYING 0x0020 /* atomic recursive circ fail */ 763 #define KDMSG_STATE_INTERLOCK 0x0040 764 #define KDMSG_STATE_RBINSERTED 0x0080 765 #define KDMSG_STATE_SIGNAL 0x0400 766 #define KDMSG_STATE_NEW 0x0800 /* defer abort processing */ 767 768 struct kdmsg_msg { 769 TAILQ_ENTRY(kdmsg_msg) qentry; /* serialized queue */ 770 struct kdmsg_state *state; 771 size_t hdr_size; 772 size_t aux_size; 773 char *aux_data; 774 uint32_t flags; 775 uint32_t tcmd; /* outer transaction cmd */ 776 dmsg_any_t any; /* variable sized */ 777 }; 778 779 struct kdmsg_data { 780 char *aux_data; 781 size_t aux_size; 782 struct kdmsg_iocom *iocom; 783 }; 784 785 #define KDMSG_FLAG_AUXALLOC 0x0001 786 787 typedef struct kdmsg_link kdmsg_link_t; 788 typedef struct kdmsg_state kdmsg_state_t; 789 typedef struct kdmsg_msg kdmsg_msg_t; 790 typedef struct kdmsg_data kdmsg_data_t; 791 792 struct kdmsg_state_tree; 793 int kdmsg_state_cmp(kdmsg_state_t *state1, kdmsg_state_t *state2); 794 RB_HEAD(kdmsg_state_tree, kdmsg_state); 795 RB_PROTOTYPE(kdmsg_state_tree, kdmsg_state, rbnode, kdmsg_state_cmp); 796 797 /* 798 * Structure embedded in e.g. mount, master control structure for 799 * DMSG stream handling. 800 */ 801 struct kdmsg_iocom { 802 struct malloc_type *mmsg; 803 struct file *msg_fp; /* cluster pipe->userland */ 804 thread_t msgrd_td; /* cluster thread */ 805 thread_t msgwr_td; /* cluster thread */ 806 int msg_ctl; /* wakeup flags */ 807 int msg_seq; /* cluster msg sequence id */ 808 uint32_t flags; 809 struct lock msglk; /* lockmgr lock */ 810 TAILQ_HEAD(, kdmsg_msg) msgq; /* transmit queue */ 811 void *handle; 812 void (*auto_callback)(kdmsg_msg_t *); 813 int (*rcvmsg)(kdmsg_msg_t *); 814 void (*exit_func)(struct kdmsg_iocom *); 815 struct kdmsg_state state0; /* root state for stacking */ 816 struct kdmsg_state *conn_state; /* active LNK_CONN state */ 817 struct kdmsg_state *freerd_state; /* allocation cache */ 818 struct kdmsg_state *freewr_state; /* allocation cache */ 819 struct kdmsg_state_tree staterd_tree; /* active messages */ 820 struct kdmsg_state_tree statewr_tree; /* active messages */ 821 dmsg_lnk_conn_t auto_lnk_conn; 822 dmsg_lnk_span_t auto_lnk_span; 823 }; 824 825 typedef struct kdmsg_iocom kdmsg_iocom_t; 826 827 #define KDMSG_IOCOMF_AUTOCONN 0x0001 /* handle RX/TX LNK_CONN */ 828 #define KDMSG_IOCOMF_AUTORXSPAN 0x0002 /* handle RX LNK_SPAN */ 829 #define KDMSG_IOCOMF_AUTOTXSPAN 0x0008 /* handle TX LNK_SPAN */ 830 #define KDMSG_IOCOMF_EXITNOACC 0x8000 /* cannot accept writes */ 831 832 #define KDMSG_IOCOMF_AUTOANY (KDMSG_IOCOMF_AUTOCONN | \ 833 KDMSG_IOCOMF_AUTORXSPAN | \ 834 KDMSG_IOCOMF_AUTOTXSPAN) 835 836 uint32_t kdmsg_icrc32(const void *buf, size_t size); 837 uint32_t kdmsg_icrc32c(const void *buf, size_t size, uint32_t crc); 838 839 /* 840 * kern_dmsg.c 841 */ 842 void kdmsg_iocom_init(kdmsg_iocom_t *iocom, void *handle, u_int32_t flags, 843 struct malloc_type *mmsg, 844 int (*rcvmsg)(kdmsg_msg_t *msg)); 845 void kdmsg_iocom_reconnect(kdmsg_iocom_t *iocom, struct file *fp, 846 const char *subsysname); 847 void kdmsg_iocom_autoinitiate(kdmsg_iocom_t *iocom, 848 void (*conn_callback)(kdmsg_msg_t *msg)); 849 void kdmsg_iocom_uninit(kdmsg_iocom_t *iocom); 850 void kdmsg_drain_msgq(kdmsg_iocom_t *iocom); 851 852 void kdmsg_msg_free(kdmsg_msg_t *msg); 853 kdmsg_msg_t *kdmsg_msg_alloc(kdmsg_state_t *state, uint32_t cmd, 854 int (*func)(kdmsg_state_t *, kdmsg_msg_t *), 855 void *data); 856 void kdmsg_msg_write(kdmsg_msg_t *msg); 857 void kdmsg_msg_reply(kdmsg_msg_t *msg, uint32_t error); 858 void kdmsg_msg_result(kdmsg_msg_t *msg, uint32_t error); 859 void kdmsg_state_reply(kdmsg_state_t *state, uint32_t error); 860 void kdmsg_state_result(kdmsg_state_t *state, uint32_t error); 861 void kdmsg_detach_aux_data(kdmsg_msg_t *msg, kdmsg_data_t *data); 862 void kdmsg_free_aux_data(kdmsg_data_t *data); 863 864 #endif 865 866 #endif 867