/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 2006, 2013 Oracle and/or its affiliates.  All rights reserved.
 *
 * $Id$
 */

#ifndef _DB_REPMGR_H_
#define _DB_REPMGR_H_

#include "dbinc_auto/repmgr_automsg.h"

#if defined(__cplusplus)
extern "C" {
#endif

/*
 * Replication Manager message format types.  These few format codes identify
 * enough information to describe, at the lowest level, how a message should be
 * read from the wire, including how much memory should be allocated to hold the
 * result.  (Often we want to allocate more than just enough to hold the
 * received bytes, if we know that we will need more during processing.)
 *
 * These values are transmitted between sites, even sites running differing BDB
 * versions.  Therefore, once assigned, the values are permanently "frozen".
 *
 * For example, in repmgr wire protocol version 1 the highest assigned message
 * type value was 3, for REPMGR_REP_MESSAGE.  Wire protocol version 2 added the
 * HEARTBEAT message type (4).
 *
 * New message types added in later versions always get new (higher) values.  We
 * still list them in alphabetical order, for ease of reference.  But this
 * generally does not correspond to numerical order.
 */
#define	REPMGR_APP_MESSAGE	5	/* Msg sent from app. on DB_CHANNEL. */
#define	REPMGR_APP_RESPONSE	6	/* Response to a channel request. */
#define	REPMGR_OWN_MSG		8	/* Repmgr's own messages, to peers. */
#define	REPMGR_HANDSHAKE	2	/* Connection establishment sequence. */
#define	REPMGR_HEARTBEAT	4	/* Monitor connection health. */
#define	REPMGR_PERMLSN		1	/* My perm LSN. */
#define	REPMGR_REP_MESSAGE	3	/* Normal replication message. */
#define	REPMGR_RESP_ERROR	7	/* Sys-gen'd error resp to request. */

/*
 * Largest message type code known in each protocol version we support.
 * In protocol version one there were only three message types: 1, 2, and 3; so
 * 3 was the max.  In protocol version 2 we introduced heartbeats, type 4.
 * (Protocol version 3 did not introduce any new message types.)  In version 4
 * we introduced a few more new message types, the largest of which
 * (REPMGR_OWN_MSG) had value 8.
 */
#define	REPMGR_MAX_V1_MSG_TYPE	3
#define	REPMGR_MAX_V2_MSG_TYPE	4
#define	REPMGR_MAX_V3_MSG_TYPE	4
#define	REPMGR_MAX_V4_MSG_TYPE	8
#define	HEARTBEAT_MIN_VERSION	2
#define	CHANNEL_MIN_VERSION	4
#define	CONN_COLLISION_VERSION	4
#define	GM_MIN_VERSION		4
#define	OWN_MIN_VERSION		4

/* The range of protocol versions we're willing to support. */
#define	DB_REPMGR_VERSION	4
#define	DB_REPMGR_MIN_VERSION	1

/*
 * For messages with the "REPMGR_OWN_MSG" format code, a message type (see
 * REPMGR_OWN_MSG_TYPE, below) is included in the header.  While at the lowest
 * level, the format codes identify only enough to read and allocate memory, at
 * the next higher level the following message type codes identify the content
 * of the message: how to unmarshal and dispatch it.
 *
 * Like the message format types, these message type values should be
 * permanently frozen.
 */
#define	REPMGR_CONNECT_REJECT	1
#define	REPMGR_GM_FAILURE	2
#define	REPMGR_GM_FORWARD	3
#define	REPMGR_JOIN_REQUEST	4
#define	REPMGR_JOIN_SUCCESS	5
#define	REPMGR_PARM_REFRESH	6
#define	REPMGR_REJOIN		7
#define	REPMGR_REMOVE_REQUEST	8
#define	REPMGR_REMOVE_SUCCESS	9
#define	REPMGR_RESOLVE_LIMBO	10
#define	REPMGR_SHARING		11


struct __repmgr_connection;
typedef struct __repmgr_connection REPMGR_CONNECTION;
struct __repmgr_queue; typedef struct __repmgr_queue REPMGR_QUEUE;
struct __queued_output; typedef struct __queued_output QUEUED_OUTPUT;
struct __repmgr_response; typedef struct __repmgr_response REPMGR_RESPONSE;
struct __repmgr_retry; typedef struct __repmgr_retry REPMGR_RETRY;
struct __repmgr_runnable; typedef struct __repmgr_runnable REPMGR_RUNNABLE;
struct __repmgr_site; typedef struct __repmgr_site REPMGR_SITE;
struct __cond_waiters_table;
typedef struct __cond_waiters_table COND_WAITERS_TABLE;

/* Current Group Membership DB format ID. */
#define	REPMGR_GMDB_FMT_VERSION	1

/*
 * Platform portability: on Windows we use Winsock/Win32 primitives, elsewhere
 * POSIX sockets and pthreads.
 */
#ifdef DB_WIN32
typedef SOCKET socket_t;
typedef HANDLE thread_id_t;
typedef HANDLE mgr_mutex_t;
typedef HANDLE cond_var_t;

typedef COND_WAITERS_TABLE *waiter_t;
typedef WSABUF db_iovec_t;
#else
typedef int socket_t;
typedef pthread_t thread_id_t;
typedef pthread_mutex_t mgr_mutex_t;
typedef pthread_cond_t cond_var_t;
typedef pthread_cond_t waiter_t;
typedef struct iovec db_iovec_t;
#endif

/*
 * The (arbitrary) maximum number of outgoing messages we're willing to hold, on
 * a queue per connection, waiting for TCP buffer space to become available in
 * the kernel.  Rather than exceeding this limit, we simply discard additional
 * messages (since this is always allowed by the replication protocol).
 *     As a special dispensation, if a message is destined for a specific remote
 * site (i.e., it's not a broadcast), then we first try blocking the sending
 * thread, waiting for space to become available (though we only wait a limited
 * time).  This is so as to be able to handle the immediate flood of (a
 * potentially large number of) outgoing messages that replication generates, in
 * a tight loop, when handling PAGE_REQ, LOG_REQ and ALL_REQ requests.
 */
#define	OUT_QUEUE_LIMIT	10

/*
 * The system value is available from sysconf(_SC_HOST_NAME_MAX).
 * Historically, the maximum host name was 256.
 */
#ifndef MAXHOSTNAMELEN
#define	MAXHOSTNAMELEN	256
#endif

/* A buffer big enough for the string "site host.domain.com:65535". */
#define	MAX_SITE_LOC_STRING (MAXHOSTNAMELEN+20)
typedef char SITE_STRING_BUFFER[MAX_SITE_LOC_STRING+1];

#define	MAX_MSG_BUF	(__REPMGR_MAXMSG_SIZE + MAXHOSTNAMELEN + 1)

/* Default timeout values, in seconds. */
#define	DB_REPMGR_DEFAULT_ACK_TIMEOUT		(1 * US_PER_SEC)
#define	DB_REPMGR_DEFAULT_CONNECTION_RETRY	(30 * US_PER_SEC)
#define	DB_REPMGR_DEFAULT_ELECTION_RETRY	(10 * US_PER_SEC)
#define	DB_REPMGR_DEFAULT_CHANNEL_TIMEOUT	(5 * US_PER_SEC)

typedef TAILQ_HEAD(__repmgr_conn_list, __repmgr_connection) CONNECTION_LIST;
typedef STAILQ_HEAD(__repmgr_out_q_head, __queued_output) OUT_Q_HEADER;
typedef TAILQ_HEAD(__repmgr_retry_q, __repmgr_retry) RETRY_Q_HEADER;

/* Information about threads managed by Replication Framework. */
struct __repmgr_runnable {
	ENV *env;
	thread_id_t thread_id;
	void *(*run) __P((void *));
	int finished;	/* Boolean: thread is exiting, may be joined. */
	int quit_requested;	/* Boolean: thread has been asked to quit. */
#ifdef DB_WIN32
	HANDLE quit_event;
#endif
	union {

/*
 * Options governing requested behavior of election thread.
 */
#define	ELECT_F_EVENT_NOTIFY	0x01 /* Notify application of master failure. */
#define	ELECT_F_FAST		0x02 /* First election "fast" (n-1 trick). */
#define	ELECT_F_IMMED		0x04 /* Start with immediate election. */
#define	ELECT_F_INVITEE		0x08 /* Honor (remote) inviter's nsites. */
#define	ELECT_F_STARTUP		0x10 /* Observe repmgr_start() policy. */
		u_int32_t flags;

		int eid;	/* For Connector thread. */

		/*
		 * Args for other thread types can be added here in the future
		 * as needed.
		 */
	} args;
};

/*
 * Information about pending connection establishment retry operations.
 *
 * We keep these in order by time.  This works, under the assumption that the
 * DB_REP_CONNECTION_RETRY never changes once we get going (though that
 * assumption is of course wrong, so this needs to be fixed).
 *
 * Usually, we put things onto the tail end of the list.  But when we add a new
 * site while threads are running, we trigger its first connection attempt by
 * scheduling a retry for "0" microseconds from now, putting its retry element
 * at the head of the list instead.
 *
 * TODO: I think this can be fixed by defining "time" to be the time the element
 * was added (with some convention like "0" meaning immediate), rather than the
 * deadline time.
 */
struct __repmgr_retry {
	TAILQ_ENTRY(__repmgr_retry) entries;
	int eid;
	db_timespec time;
};

/*
 * We use scatter/gather I/O for both reading and writing.  Repmgr messages
 * (including rep messages) use 3 segments: envelope, control and rec.
 * Application messages can have any number of segments (the number they
 * specify, plus 1 for our envelope).  REPMGR_IOVECS_ALLOC_SZ should (only) be
 * used when n > 3.
 */
#define	REPMGR_IOVECS_ALLOC_SZ(n) \
	(sizeof(REPMGR_IOVECS) + ((n) - MIN_IOVEC) * sizeof(db_iovec_t))
typedef struct {
	/*
	 * Index of the first iovec to be used.  Initially of course this is
	 * zero.  But as we progress through partial I/O transfers, it ends up
	 * pointing to the first iovec to be used on the next operation.
	 */
	int offset;

	/*
	 * Total number of pieces defined for this message; equal to the number
	 * of times add_buffer and/or add_dbt were called to populate it.  We do
	 * *NOT* revise this as we go along.  So subsequent I/O operations must
	 * use count-offset to get the number of active vector pieces still
	 * remaining.
	 */
	int count;

	/*
	 * Total number of bytes accounted for in all the pieces of this
	 * message.  We do *NOT* revise this as we go along.
	 */
	size_t total_bytes;

#define	MIN_IOVEC	3
	db_iovec_t vectors[MIN_IOVEC];	/* Variable length array. */
} REPMGR_IOVECS;

typedef struct {
	size_t length;		/* number of bytes in data */
	int ref_count;		/* # of sites' send queues pointing to us */
	u_int8_t data[1];	/* variable size data area */
} REPMGR_FLAT;

struct __queued_output {
	STAILQ_ENTRY(__queued_output) entries;
	REPMGR_FLAT *msg;
	size_t offset;
};

/*
 * The following is for input.  Once we know the sizes of the pieces of an
 * incoming message, we can create this struct (and also the data areas for the
 * pieces themselves, in the same memory allocation).  This is also the struct
 * in which the message lives while it's waiting to be processed by message
 * threads.
 */
typedef struct __repmgr_message {
	STAILQ_ENTRY(__repmgr_message) entries;
	__repmgr_msg_hdr_args msg_hdr;
	union {
		struct {
			int originating_eid;
			DBT control, rec;
		} repmsg;
		struct {
			REPMGR_CONNECTION *conn;
			DBT request;
		} gmdb_msg;
		struct {
			/*
			 * Connection from which the message arrived; NULL if
			 * generated on the local site.
			 */
			REPMGR_CONNECTION *conn;

			DBT buf;	/* for reading */
			DBT segments[1]; /* expanded in msg th. before callbk */
		} appmsg;
	} v;			/* Variants */
} REPMGR_MESSAGE;

typedef enum {
	SIZES_PHASE,
	DATA_PHASE
} phase_t;

typedef enum {
	APP_CONNECTION,
	REP_CONNECTION,
	UNKNOWN_CONN_TYPE
} conn_type_t;

struct __repmgr_connection {
	TAILQ_ENTRY(__repmgr_connection) entries;

	socket_t fd;
#ifdef DB_WIN32
	WSAEVENT event_object;
#endif

	/*
	 * Number of other structures referring to this conn struct.  This
	 * ref_count must be reduced to zero before this conn struct can be
	 * destroyed.  Referents include:
	 *
	 * - the select() loop, which owns the right to do all reading, as well
	 *   as the exclusive right to eventually close the socket
	 *
	 * - a "channel" that owns this APP_CONNECTION (on the originating side)
	 *
	 * - a message received on this APP_CONNECTION, queued for processing
	 *
	 * - any writer blocked on waiting for the outbound queue to drain
	 */
	u_int32_t ref_count;

	conn_type_t type;
	u_int32_t version;	/* Wire protocol version on this connection. */
				/* (0 means not yet determined.) */

/*
 * When we make an outgoing connection, it starts in CONNECTED state.  When we
 * get the response to our version negotiation, we move to READY.
 *     For incoming connections that we accept, we start in NEGOTIATE, then to
 * PARAMETERS, and then to READY.
 *     CONGESTED is a hierarchical substate of READY: it's just like READY, with
 * the additional wrinkle that we don't bother waiting for the outgoing queue to
 * drain in certain circumstances.
 */
#define	CONN_CONGESTED	1	/* Long-lived full outgoing queue. */
#define	CONN_CONNECTED	2	/* Awaiting reply to our version negotiation. */
#define	CONN_DEFUNCT	3	/* Basically dead, awaiting clean-up. */
#define	CONN_NEGOTIATE	4	/* Awaiting version proposal. */
#define	CONN_PARAMETERS	5	/* Awaiting parameters handshake. */
#define	CONN_READY	6	/* Everything's fine. */
	int state;

	/*
	 * Input: while we're reading a message, we keep track of what phase
	 * we're in.  In both phases, we use a REPMGR_IOVECS to keep track of
	 * our progress within the phase.  Depending upon the message type, we
	 * end up with either a rep_message (which is a wrapper for the control
	 * and rec DBTs), or a single generic DBT.
	 *     Any time we're in DATA_PHASE, it means we have already received
	 * the message header (consisting of msg_type and 2 sizes), and
	 * therefore we have allocated buffer space to read the data.  (This is
	 * important for resource clean-up.)
	 */
	phase_t reading_phase;
	REPMGR_IOVECS iovecs;

	u_int8_t msg_type;
	u_int8_t msg_hdr_buf[__REPMGR_MSG_HDR_SIZE];

	union {
		REPMGR_MESSAGE *rep_message;
		struct {
			DBT cntrl, rec;
		} repmgr_msg;
	} input;

	/*
	 * Output: usually we just simply write messages right in line, in the
	 * send() function's thread.  But if TCP doesn't have enough network
	 * buffer space for us when we first try it, we instead allocate some
	 * memory, and copy the message, and then send it as space becomes
	 * available in our main select() thread.  In some cases, if the queue
	 * gets too long we wait until it's drained, and then append to it.
	 * This condition variable's associated mutex is the normal per-repmgr
	 * db_rep->mutex, because that mutex is always held anyway whenever the
	 * output queue is consulted.
	 */
	OUT_Q_HEADER outbound_queue;
	int out_queue_length;
	cond_var_t drained;

	/* =-=-=-=-= app-channel stuff =-=-=-=-= */
	waiter_t response_waiters;

	/*
	 * Array of info about pending responses to requests.  This info is here
	 * (rather than on the stack of the thread calling send_request())
	 * because it provides an easy way to allocate available numbers for
	 * message tags, and also so that we can easily find the right info when
	 * we get the tag back in the msg header of the response.
	 */
	REPMGR_RESPONSE *responses;
	u_int32_t aresp;	/* Array size. */
	u_int32_t cur_resp;	/* Index of response currently reading. */

	/* =-=-=-=-= for normal repmgr connections =-=-=-=-= */
	/*
	 * Generally on a REP_CONNECTION type, we have an associated EID (which
	 * is an index into the sites array, by the way).  When we initiate the
	 * connection ("outgoing"), we know from the start what the EID is; the
	 * connection struct is linked from the site struct.  On the other hand,
	 * when we receive an incoming connection, we don't know at first what
	 * site it may be associated with (or even whether it's an
	 * APP_CONNECTION or REP_CONNECTION, for that matter).  During that
	 * initial uncertain time, the eid is -1.  Also, when a connection
	 * becomes defunct, but the conn struct hasn't yet been destroyed, the
	 * eid also becomes -1.
	 *
	 * The eid should be -1 if and only if the connection is on the orphans
	 * list.
	 */
	int eid;

};

#define	IS_READY_STATE(s)	((s) == CONN_READY || (s) == CONN_CONGESTED)

#ifdef HAVE_GETADDRINFO
typedef struct addrinfo	ADDRINFO;
typedef struct sockaddr_storage ACCEPT_ADDR;
#else
typedef struct sockaddr_in ACCEPT_ADDR;
/*
 * Some windows platforms have getaddrinfo (Windows XP), some don't.  We don't
 * support conditional compilation in our Windows build, so we always use our
 * own getaddrinfo implementation.  Rename everything so that we don't collide
 * with the system libraries.
 */
#undef	AI_PASSIVE
#define	AI_PASSIVE	0x01
#undef	AI_CANONNAME
#define	AI_CANONNAME	0x02
#undef	AI_NUMERICHOST
#define	AI_NUMERICHOST	0x04

typedef struct __addrinfo {
	int ai_flags;		/* AI_PASSIVE, AI_CANONNAME, AI_NUMERICHOST */
	int ai_family;		/* PF_xxx */
	int ai_socktype;	/* SOCK_xxx */
	int ai_protocol;	/* 0 or IPPROTO_xxx for IPv4 and IPv6 */
	size_t ai_addrlen;	/* length of ai_addr */
	char *ai_canonname;	/* canonical name for nodename */
	struct sockaddr *ai_addr;	/* binary address */
	struct __addrinfo *ai_next;	/* next structure in linked list */
} ADDRINFO;
#endif /* HAVE_GETADDRINFO */

/*
 * Unprocessed network address configuration.
 */
typedef struct {
	roff_t host;		/* Separately allocated copy of string. */
	u_int16_t port;		/* Stored in plain old host-byte-order. */
} SITEADDR;

/*
 * Site information, as stored in shared region.
 */
typedef struct {
	SITEADDR addr;		/* Unprocessed network address of site. */
	u_int32_t config;	/* Configuration flags: peer, helper, etc. */
	u_int32_t status;	/* Group membership status. */
} SITEINFO;

/*
 * A site address, as stored locally.
 */
typedef struct {
	char *host;		/* Separately allocated copy of string. */
	u_int16_t port;		/* Stored in plain old host-byte-order. */
} repmgr_netaddr_t;

/*
 * We store site structs in a dynamically allocated, growable array, indexed by
 * EID.  We allocate EID numbers for all sites simply according to their
 * index within this array.
 */
#define	SITE_FROM_EID(eid)	(&db_rep->sites[eid])
#define	EID_FROM_SITE(s)	((int)((s) - (&db_rep->sites[0])))
#define	IS_VALID_EID(e)		((e) >= 0)
#define	IS_KNOWN_REMOTE_SITE(e)	((e) >= 0 && ((e) != db_rep->self_eid) &&     \
	(((u_int)(e)) < db_rep->site_cnt))
#define	FOR_EACH_REMOTE_SITE_INDEX(i)                                         \
	for ((i) = (db_rep->self_eid == 0 ? 1 : 0);                           \
	     ((u_int)i) < db_rep->site_cnt;                                   \
	     (int)(++(i)) == db_rep->self_eid ? ++(i) : i)

struct __repmgr_site {
	repmgr_netaddr_t net_addr;

	/*
	 * Group membership status: a copy of the status from the membership
	 * database, or the out-of-band value 0, meaning that it doesn't exist.
	 * We keep track of a "non-existent" site because the associated
	 * host/port network address is promised to be associated with the
	 * locally known EID for the life of the environment.
	 */
	u_int32_t membership;	/* Status flags from GMDB. */
	u_int32_t config;	/* Flags from site->set_config() */

	/*
	 * Everything below here is applicable only to remote sites.
	 */
	DB_LSN max_ack;		/* Best ack we've heard from this site. */
	int ack_policy;		/* Or 0 if unknown. */
	u_int16_t alignment;	/* Requirements for app channel msgs. */
	db_timespec last_rcvd_timestamp;

	/* Contents depends on state. */
	struct {
		struct {		/* when CONNECTED */
			/*
			 * The only time we ever have two connections is in case
			 * of a "collision" on the "server" side.  In that case,
			 * the incoming connection either will be closed
			 * promptly by the remote "client", or it is a half-open
			 * connection due to the remote client system having
			 * crashed and rebooted, in which case KEEPALIVE will
			 * eventually clear it.
			 */
			REPMGR_CONNECTION *in;	/* incoming connection */
			REPMGR_CONNECTION *out;	/* outgoing connection */
		} conn;
		REPMGR_RETRY *retry; /* when PAUSING */
		/* Unused when CONNECTING. */
	} ref;

	/*
	 * Subordinate connections (connections from subordinate processes at a
	 * multi-process site).  Note that the SITE_CONNECTED state, and all the
	 * ref.retry stuff above is irrelevant to subordinate connections.  If a
	 * connection is on this list, it exists; and we never bother trying to
	 * reconnect lost connections (indeed we can't, for these are always
	 * incoming-only).
	 */
	CONNECTION_LIST sub_conns;
	REPMGR_RUNNABLE *connector;	/* Thread to open a connection. */

#define	SITE_CONNECTED 1	/* We have a (main) connection. */
#define	SITE_CONNECTING 2	/* Trying to establish (main) connection. */
#define	SITE_IDLE 3		/* Doing nothing. */
#define	SITE_PAUSING 4		/* Waiting til time to retry connecting. */
	int state;

#define	SITE_HAS_PRIO 0x01	/* Set if "electable" flag bit is valid. */
#define	SITE_ELECTABLE 0x02
#define	SITE_TOUCHED 0x04	/* Seen GMDB record during present scan. */
	u_int32_t flags;
};

/*
 * Flag values for the public DB_SITE handle.
 */
#define	DB_SITE_PREOPEN	0x01	/* Provisional EID; may change at env open. */

struct __repmgr_response {
	DBT dbt;
	int ret;

#define	RESP_COMPLETE		0x01
#define	RESP_DUMMY_BUF		0x02
#define	RESP_IN_USE		0x04
#define	RESP_READING		0x08
#define	RESP_THREAD_WAITING	0x10
	u_int32_t flags;
};

/*
 * Private structure for managing comms "channels."  This is separate from
 * DB_CHANNEL so as to avoid dragging in other private structures (e.g.,
 * REPMGR_CONNECTION) into db.h, similar to the relationship between DB_ENV and
 * ENV.
 */
struct __channel {
	DB_CHANNEL *db_channel;
	ENV *env;

	union {
		/* For simple, specific-EID channels. */
		REPMGR_CONNECTION *conn;

		/* For EID_MASTER or EID_BROADCAST channels. */
		struct {
			mgr_mutex_t *mutex; /* For connection establishment. */
			REPMGR_CONNECTION **array;
			u_int32_t cnt;
		} conns;
	} c;
	REPMGR_MESSAGE *msg;	/* Incoming channel only; NULL otherwise. */
	int responded;		/* Boolean flag. */
	__repmgr_msg_metadata_args *meta;

	/* Used only in send-to-self request case. */
	struct __repmgr_response response;
};

/*
 * Repmgr keeps track of references to connection information (instances
 * of struct __repmgr_connection).  There are three kinds of places
 * connections may be found: (1) SITE->ref.conn, (2) SITE->sub_conns, and
 * (3) db_rep->connections.
 *
 * 1. SITE->ref.conn points to our connection with the main process running
 *    at the given site, if such a connection exists.  We may have initiated
 *    the connection to the site ourselves, or we may have received it as an
 *    incoming connection.  Once it is established there is very little
 *    difference between those two cases.
 *
 * 2. SITE->sub_conns is a list of connections we have with subordinate
 *    processes running at the given site.  There can be any number of these
 *    connections, one per subordinate process.  Note that these connections
 *    are always incoming: there's no way for us to initiate this kind of
 *    connection because subordinate processes do not "listen".
 *
 * 3. The db_rep->connections list contains the references to any
 *    connections that are not actively associated with any site (we
 *    sometimes call these "orphans").  There are two times when this can
 *    be:
 *
 *    a) When we accept an incoming connection, we don't know what site it
 *       comes from until we read the initial handshake message.
 *
 *    b) When an error occurs on a connection, we first mark it as DEFUNCT
 *       and stop using it.  Then, at a later, well-defined time, we close
 *       the connection's file descriptor and get rid of the connection
 *       struct.
 *
 * In light of the above, we can see that the following describes the
 * rules for how connections may be moved among these three kinds of
 * "places":
 *
 * - when we initiate an outgoing connection, we of course know what site
 *   it's going to be going to, and so we immediately put the pointer to
 *   the connection struct into SITE->ref.conn
 *
 * - when we accept an incoming connection, we don't immediately know
 *   whom it's from, so we have to put it on the orphans list
 *   (db_rep->connections).
 *
 * - (incoming, cont.) But as soon as we complete the initial "handshake"
 *   message exchange, we will know which site it's from and whether it's
 *   a subordinate or main connection.  At that point we remove it from
 *   db_rep->connections and either point to it by SITE->ref.conn, or add
 *   it to the SITE->sub_conns list.
 *
 * - (for any active connection) when an error occurs, we move the
 *   connection to the orphans list until we have a chance to close it.
 */

/*
 * Repmgr message formats.
 *
 * Declarative definitions of current message formats appear in repmgr.msg.
 * (The s_message/gen_msg.awk utility generates C code.)  In general, we send
 * the buffers marshaled from those structure formats in the "control" portion
 * of a message.
 *
 * Each message is prefaced by a 9-byte message header (as described in
 * repmgr_net.c).  Different message types use the two available 32-bit integers
 * in different ways, as codified here:
 */
#define	REPMGR_HDR1(hdr)	((hdr).word1)
#define	REPMGR_HDR2(hdr)	((hdr).word2)

/* REPMGR_APP_MESSAGE */
#define	APP_MSG_BUFFER_SIZE	REPMGR_HDR1
#define	APP_MSG_SEGMENT_COUNT	REPMGR_HDR2

/* REPMGR_REP_MESSAGE and the other traditional repmgr message types. */
#define	REP_MSG_CONTROL_SIZE	REPMGR_HDR1
#define	REP_MSG_REC_SIZE	REPMGR_HDR2

/* REPMGR_APP_RESPONSE */
#define	APP_RESP_BUFFER_SIZE	REPMGR_HDR1
#define	APP_RESP_TAG		REPMGR_HDR2

/* REPMGR_RESP_ERROR.  Note that a zero-length message body is implied. */
#define	RESP_ERROR_CODE		REPMGR_HDR1
#define	RESP_ERROR_TAG		REPMGR_HDR2

/* REPMGR_OWN_MSG */
#define	REPMGR_OWN_BUF_SIZE	REPMGR_HDR1
#define	REPMGR_OWN_MSG_TYPE	REPMGR_HDR2

/*
 * Flags for the handshake message.  As with repmgr message types, these values
 * are transmitted between sites, and must therefore be "frozen" permanently.
 * Names are alphabetized here for easy reference, but values reflect historical
 * usage.
 */
#define	APP_CHANNEL_CONNECTION	0x02	/* Connection used for app channel. */
#define	ELECTABLE_SITE		0x04
#define	REPMGR_SUBORDINATE	0x01	/* This is a subordinate connection. */

/*
 * Flags for application-message meta-data.
 */
#define	REPMGR_MULTI_RESP	0x01
#define	REPMGR_REQUEST_MSG_TYPE	0x02
#define	REPMGR_RESPONSE_LIMIT	0x04

/*
 * Legacy V1 handshake message format.  For compatibility, we send this as part
 * of version negotiation upon connection establishment.
 */
typedef struct {
	u_int32_t version;
	u_int16_t port;
	u_int32_t priority;
} DB_REPMGR_V1_HANDSHAKE;

/*
 * Storage formats.
 *
 * As with message formats, stored formats are defined in repmgr.msg.
 */
/*
 * Flags for the Group Membership data portion of a record.  Like message type
 * codes, these values are frozen across releases, in order to avoid pointless
 * churn.
 */
#define	SITE_ADDING	0x01
#define	SITE_DELETING	0x02
#define	SITE_PRESENT	0x04

/*
 * Message types whose processing could take a long time.  We're careful to
 * avoid using up all our message processing threads on these message types, so
 * that we don't starve out the more important rep messages.
 */
#define	IS_DEFERRABLE(t) ((t) == REPMGR_OWN_MSG || (t) == REPMGR_APP_MESSAGE)
/*
 * When using leases there are times when a thread processing a message
 * must block, waiting for leases to be refreshed.  But refreshing the
 * leases requires another thread to accept the lease grant messages.
 */
#define	RESERVED_MSG_TH(env) (IS_USING_LEASES(env) ? 2 : 1)

#define	IS_SUBORDINATE(db_rep)	(db_rep->listen_fd == INVALID_SOCKET)

#define	IS_PEER_POLICY(p) ((p) == DB_REPMGR_ACKS_ALL_PEERS ||		\
    (p) == DB_REPMGR_ACKS_QUORUM ||					\
    (p) == DB_REPMGR_ACKS_ONE_PEER)

/*
 * Most of the code in repmgr runs while holding repmgr's main mutex, which
 * resides in db_rep->mutex.  This mutex is owned by a single repmgr process,
 * and serializes access to the (large) critical sections among threads in the
 * process.  Unlike many other mutexes in DB, it is specifically coded as either
 * a POSIX threads mutex or a Win32 mutex.  Note that although it's a large
 * fraction of the code, it's a tiny fraction of the time: repmgr spends most of
 * its time in a call to select(), and as well a bit in calls into the Base
 * replication API.  All of those release the mutex.
 *     Access to repmgr's shared list of site addresses is protected by
 * another mutex: mtx_repmgr.  And, when changing space allocation for that site
 * list we conform to the convention of acquiring renv->mtx_regenv.  These are
 * less frequent of course.
 *     When it's necessary to acquire more than one of these mutexes, the
 * ordering priority (or "lock ordering protocol") is:
 *        db_rep->mutex (first)
 *        mtx_repmgr (briefly)
 *        mtx_regenv (last, and most briefly)
 *
 * There are also mutexes for app message "channels".  Each channel has a mutex,
 * which is used to serialize any connection re-establishment that may become
 * necessary during its lifetime (such as when a master changes).  This never
 * happens on a simple, specific-EID channel, but in other cases multiple app
 * threads could be making send_xxx() calls concurrently, and it would not do to
 * have two of them try to re-connect concurrently.
 *     When re-establishing a connection, the channel lock is held while
 * grabbing first the mtx_repmgr, and then the db_rep mutex (but not both
 * together).  I.e., we have:
 *        channel->mutex (first)
 *        [mtx_repmgr (very briefly)] and then [db_rep->mutex (very briefly)]
 */

#define	LOCK_MUTEX(m) do {						\
	if (__repmgr_lock_mutex(m) != 0)				\
		return (DB_RUNRECOVERY);				\
} while (0)

#define	UNLOCK_MUTEX(m) do {						\
	if (__repmgr_unlock_mutex(m) != 0)				\
		return (DB_RUNRECOVERY);				\
} while (0)

/* POSIX/Win32 socket (and other) portability. */
#ifdef DB_WIN32
#define	WOULDBLOCK		WSAEWOULDBLOCK
#undef	DB_REPMGR_EAGAIN

#define	net_errno		WSAGetLastError()
typedef int socklen_t;
typedef char * sockopt_t;
#define	sendsocket(s, buf, len, flags) send((s), (buf), (int)(len), (flags))

#define	iov_len len
#define	iov_base buf

typedef DWORD threadsync_timeout_t;

#define	REPMGR_INITED(db_rep)	(db_rep->signaler != NULL)
#else

#define	INVALID_SOCKET		-1
#define	SOCKET_ERROR		-1
#define	WOULDBLOCK		EWOULDBLOCK
#define	DB_REPMGR_EAGAIN	EAGAIN

#define	net_errno		errno
typedef void * sockopt_t;

#define	sendsocket(s, buf, len, flags) send((s), (buf), (len), (flags))
#define	closesocket(fd)		close(fd)

typedef struct timespec threadsync_timeout_t;

#define	REPMGR_INITED(db_rep)	(db_rep->read_pipe >= 0)
#endif

#define	SELECTOR_RUNNING(db_rep)	((db_rep)->selector != NULL)

/*
 * Generic definition of some action to be performed on each connection, in the
 * form of a call-back function.
 */
typedef int (*CONNECTION_ACTION) __P((ENV *, REPMGR_CONNECTION *, void *));

/*
 * Generic predicate to test a condition that a thread is waiting for.
 */
typedef int (*PREDICATE) __P((ENV *, void *));

#include "dbinc_auto/repmgr_ext.h"

#if defined(__cplusplus)
}
#endif
#endif /* !_DB_REPMGR_H_ */