1 /*- 2 * Copyright (c) 2006, 2020 Oracle and/or its affiliates. All rights reserved. 3 * 4 * See the file LICENSE for license information. 5 * 6 * $Id$ 7 */ 8 9 #ifndef _DB_REPMGR_H_ 10 #define _DB_REPMGR_H_ 11 12 #include "dbinc_auto/repmgr_automsg.h" 13 14 #if defined(__cplusplus) 15 extern "C" { 16 #endif 17 18 /* 19 * Replication Manager message format types. These few format codes identify 20 * enough information to describe, at the lowest level, how a message should be 21 * read from the wire, including how much memory should be allocated to hold the 22 * result. (Often we want to allocate more than just enough to hold the 23 * received bytes, if we know that we will need more during processing.) 24 * 25 * These values are transmitted between sites, even sites running differing BDB 26 * versions. Therefore, once assigned, the values are permanently "frozen". 27 * 28 * For example, in repmgr wire protocol version 1 the highest assigned message 29 * type value was 3, for REPMGR_REP_MESSAGE. Wire protocol version 2 added the 30 * HEARTBEAT message type (4). 31 * 32 * New message types added in later versions always get new (higher) values. We 33 * still list them in alphabetical order, for ease of reference. But this 34 * generally does not correspond to numerical order. 35 */ 36 #define REPMGR_APP_MESSAGE 5 /* Msg sent from app. on DB_CHANNEL. */ 37 #define REPMGR_APP_RESPONSE 6 /* Response to a channel request. */ 38 #define REPMGR_OWN_MSG 8 /* Repmgr's own messages, to peers. */ 39 #define REPMGR_HANDSHAKE 2 /* Connection establishment sequence. */ 40 #define REPMGR_HEARTBEAT 4 /* Monitor connection health. */ 41 #define REPMGR_PERMLSN 1 /* My perm LSN. */ 42 #define REPMGR_REP_MESSAGE 3 /* Normal replication message. */ 43 #define REPMGR_RESP_ERROR 7 /* Sys-gen'd error resp to request. */ 44 45 /* 46 * Largest known message type code known in each protocol version we support. 
47 * In protocol version one there were only three message types: 1, 2, and 3; so 48 * 3 was the max. In protocol version 2 we introduced heartbeats, type 4. 49 * (Protocol version 3 did not introduce any new message types.) In version 4 50 * we introduced a few more new message types, the largest of which had value 8. 51 * Protocol version 5 did not introduce any new message types, but changed 52 * the format of site info and membership data to support views. 53 * 54 * Protocol version 6 introduced preferred master mode, which added several 55 * new REPMGR_OWN messages. 56 * 57 * Protocol version 7 did not introduce any new message types, but changed the 58 * format of permlsn to improve group-aware log archiving and added new struct 59 * names for heartbeat and readonly_response payloads. 60 */ 61 #define REPMGR_MAX_V1_MSG_TYPE 3 62 #define REPMGR_MAX_V2_MSG_TYPE 4 63 #define REPMGR_MAX_V3_MSG_TYPE 4 64 #define REPMGR_MAX_V4_MSG_TYPE 8 65 #define REPMGR_MAX_V5_MSG_TYPE 8 66 #define REPMGR_MAX_V6_MSG_TYPE 8 67 #define REPMGR_MAX_V7_MSG_TYPE 8 68 #define HEARTBEAT_MIN_VERSION 2 69 #define CHANNEL_MIN_VERSION 4 70 #define CONN_COLLISION_VERSION 4 71 #define GM_MIN_VERSION 4 72 #define OWN_MIN_VERSION 4 73 #define VIEW_MIN_VERSION 5 74 #define PREFMAS_MIN_VERSION 6 75 76 /* The range of protocol versions we're willing to support. */ 77 #define DB_REPMGR_VERSION 7 78 #define DB_REPMGR_MIN_VERSION 2 79 80 /* 81 * For messages with the "REPMGR_OWN_MSG" format code, a message type (see 82 * REPMGR_OWN_MSG_TYPE, below) is included in the header. While at the lowest 83 * level, the format codes identify only enough to read and allocate memory, at 84 * the next higher level the following message type codes identify the content 85 * of the message: how to unmarshal and dispatch it. 86 * 87 * Like the message format types, these message type values should be 88 * permanently frozen. 
89 */ 90 #define REPMGR_CONNECT_REJECT 1 91 #define REPMGR_GM_FAILURE 2 92 #define REPMGR_GM_FORWARD 3 93 #define REPMGR_JOIN_REQUEST 4 94 #define REPMGR_JOIN_SUCCESS 5 95 #define REPMGR_PARM_REFRESH 6 96 #define REPMGR_REJOIN 7 97 #define REPMGR_REMOVE_REQUEST 8 98 #define REPMGR_REMOVE_SUCCESS 9 99 #define REPMGR_RESOLVE_LIMBO 10 100 #define REPMGR_SHARING 11 101 #define REPMGR_LSNHIST_REQUEST 12 102 #define REPMGR_LSNHIST_RESPONSE 13 103 #define REPMGR_PREFMAS_FAILURE 14 104 #define REPMGR_PREFMAS_SUCCESS 15 105 #define REPMGR_READONLY_MASTER 16 106 #define REPMGR_READONLY_RESPONSE 17 107 #define REPMGR_RESTART_CLIENT 18 108 109 /* 110 * Write forwarding definitions. 111 * 112 * This is always the first value in a repmgr write forwarding message. 113 * Testing this helps ensure that repmgr write forwarding code only 114 * attempts to process a write forwarding message and not some other 115 * type of repmgr application-defined message or random noise. 116 */ 117 #define REPMGR_WF_IDENTIFIER 64424 118 119 /* 120 * Values for write forwarded operations. These values should remain 121 * constant across BDB versions for backward compatibility. Add values 122 * for any new write forwarded operations to the end of this list. 123 */ 124 #define REPMGR_WF_SINGLE_DEL 1 125 #define REPMGR_WF_SINGLE_PUT 2 126 127 /* Current protocol version and a fast way to test for a supported message. */ 128 #define REPMGR_WF_VERSION 1 129 #define REPMGR_WF_MAX_V1_MSG_TYPE 2 130 131 /* These masks enable two small integers to share the space of a large one. */ 132 #define REPMGR_WF_LOWER_MASK 0x0000ffff 133 #define REPMGR_WF_UPPER_MASK 0xffff0000 134 135 /* The minimum and maximum number of DBTs in a write forwarding message. */ 136 #define REPMGR_WF_MIN_DBTS 5 137 #define REPMGR_WF_MAX_DBTS 6 138 139 /* Length of a string to which to dump a hex fileid value in verbose output. 
*/ 140 #define REPMGR_WF_FILEID_STRLEN 80 141 142 #define IS_USING_WRITE_FORWARDING(env) \ 143 (FLD_ISSET(((env)->rep_handle->region)->config, REP_C_FORWARD_WRITES)) 144 145 #define REPMGR_WF_DUMP_FILEID(fileid, i, str) \ 146 memset(str, 0, REPMGR_WF_FILEID_STRLEN); \ 147 for (i = 0; i < DB_FILE_ID_LEN; i++) \ 148 (void)sprintf(str, "%s%x ", str, fileid[i]); 149 150 /* 151 * Enable two u_int32_t numbers to share the space of a single u_int64_t. 152 * This helps the write forwarding protocol to use fewer iovec segments 153 * and stay within the bounds on systems where IOV_MAX is a small number 154 * (e.g. 16 on Solaris). 155 */ 156 typedef union { 157 u_int64_t unum64; 158 u_int32_t unum32[2]; 159 } wf_uint32_pair; 160 161 /* Detect inconsistencies between view callback and site's gmdb. */ 162 #define PARTICIPANT_TO_VIEW(db_rep, site) \ 163 ((db_rep)->partial && !FLD_ISSET((site)->gmdb_flags, SITE_VIEW)) 164 #define VIEW_TO_PARTICIPANT(db_rep, site) \ 165 (!(db_rep)->partial && FLD_ISSET((site)->gmdb_flags, SITE_VIEW)) 166 167 struct __repmgr_connection; 168 typedef struct __repmgr_connection REPMGR_CONNECTION; 169 struct __repmgr_queue; typedef struct __repmgr_queue REPMGR_QUEUE; 170 struct __queued_output; typedef struct __queued_output QUEUED_OUTPUT; 171 struct __repmgr_response; typedef struct __repmgr_response REPMGR_RESPONSE; 172 struct __repmgr_retry; typedef struct __repmgr_retry REPMGR_RETRY; 173 struct __repmgr_runnable; typedef struct __repmgr_runnable REPMGR_RUNNABLE; 174 struct __repmgr_site; typedef struct __repmgr_site REPMGR_SITE; 175 struct __cond_waiters_table; 176 typedef struct __cond_waiters_table COND_WAITERS_TABLE; 177 178 /* Current Group Membership DB format ID. 
*/ 179 #define REPMGR_GMDB_FMT_VERSION 2 180 #define REPMGR_GMDB_FMT_MIN_VERSION 1 181 182 #ifdef DB_WIN32 183 typedef SOCKET socket_t; 184 typedef HANDLE thread_id_t; 185 typedef HANDLE mgr_mutex_t; 186 typedef HANDLE cond_var_t; 187 188 typedef COND_WAITERS_TABLE *waiter_t; 189 typedef WSABUF db_iovec_t; 190 #else 191 typedef int socket_t; 192 typedef pthread_t thread_id_t; 193 typedef pthread_mutex_t mgr_mutex_t; 194 typedef pthread_cond_t cond_var_t; 195 typedef pthread_cond_t waiter_t; 196 typedef struct iovec db_iovec_t; 197 #endif 198 199 /* 200 * The (arbitrary) maximum number of outgoing messages we're willing to hold, on 201 * a queue per connection, waiting for TCP buffer space to become available in 202 * the kernel. Rather than exceeding this limit, we simply discard additional 203 * messages (since this is always allowed by the replication protocol). 204 * As a special dispensation, if a message is destined for a specific remote 205 * site (i.e., it's not a broadcast), then we first try blocking the sending 206 * thread, waiting for space to become available (though we only wait a limited 207 * time). This is so as to be able to handle the immediate flood of (a 208 * potentially large number of) outgoing messages that replication generates, in 209 * a tight loop, when handling PAGE_REQ, LOG_REQ and ALL_REQ requests. 210 */ 211 #define OUT_QUEUE_LIMIT 10 212 213 /* 214 * The system value is available from sysconf(_SC_HOST_NAME_MAX). 215 */ 216 #ifndef MAXHOSTNAMELEN 217 #define MAXHOSTNAMELEN 1025 218 #endif 219 220 /* A buffer big enough for the string "site host.domain.com:65535". */ 221 #define MAX_SITE_LOC_STRING (MAXHOSTNAMELEN+20) 222 typedef char SITE_STRING_BUFFER[MAX_SITE_LOC_STRING+1]; 223 224 #define MAX_MSG_BUF (__REPMGR_MAXMSG_SIZE + MAXHOSTNAMELEN + 1) 225 226 /* Default timeout values, in seconds. 
*/ 227 #define DB_REPMGR_DEFAULT_ACK_TIMEOUT (1 * US_PER_SEC) 228 #define DB_REPMGR_DEFAULT_CONNECTION_RETRY (30 * US_PER_SEC) 229 #define DB_REPMGR_DEFAULT_ELECTION_RETRY (10 * US_PER_SEC) 230 #define DB_REPMGR_DEFAULT_CHANNEL_TIMEOUT (5 * US_PER_SEC) 231 232 /* Default preferred master automatic configuration values. */ 233 #define DB_REPMGR_PREFMAS_ELECTION_RETRY (1 * US_PER_SEC) 234 #define DB_REPMGR_PREFMAS_HEARTBEAT_MONITOR (2 * US_PER_SEC) 235 #define DB_REPMGR_PREFMAS_HEARTBEAT_SEND (75 * (US_PER_SEC / 100)) 236 #define DB_REPMGR_PREFMAS_PRIORITY_CLIENT 75 237 #define DB_REPMGR_PREFMAS_PRIORITY_MASTER 200 238 239 /* Defaults for undocumented incoming queue maximum messages. */ 240 #define DB_REPMGR_DEFAULT_INQUEUE_MAX (100 * MEGABYTE) 241 #define DB_REPMGR_INQUEUE_REDZONE_PERCENT 85 242 243 typedef TAILQ_HEAD(__repmgr_conn_list, __repmgr_connection) CONNECTION_LIST; 244 typedef STAILQ_HEAD(__repmgr_out_q_head, __queued_output) OUT_Q_HEADER; 245 typedef TAILQ_HEAD(__repmgr_retry_q, __repmgr_retry) RETRY_Q_HEADER; 246 247 /* Information about threads managed by Replication Framework. */ 248 struct __repmgr_runnable { 249 ENV *env; 250 thread_id_t thread_id; 251 void *(*run) __P((void *)); 252 int finished; /* Boolean: thread is exiting, may be joined. */ 253 int quit_requested; /* Boolean: thread has been asked to quit. */ 254 #ifdef DB_WIN32 255 HANDLE quit_event; 256 #endif 257 union { 258 259 /* 260 * Options governing requested behavior of election thread. 261 */ 262 #define ELECT_F_CLIENT_RESTART 0x01 /* Do client restarts but no elections. */ 263 #define ELECT_F_EVENT_NOTIFY 0x02 /* Notify application of master failure. */ 264 #define ELECT_F_FAST 0x04 /* First election "fast" (n-1 trick). */ 265 #define ELECT_F_IMMED 0x08 /* Start with immediate election. */ 266 #define ELECT_F_INVITEE 0x10 /* Honor (remote) inviter's nsites. */ 267 #define ELECT_F_STARTUP 0x20 /* Observe repmgr_start() policy. */ 268 u_int32_t flags; 269 270 /* For connector thread. 
*/ 271 struct { 272 int eid; 273 #define CONNECT_F_REFRESH 0x01 /* New connection to replace old one. */ 274 u_int32_t flags; 275 } conn_th; 276 277 /* 278 * Args for other thread types can be added here in the future 279 * as needed. 280 */ 281 } args; 282 }; 283 284 /* 285 * Information about pending connection establishment retry operations. 286 * 287 * We keep these in order by time. This works, under the assumption that the 288 * DB_REP_CONNECTION_RETRY never changes once we get going (though that 289 * assumption is of course wrong, so this needs to be fixed). 290 * 291 * Usually, we put things onto the tail end of the list. But when we add a new 292 * site while threads are running, we trigger its first connection attempt by 293 * scheduling a retry for "0" microseconds from now, putting its retry element 294 * at the head of the list instead. 295 * 296 * TODO: I think this can be fixed by defining "time" to be the time the element 297 * was added (with some convention like "0" meaning immediate), rather than the 298 * deadline time. 299 */ 300 struct __repmgr_retry { 301 TAILQ_ENTRY(__repmgr_retry) entries; 302 int eid; 303 db_timespec time; 304 }; 305 306 /* 307 * We use scatter/gather I/O for both reading and writing. Repmgr messages 308 * (including rep messages) use 3 segments: envelope, control and rec. 309 * Application messages can have any number of segments (the number they 310 * specify, plus 1 for our envelope). REPMGR_IOVECS_ALLOC_SZ should (only) be 311 * used when n > 3. 312 */ 313 #define REPMGR_IOVECS_ALLOC_SZ(n) \ 314 (sizeof(REPMGR_IOVECS) + ((n) - MIN_IOVEC) * sizeof(db_iovec_t)) 315 typedef struct { 316 /* 317 * Index of the first iovec to be used. Initially of course this is 318 * zero. But as we progress through partial I/O transfers, it ends up 319 * pointing to the first iovec to be used on the next operation. 
320 */ 321 int offset; 322 323 /* 324 * Total number of pieces defined for this message; equal to the number 325 * of times add_buffer and/or add_dbt were called to populate it. We do 326 * *NOT* revise this as we go along. So subsequent I/O operations must 327 * use count-offset to get the number of active vector pieces still 328 * remaining. 329 */ 330 int count; 331 332 /* 333 * Total number of bytes accounted for in all the pieces of this 334 * message. We do *NOT* revise this as we go along. 335 */ 336 size_t total_bytes; 337 338 #define MIN_IOVEC 3 339 db_iovec_t vectors[MIN_IOVEC]; /* Variable length array. */ 340 } REPMGR_IOVECS; 341 342 typedef struct { 343 size_t length; /* number of bytes in data */ 344 int ref_count; /* # of sites' send queues pointing to us */ 345 u_int8_t data[1]; /* variable size data area */ 346 } REPMGR_FLAT; 347 348 struct __queued_output { 349 STAILQ_ENTRY(__queued_output) entries; 350 REPMGR_FLAT *msg; 351 size_t offset; 352 }; 353 354 /* 355 * The following is for input. Once we know the sizes of the pieces of an 356 * incoming message, we can create this struct (and also the data areas for the 357 * pieces themselves, in the same memory allocation). This is also the struct 358 * in which the message lives while it's waiting to be processed by message 359 * threads. 360 */ 361 typedef struct __repmgr_message { 362 STAILQ_ENTRY(__repmgr_message) entries; 363 size_t size; 364 __repmgr_msg_hdr_args msg_hdr; 365 union { 366 struct { 367 int originating_eid; 368 DBT control, rec; 369 } repmsg; 370 struct { 371 REPMGR_CONNECTION *conn; 372 DBT request; 373 } gmdb_msg; 374 struct { 375 /* 376 * Connection from which the message arrived; NULL if 377 * generated on the local site. 378 */ 379 REPMGR_CONNECTION *conn; 380 381 DBT buf; /* for reading */ 382 DBT segments[1]; /* expanded in msg th. 
before callbk */ 383 } appmsg; 384 } v; /* Variants */ 385 } REPMGR_MESSAGE; 386 387 typedef enum { 388 SIZES_PHASE, 389 DATA_PHASE 390 } phase_t; 391 392 typedef enum { 393 APP_CONNECTION, 394 REP_CONNECTION, 395 UNKNOWN_CONN_TYPE 396 } conn_type_t; 397 398 #if defined(HAVE_REPMGR_SSL) 399 typedef struct __repmgr_ssl_write_sync_info { 400 char *rmgr_ssl_wbuf_ptr; 401 int rmgr_ssl_wbuf_length; 402 int rmgr_ssl_write_pending; 403 mgr_mutex_t *rmgr_ssl_write_mutex; 404 } REPMGR_SSL_WRITE_INFO; 405 406 typedef struct __repmgr_ssl__conn_info { 407 /* Only one thread can be interacting with the SSL object at a time. */ 408 mgr_mutex_t *repmgr_ssl_conn_mutex; 409 SSL *ssl; 410 411 #define REPMGR_SSL_READ_PENDING_ON_READ 0x1 412 #define REPMGR_SSL_READ_PENDING_ON_WRITE 0x2 413 #define REPMGR_SSL_WRITE_PENDING_ON_READ 0x4 414 #define REPMGR_SSL_WRITE_PENDING_ON_WRITE 0x8 415 int ssl_io_state; 416 417 REPMGR_SSL_WRITE_INFO *rmgr_ssl_wrinfo; 418 } REPMGR_SSL_CONN_INFO; 419 #endif 420 421 struct __repmgr_connection { 422 TAILQ_ENTRY(__repmgr_connection) entries; 423 424 socket_t fd; 425 #ifdef DB_WIN32 426 WSAEVENT event_object; 427 #endif 428 429 /* 430 * Number of other structures referring to this conn struct. This 431 * ref_count must be reduced to zero before this conn struct can be 432 * destroyed. Referents include: 433 * 434 * - the select() loop, which owns the right to do all reading, as well 435 * as the exclusive right to eventually close the socket 436 * 437 * - a "channel" that owns this APP_CONNECTION (on the originating side) 438 * 439 * - a message received on this APP_CONNECTION, queued for processing 440 * 441 * - any writer blocked on waiting for the outbound queue to drain 442 */ 443 u_int32_t ref_count; 444 445 conn_type_t type; 446 u_int32_t version; /* Wire protocol version on this connection. */ 447 /* (0 means not yet determined.) */ 448 449 /* 450 * When we make an outgoing connection, it starts in CONNECTED state. 
When we 451 * get the response to our version negotiation, we move to READY. 452 * For incoming connections that we accept, we start in NEGOTIATE, then to 453 * PARAMETERS, and then to READY. 454 * CONGESTED is a hierarchical substate of READY: it's just like READY, with 455 * the additional wrinkle that we don't bother waiting for the outgoing queue to 456 * drain in certain circumstances. 457 */ 458 #define CONN_CONGESTED 1 /* Long-lived full outgoing queue. */ 459 #define CONN_CONNECTED 2 /* Awaiting reply to our version negotiation. */ 460 #define CONN_DEFUNCT 3 /* Basically dead, awaiting clean-up. */ 461 #define CONN_NEGOTIATE 4 /* Awaiting version proposal. */ 462 #define CONN_PARAMETERS 5 /* Awaiting parameters handshake. */ 463 #define CONN_READY 6 /* Everything's fine. */ 464 int state; 465 u_int32_t auto_takeover;/* Connection to remote listener candidate. */ 466 467 /* 468 * Input: while we're reading a message, we keep track of what phase 469 * we're in. In both phases, we use a REPMGR_IOVECS to keep track of 470 * our progress within the phase. Depending upon the message type, we 471 * end up with either a rep_message (which is a wrapper for the control 472 * and rec DBTs), or a single generic DBT. 473 * Any time we're in DATA_PHASE, it means we have already received 474 * the message header (consisting of msg_type and 2 sizes), and 475 * therefore we have allocated buffer space to read the data. (This is 476 * important for resource clean-up.) 477 */ 478 phase_t reading_phase; 479 REPMGR_IOVECS iovecs; 480 481 u_int8_t msg_type; 482 u_int8_t msg_hdr_buf[__REPMGR_MSG_HDR_SIZE]; 483 484 union { 485 REPMGR_MESSAGE *rep_message; 486 struct { 487 DBT cntrl, rec; 488 } repmgr_msg; 489 } input; 490 491 /* 492 * Output: usually we just simply write messages right in line, in the 493 * send() function's thread. 
But if TCP doesn't have enough network 494 * buffer space for us when we first try it, we instead allocate some 495 * memory, and copy the message, and then send it as space becomes 496 * available in our main select() thread. In some cases, if the queue 497 * gets too long we wait until it's drained, and then append to it. 498 * This condition variable's associated mutex is the normal per-repmgr 499 * db_rep->mutex, because that mutex is always held anyway whenever the 500 * output queue is consulted. 501 */ 502 OUT_Q_HEADER outbound_queue; 503 int out_queue_length; 504 cond_var_t drained; 505 506 /* =-=-=-=-= app-channel stuff =-=-=-=-= */ 507 waiter_t response_waiters; 508 509 /* 510 * Array of info about pending responses to requests. This info is here 511 * (rather than on the stack of the thread calling send_request()) 512 * because it provides an easy way to allocate available numbers for 513 * message tags, and also so that we can easily find the right info when 514 * we get the tag back in the msg header of the response. 515 */ 516 REPMGR_RESPONSE *responses; 517 u_int32_t aresp; /* Array size. */ 518 u_int32_t cur_resp; /* Index of response currently reading. */ 519 520 /* =-=-=-=-= for normal repmgr connections =-=-=-=-= */ 521 /* 522 * Generally on a REP_CONNECTION type, we have an associated EID (which 523 * is an index into the sites array, by the way). When we initiate the 524 * connection ("outgoing"), we know from the start what the EID is; the 525 * connection struct is linked from the site struct. On the other hand, 526 * when we receive an incoming connection, we don't know at first what 527 * site it may be associated with (or even whether it's an 528 * APP_CONNECTION or REP_CONNECTION, for that matter). During that 529 * initial uncertain time, the eid is -1. Also, when a connection 530 * becomes defunct, but the conn struct hasn't yet been destroyed, the 531 * eid also becomes -1. 
532 * 533 * The eid should be -1 if and only if the connection is on the orphans 534 * list. 535 */ 536 int eid; 537 538 #if defined(HAVE_REPMGR_SSL) 539 /* For ssl connection information. */ 540 REPMGR_SSL_CONN_INFO *repmgr_ssl_info; 541 #endif 542 543 ENV *env; 544 }; 545 546 #define IS_READY_STATE(s) ((s) == CONN_READY || (s) == CONN_CONGESTED) 547 548 #ifdef HAVE_GETADDRINFO 549 typedef struct addrinfo ADDRINFO; 550 typedef struct sockaddr_storage ACCEPT_ADDR; 551 #else 552 typedef struct sockaddr_in ACCEPT_ADDR; 553 /* 554 * Some windows platforms have getaddrinfo (Windows XP), some don't. We don't 555 * support conditional compilation in our Windows build, so we always use our 556 * own getaddrinfo implementation. Rename everything so that we don't collide 557 * with the system libraries. 558 */ 559 #undef AI_PASSIVE 560 #define AI_PASSIVE 0x01 561 #undef AI_CANONNAME 562 #define AI_CANONNAME 0x02 563 #undef AI_NUMERICHOST 564 #define AI_NUMERICHOST 0x04 565 566 typedef struct __addrinfo { 567 int ai_flags; /* AI_PASSIVE, AI_CANONNAME, AI_NUMERICHOST */ 568 int ai_family; /* PF_xxx */ 569 int ai_socktype; /* SOCK_xxx */ 570 int ai_protocol; /* 0 or IPPROTO_xxx for IPv4 and IPv6 */ 571 size_t ai_addrlen; /* length of ai_addr */ 572 char *ai_canonname; /* canonical name for nodename */ 573 struct sockaddr *ai_addr; /* binary address */ 574 struct __addrinfo *ai_next; /* next structure in linked list */ 575 } ADDRINFO; 576 #endif /* HAVE_GETADDRINFO */ 577 578 /* 579 * Unprocessed network address configuration. 580 */ 581 typedef struct { 582 roff_t host; /* Separately allocated copy of string. */ 583 u_int16_t port; /* Stored in plain old host-byte-order. */ 584 } SITEADDR; 585 586 /* 587 * Site information, as stored in shared region. 588 */ 589 typedef struct { 590 SITEADDR addr; /* Unprocessed network address of site. */ 591 u_int32_t config; /* Configuration flags: peer, helper, etc. */ 592 u_int32_t status; /* Group membership status. 
*/ 593 u_int32_t flags; /* Group membership flags. */ 594 u_int32_t listener_cand;/* Number of listener candidates of site. */ 595 } SITEINFO; 596 597 /* 598 * A site address, as stored locally. 599 */ 600 typedef struct { 601 char *host; /* Separately allocated copy of string. */ 602 u_int16_t port; /* Stored in plain old host-byte-order. */ 603 } repmgr_netaddr_t; 604 605 /* 606 * We store site structs in a dynamically allocated, growable array, indexed by 607 * EID. We allocate EID numbers for all sites simply according to their 608 * index within this array. 609 */ 610 #define SITE_FROM_EID(eid) (&db_rep->sites[eid]) 611 #define EID_FROM_SITE(s) ((int)((s) - (&db_rep->sites[0]))) 612 #define IS_VALID_EID(e) ((e) >= 0) 613 #define IS_KNOWN_REMOTE_SITE(e) ((e) >= 0 && ((e) != db_rep->self_eid) && \ 614 (((u_int)(e)) < db_rep->site_cnt)) 615 #define FOR_EACH_REMOTE_SITE_INDEX(i) \ 616 for ((i) = (db_rep->self_eid == 0 ? 1 : 0); \ 617 ((u_int)i) < db_rep->site_cnt; \ 618 (int)(++(i)) == db_rep->self_eid ? ++(i) : i) 619 620 /* 621 * Enable replication manager auto listener takeover. 622 */ 623 #define HAVE_REPLICATION_LISTENER_TAKEOVER 1 624 625 /* Listener candidate, that is subordinate rep-aware process. */ 626 #define IS_LISTENER_CAND(db_rep) \ 627 (FLD_ISSET((db_rep)->region->config, REP_C_AUTOTAKEOVER) && \ 628 IS_SUBORDINATE(db_rep) && (db_rep)->repmgr_status == running) 629 630 /* 631 * The number of listener candidates for each remote site is maintained in 632 * the listener process and used in subordinate rep-aware processes. 
633 */ 634 #define SET_LISTENER_CAND(cond, op) \ 635 do { \ 636 if (FLD_ISSET(rep->config, REP_C_AUTOTAKEOVER) && \ 637 !IS_SUBORDINATE(db_rep) && (cond)) { \ 638 MUTEX_LOCK(env, rep->mtx_repmgr); \ 639 sites = R_ADDR(env->reginfo, rep->siteinfo_off);\ 640 (sites[eid].listener_cand)op; \ 641 MUTEX_UNLOCK(env, rep->mtx_repmgr); \ 642 } \ 643 } while (0) 644 645 #define CHECK_LISTENER_CAND(val, op, tval, fval) \ 646 do { \ 647 if (IS_LISTENER_CAND(db_rep)) { \ 648 MUTEX_LOCK(env, rep->mtx_repmgr); \ 649 sites = R_ADDR(env->reginfo, rep->siteinfo_off);\ 650 val = ((sites[eid].listener_cand)op) ? \ 651 (tval) : (fval); \ 652 MUTEX_UNLOCK(env, rep->mtx_repmgr); \ 653 } \ 654 } while (0) 655 656 struct __repmgr_site { 657 repmgr_netaddr_t net_addr; 658 659 /* 660 * Group membership status: a copy of the status from the membership 661 * database, or the out-of-band value 0, meaning that it doesn't exist. 662 * We keep track of a "non-existent" site because the associated 663 * host/port network address is promised to be associated with the 664 * locally known EID for the life of the environment. 665 */ 666 u_int32_t membership; /* Status value from GMDB. */ 667 u_int32_t gmdb_flags; /* Flags from GMDB. */ 668 u_int32_t config; /* Flags from site->set_config() */ 669 670 /* 671 * Everything below here is applicable only to remote sites. 672 */ 673 u_int32_t max_ack_gen; /* Master generation for max_ack. */ 674 DB_LSN max_ack; /* Best ack we've heard from this site. */ 675 DB_LSN max_ckp_lsn; /* Best ckp_lsn we've heard from this site. */ 676 int ack_policy; /* Or 0 if unknown. */ 677 u_int16_t alignment; /* Requirements for app channel msgs. */ 678 db_timespec last_rcvd_timestamp; 679 680 /* Contents depends on state. */ 681 struct { 682 struct { /* when CONNECTED */ 683 /* 684 * The only time we ever have two connections is in case 685 * of a "collision" on the "server" side. 
In that case, 686 * the incoming connection either will be closed 687 * promptly by the remote "client", or it is a half-open 688 * connection due to the remote client system having 689 * crashed and rebooted, in which case KEEPALIVE will 690 * eventually clear it. 691 */ 692 REPMGR_CONNECTION *in; /* incoming connection */ 693 REPMGR_CONNECTION *out; /* outgoing connection */ 694 } conn; 695 REPMGR_RETRY *retry; /* when PAUSING */ 696 /* Unused when CONNECTING. */ 697 } ref; 698 699 /* 700 * Subordinate connections (connections from subordinate processes at a 701 * multi-process site). Note that the SITE_CONNECTED state, and all the 702 * ref.retry stuff above is irrelevant to subordinate connections. If a 703 * connection is on this list, it exists; and we never bother trying to 704 * reconnect lost connections (indeed we can't, for these are always 705 * incoming-only). 706 */ 707 CONNECTION_LIST sub_conns; 708 REPMGR_RUNNABLE *connector; /* Thread to open a connection. */ 709 710 #define SITE_CONNECTED 1 /* We have a (main) connection. */ 711 #define SITE_CONNECTING 2 /* Trying to establish (main) connection. */ 712 #define SITE_IDLE 3 /* Doing nothing. */ 713 #define SITE_PAUSING 4 /* Waiting til time to retry connecting. */ 714 int state; 715 716 #define SITE_HAS_PRIO 0x01 /* Set if "electable" flag bit is valid. */ 717 #define SITE_ELECTABLE 0x02 718 #define SITE_TOUCHED 0x04 /* Seen GMDB record during present scan. */ 719 u_int32_t flags; 720 }; 721 722 /* 723 * Flag values for the public DB_SITE handle. 724 */ 725 #define DB_SITE_PREOPEN 0x01 /* Provisional EID; may change at env open. */ 726 727 struct __repmgr_response { 728 DBT dbt; 729 int ret; 730 731 #define RESP_COMPLETE 0x01 732 #define RESP_DUMMY_BUF 0x02 733 #define RESP_IN_USE 0x04 734 #define RESP_READING 0x08 735 #define RESP_THREAD_WAITING 0x10 736 u_int32_t flags; 737 }; 738 739 /* 740 * Private structure for managing comms "channels." 
This is separate from 741 * DB_CHANNEL so as to avoid dragging in other private structures (e.g., 742 * REPMGR_CONNECTION) into db.h, similar to the relationship between DB_ENV and 743 * ENV. 744 */ 745 struct __channel { 746 DB_CHANNEL *db_channel; 747 ENV *env; 748 749 union { 750 /* For simple, specific-EID channels. */ 751 REPMGR_CONNECTION *conn; 752 753 /* For EID_MASTER or EID_BROADCAST channels. */ 754 struct { 755 mgr_mutex_t *mutex; /* For connection establishment. */ 756 REPMGR_CONNECTION **array; 757 u_int32_t cnt; 758 } conns; 759 } c; 760 REPMGR_MESSAGE *msg; /* Incoming channel only; NULL otherwise. */ 761 int responded; /* Boolean flag. */ 762 __repmgr_msg_metadata_args *meta; 763 764 /* Used only in send-to-self request case. */ 765 struct __repmgr_response response; 766 }; 767 768 /* 769 * Repmgr keeps track of references to connection information (instances 770 * of struct __repmgr_connection). There are three kinds of places 771 * connections may be found: (1) SITE->ref.conn, (2) SITE->sub_conns, and 772 * (3) db_rep->connections. 773 * 774 * 1. SITE->ref.conn points to our connection with the listener process 775 * running at the given site, if such a connection exists. We may have 776 * initiated the connection to the site ourselves, or we may have received 777 * it as an incoming connection. Once it is established there is very 778 * little difference between those two cases. 779 * 780 * 2. SITE->sub_conns is a list of connections we have with subordinate 781 * processes running at the given site. There can be any number of these 782 * connections, one per subordinate process. Note that these connections 783 * are always incoming: there's no way for us to initiate this kind of 784 * connection because subordinate processes do not "listen". 785 * 786 * 3. The db_rep->connections list contains the references to any 787 * connections that are not actively associated with any site (we 788 * sometimes call these "orphans"). 
*    There are two times when this can
 *    be:
 *
 *      a) When we accept an incoming connection, we don't know what site it
 *         comes from until we read the initial handshake message.
 *
 *      b) When an error occurs on a connection, we first mark it as DEFUNCT
 *         and stop using it.  Then, at a later, well-defined time, we close
 *         the connection's file descriptor and get rid of the connection
 *         struct.
 *
 * In light of the above, we can see that the following describes the
 * rules for how connections may be moved among these three kinds of
 * "places":
 *
 * - when we initiate an outgoing connection, we of course know what site
 *   it's going to be going to, and so we immediately put the pointer to
 *   the connection struct into SITE->ref.conn
 *
 * - when we accept an incoming connection, we don't immediately know
 *   whom it's from, so we have to put it on the orphans list
 *   (db_rep->connections).
 *
 * - (incoming, cont.) But as soon as we complete the initial "handshake"
 *   message exchange, we will know which site it's from and whether it's
 *   a subordinate or main connection.  At that point we remove it from
 *   db_rep->connections and either point to it by SITE->ref.conn, or add
 *   it to the SITE->sub_conns list.
 *
 * - (for any active connection) when an error occurs, we move the
 *   connection to the orphans list until we have a chance to close it.
 */

/*
 * Repmgr message formats.
 *
 * Declarative definitions of current message formats appear in repmgr.msg.
 * (The s_message/gen_msg.awk utility generates C code.)  In general, we send
 * the buffers marshaled from those structure formats in the "control" portion
 * of a message.
 *
 * Each message is prefaced by a 9-byte message header (as described in
 * repmgr_net.c).
Different message types use the two available 32-bit integers
 * in different ways, as codified here:
 */
/*
 * Raw accessors for the two header words.  "hdr" must be a structure
 * expression (not a pointer) having members word1 and word2.
 *
 * The per-message-type names that follow are object-like aliases for these
 * accessors, so they are invoked the same way: for example,
 * APP_MSG_BUFFER_SIZE(hdr) expands to ((hdr).word1).
 */
#define	REPMGR_HDR1(hdr)		((hdr).word1)
#define	REPMGR_HDR2(hdr)		((hdr).word2)

/* REPMGR_APP_MESSAGE */
#define	APP_MSG_BUFFER_SIZE		REPMGR_HDR1
#define	APP_MSG_SEGMENT_COUNT		REPMGR_HDR2

/* REPMGR_REP_MESSAGE and the other traditional repmgr message types. */
#define	REP_MSG_CONTROL_SIZE		REPMGR_HDR1
#define	REP_MSG_REC_SIZE		REPMGR_HDR2

/* REPMGR_APP_RESPONSE */
#define	APP_RESP_BUFFER_SIZE		REPMGR_HDR1
#define	APP_RESP_TAG			REPMGR_HDR2

/* REPMGR_RESP_ERROR.  Note that a zero-length message body is implied. */
#define	RESP_ERROR_CODE			REPMGR_HDR1
#define	RESP_ERROR_TAG			REPMGR_HDR2

/* REPMGR_OWN_MSG */
#define	REPMGR_OWN_BUF_SIZE		REPMGR_HDR1
#define	REPMGR_OWN_MSG_TYPE		REPMGR_HDR2

/*
 * Flags for the handshake message.  As with repmgr message types, these values
 * are transmitted between sites, and must therefore be "frozen" permanently.
 * Names are alphabetized here for easy reference, but values reflect historical
 * usage.
 *
 * Each value occupies a distinct bit.
 */
#define	APP_CHANNEL_CONNECTION	0x02	/* Connection used for app channel. */
#define	ELECTABLE_SITE		0x04
#define	REPMGR_AUTOTAKEOVER	0x08	/* Could become main connection. */
#define	REPMGR_SUBORDINATE	0x01	/* This is a subordinate connection. */

/*
 * Flags for application-message meta-data.
 *
 * Each value occupies a distinct bit.
 */
#define	REPMGR_MULTI_RESP	0x01
#define	REPMGR_REQUEST_MSG_TYPE	0x02
#define	REPMGR_RESPONSE_LIMIT	0x04

/*
 * Legacy V1 handshake message format.  This is still used in the repmgr
 * protocol for the initial handshake sent as part of version negotiation
 * upon connection establishment.  It is no longer needed for V1
 * compatibility because 4.6/V1 is no longer supported.
879 */ 880 typedef struct { 881 u_int32_t version; 882 u_int16_t port; 883 u_int32_t priority; 884 } DB_REPMGR_V1_HANDSHAKE; 885 886 /* 887 * Storage formats. 888 * 889 * As with message formats, stored formats are defined in repmgr.msg. 890 */ 891 /* 892 * Status values for the Group Membership data portion of a record. Like 893 * message type codes, these values are frozen across releases, in order to 894 * avoid pointless churn. These values are mutually exclusive. 895 */ 896 #define SITE_ADDING 0x01 897 #define SITE_DELETING 0x02 898 #define SITE_PRESENT 0x04 899 /* 900 * Flags for the Group Membership data portion of a record. These values are 901 * also frozen across releases. These values are bit fields and may be OR'ed 902 * together. 903 */ 904 #define SITE_VIEW 0x01 905 #define SITE_JOIN_ELECTABLE 0x02 906 907 /* 908 * Message types whose processing could take a long time. We're careful to 909 * avoid using up all our message processing threads on these message types, so 910 * that we don't starve out the more important rep messages. 911 */ 912 #define IS_DEFERRABLE(t) ((t) == REPMGR_OWN_MSG || (t) == REPMGR_APP_MESSAGE) 913 /* 914 * When using leases there are times when a thread processing a message 915 * must block, waiting for leases to be refreshed. But refreshing the 916 * leases requires another thread to accept the lease grant messages. 917 */ 918 #define RESERVED_MSG_TH(env) (IS_USING_LEASES(env) ? 2 : 1) 919 920 #define IS_SUBORDINATE(db_rep) (db_rep->listen_fd == INVALID_SOCKET) 921 922 #define IS_PEER_POLICY(p) ((p) == DB_REPMGR_ACKS_ALL_PEERS || \ 923 (p) == DB_REPMGR_ACKS_QUORUM || \ 924 (p) == DB_REPMGR_ACKS_ONE_PEER) 925 926 /* 927 * Most of the code in repmgr runs while holding repmgr's main mutex, which 928 * resides in db_rep->mutex. This mutex is owned by a single repmgr process, 929 * and serializes access to the (large) critical sections among threads in the 930 * process. 
Unlike many other mutexes in DB, it is specifically coded as either 931 * a POSIX threads mutex or a Win32 mutex. Note that although it's a large 932 * fraction of the code, it's a tiny fraction of the time: repmgr spends most of 933 * its time in a call to select(), and as well a bit in calls into the Base 934 * replication API. All of those release the mutex. 935 * Access to repmgr's shared values is protected by another mutex: 936 * mtx_repmgr. And, when changing space allocation for that site list 937 * we conform to the convention of acquiring renv->mtx_regenv. These are 938 * less frequent of course. 939 * When it's necessary to acquire more than one of these mutexes, the 940 * ordering priority (or "lock ordering protocol") is: 941 * db_rep->mutex (first) 942 * mtx_repmgr (briefly) 943 * mtx_regenv (last, and most briefly) 944 * 945 * There are also mutexes for app message "channels". Each channel has a mutex, 946 * which is used to serialize any connection re-establishment that may become 947 * necessary during its lifetime (such as when a master changes). This never 948 * happens on a simple, specific-EID channel, but in other cases multiple app 949 * threads could be making send_xxx() calls concurrently, and it would not do to 950 * have two of them try to re-connect concurrently. 951 * When re-establishing a connection, the channel lock is held while 952 * grabbing first the mtx_repmgr, and then the db_rep mutex (but not both 953 * together). I.e., we have: 954 * channel->mutex (first) 955 * [mtx_repmgr (very briefly)] and then [db_rep->mutex (very briefly)] 956 */ 957 958 #define LOCK_MUTEX(m) do { \ 959 if (__repmgr_lock_mutex(m) != 0) \ 960 return (DB_RUNRECOVERY); \ 961 } while (0) 962 963 #define UNLOCK_MUTEX(m) do { \ 964 if (__repmgr_unlock_mutex(m) != 0) \ 965 return (DB_RUNRECOVERY); \ 966 } while (0) 967 968 /* POSIX/Win32 socket (and other) portability. 
*/ 969 #ifdef DB_WIN32 970 #define WOULDBLOCK WSAEWOULDBLOCK 971 #undef DB_REPMGR_EAGAIN 972 973 #define net_errno WSAGetLastError() 974 typedef int socklen_t; 975 typedef char * sockopt_t; 976 #define sendsocket(s, buf, len, flags) send((s), (buf), (int)(len), (flags)) 977 978 #define iov_len len 979 #define iov_base buf 980 981 typedef DWORD threadsync_timeout_t; 982 983 #define REPMGR_INITED(db_rep) (db_rep->signaler != NULL) 984 #else 985 986 #define INVALID_SOCKET -1 987 #define SOCKET_ERROR -1 988 #define WOULDBLOCK EWOULDBLOCK 989 #define DB_REPMGR_EAGAIN EAGAIN 990 991 #define net_errno errno 992 typedef void * sockopt_t; 993 994 #define sendsocket(s, buf, len, flags) send((s), (buf), (len), (flags)) 995 #define closesocket(fd) close(fd) 996 997 typedef struct timespec threadsync_timeout_t; 998 999 #define REPMGR_INITED(db_rep) (db_rep->read_pipe >= 0) 1000 #endif 1001 1002 #define SELECTOR_RUNNING(db_rep) ((db_rep)->selector != NULL) 1003 1004 #define IS_REPMGR_SSL_ENABLED(env) \ 1005 (!FLD_ISSET(((env)->rep_handle->region)->config, REP_C_DISABLE_SSL)) 1006 1007 /* Macros for SSL Diagnostic messages. */ 1008 #if defined(HAVE_REPMGR_SSL) 1009 1010 #define SSL_DEBUG_SHUTDOWN(env, format,...) \ 1011 if (IS_REPMGR_SSL_ENABLED(env)) \ 1012 VPRINT(env, (env, \ 1013 DB_VERB_REPMGR_SSL_CONN|DB_VERB_REPMGR_SSL_ALL, \ 1014 format, ##__VA_ARGS__)) 1015 #define SSL_DEBUG_CONNECT(env, format,...) \ 1016 if (IS_REPMGR_SSL_ENABLED(env)) \ 1017 VPRINT(env, (env, \ 1018 DB_VERB_REPMGR_SSL_CONN|DB_VERB_REPMGR_SSL_ALL, \ 1019 format, ##__VA_ARGS__)) 1020 #define SSL_DEBUG_ACCEPT(env, format,...) \ 1021 if (IS_REPMGR_SSL_ENABLED(env)) \ 1022 VPRINT(env, (env, \ 1023 DB_VERB_REPMGR_SSL_CONN|DB_VERB_REPMGR_SSL_ALL, \ 1024 format, ##__VA_ARGS__)) 1025 #define SSL_DEBUG_WRITE(env, format,...) 
\ 1026 if (IS_REPMGR_SSL_ENABLED(env)) \ 1027 VPRINT(env, (env, \ 1028 DB_VERB_REPMGR_SSL_IO|DB_VERB_REPMGR_SSL_ALL, \ 1029 format, ##__VA_ARGS__)) 1030 #define SSL_DEBUG_READ(env, format,...) \ 1031 if (IS_REPMGR_SSL_ENABLED(env)) \ 1032 VPRINT(env, (env, \ 1033 DB_VERB_REPMGR_SSL_IO|DB_VERB_REPMGR_SSL_ALL, \ 1034 format, ##__VA_ARGS__)) 1035 1036 #else 1037 1038 #define SSL_DEBUG_SHUTDOWN(env, format,...) 1039 #define SSL_DEBUG_CONNECT(env, format,...) 1040 #define SSL_DEBUG_ACCEPT(env, format,...) 1041 #define SSL_DEBUG_WRITE(env, format,...) 1042 #define SSL_DEBUG_READ(env, format,...) 1043 1044 #endif 1045 1046 /* 1047 * Generic definition of some action to be performed on each connection, in the 1048 * form of a call-back function. 1049 */ 1050 typedef int (*CONNECTION_ACTION) __P((ENV *, REPMGR_CONNECTION *, void *)); 1051 1052 /* 1053 * Generic predicate to test a condition that a thread is waiting for. 1054 */ 1055 typedef int (*PREDICATE) __P((ENV *, void *)); 1056 1057 #include "dbinc_auto/repmgr_ext.h" 1058 1059 #if defined(__cplusplus) 1060 } 1061 #endif 1062 #endif /* !_DB_REPMGR_H_ */ 1063