1 /*-
2  * Copyright (c) 2006, 2020 Oracle and/or its affiliates.  All rights reserved.
3  *
4  * See the file LICENSE for license information.
5  *
6  * $Id$
7  */
8 
9 #ifndef _DB_REPMGR_H_
10 #define	_DB_REPMGR_H_
11 
12 #include "dbinc_auto/repmgr_automsg.h"
13 
14 #if defined(__cplusplus)
15 extern "C" {
16 #endif
17 
18 /*
19  * Replication Manager message format types.  These few format codes identify
20  * enough information to describe, at the lowest level, how a message should be
21  * read from the wire, including how much memory should be allocated to hold the
22  * result.  (Often we want to allocate more than just enough to hold the
23  * received bytes, if we know that we will need more during processing.)
24  *
25  * These values are transmitted between sites, even sites running differing BDB
26  * versions.  Therefore, once assigned, the values are permanently "frozen".
27  *
28  * For example, in repmgr wire protocol version 1 the highest assigned message
29  * type value was 3, for REPMGR_REP_MESSAGE.  Wire protocol version 2 added the
30  * HEARTBEAT message type (4).
31  *
32  * New message types added in later versions always get new (higher) values.  We
33  * still list them in alphabetical order, for ease of reference.  But this
34  * generally does not correspond to numerical order.
35  */
/* Message format codes, listed alphabetically; numeric values are frozen. */
#define	REPMGR_APP_MESSAGE	5	/* App-originated msg on a DB_CHANNEL. */
#define	REPMGR_APP_RESPONSE	6	/* Reply to a channel request. */
#define	REPMGR_HANDSHAKE	2	/* Connection establishment sequence. */
#define	REPMGR_HEARTBEAT	4	/* Connection health monitoring. */
#define	REPMGR_OWN_MSG		8	/* Repmgr's own messages, to peers. */
#define	REPMGR_PERMLSN		1	/* Sender's perm LSN. */
#define	REPMGR_REP_MESSAGE	3	/* Normal replication message. */
#define	REPMGR_RESP_ERROR	7	/* System-generated error reply. */
44 
45 /*
 * Largest message type code known in each protocol version we support.
47  * In protocol version one there were only three message types: 1, 2, and 3; so
48  * 3 was the max.  In protocol version 2 we introduced heartbeats, type 4.
49  * (Protocol version 3 did not introduce any new message types.)  In version 4
50  * we introduced a few more new message types, the largest of which had value 8.
51  * Protocol version 5 did not introduce any new message types, but changed
52  * the format of site info and membership data to support views.
53  *
54  * Protocol version 6 introduced preferred master mode, which added several
55  * new REPMGR_OWN messages.
56  *
57  * Protocol version 7 did not introduce any new message types, but changed the
58  * format of permlsn to improve group-aware log archiving and added new struct
59  * names for heartbeat and readonly_response payloads.
60  */
/* Highest message format code understood by each wire protocol version. */
#define	REPMGR_MAX_V1_MSG_TYPE	3
#define	REPMGR_MAX_V2_MSG_TYPE	4
#define	REPMGR_MAX_V3_MSG_TYPE	4
#define	REPMGR_MAX_V4_MSG_TYPE	8
#define	REPMGR_MAX_V5_MSG_TYPE	8
#define	REPMGR_MAX_V6_MSG_TYPE	8
#define	REPMGR_MAX_V7_MSG_TYPE	8

/* Lowest wire protocol version in which each feature may be used. */
#define	HEARTBEAT_MIN_VERSION	2
#define	CHANNEL_MIN_VERSION	4
#define	CONN_COLLISION_VERSION	4
#define	GM_MIN_VERSION		4
#define	OWN_MIN_VERSION		4
#define	VIEW_MIN_VERSION	5
#define	PREFMAS_MIN_VERSION	6

/* The range of protocol versions we're willing to support. */
#define	DB_REPMGR_MIN_VERSION	2
#define	DB_REPMGR_VERSION	7
79 
80 /*
81  * For messages with the "REPMGR_OWN_MSG" format code, a message type (see
82  * REPMGR_OWN_MSG_TYPE, below) is included in the header.  While at the lowest
83  * level, the format codes identify only enough to read and allocate memory, at
84  * the next higher level the following message type codes identify the content
85  * of the message: how to unmarshal and dispatch it.
86  *
87  * Like the message format types, these message type values should be
88  * permanently frozen.
89  */
/*
 * Sub-type codes for REPMGR_OWN_MSG messages.  Values are assigned
 * consecutively starting at 1; a new type must take the next unused value
 * rather than reuse or renumber an existing one.
 */
#define	REPMGR_CONNECT_REJECT		1
#define	REPMGR_GM_FAILURE		2
#define	REPMGR_GM_FORWARD		3
#define	REPMGR_JOIN_REQUEST		4
#define	REPMGR_JOIN_SUCCESS		5
#define	REPMGR_PARM_REFRESH		6
#define	REPMGR_REJOIN			7
#define	REPMGR_REMOVE_REQUEST		8
#define	REPMGR_REMOVE_SUCCESS		9
#define	REPMGR_RESOLVE_LIMBO		10
#define	REPMGR_SHARING			11
#define	REPMGR_LSNHIST_REQUEST		12
#define	REPMGR_LSNHIST_RESPONSE		13
#define	REPMGR_PREFMAS_FAILURE		14
#define	REPMGR_PREFMAS_SUCCESS		15
#define	REPMGR_READONLY_MASTER		16
#define	REPMGR_READONLY_RESPONSE	17
#define	REPMGR_RESTART_CLIENT		18
108 
109 /*
110  * Write forwarding definitions.
111  *
112  * This is always the first value in a repmgr write forwarding message.
113  * Testing this helps ensure that repmgr write forwarding code only
114  * attempts to process a write forwarding message and not some other
115  * type of repmgr application-defined message or random noise.
116  */
117 #define	REPMGR_WF_IDENTIFIER 64424
118 
119 /*
120  * Values for write forwarded operations.  These values should remain
121  * constant across BDB versions for backward compatibility.  Add values
122  * for any new write forwarded operations to the end of this list.
123  */
124 #define	REPMGR_WF_SINGLE_DEL 1
125 #define	REPMGR_WF_SINGLE_PUT 2
126 
127 /* Current protocol version and a fast way to test for a supported message. */
128 #define	REPMGR_WF_VERSION 1
129 #define	REPMGR_WF_MAX_V1_MSG_TYPE 2
130 
131 /* These masks enable two small integers to share the space of a large one. */
132 #define	REPMGR_WF_LOWER_MASK 0x0000ffff
133 #define	REPMGR_WF_UPPER_MASK 0xffff0000
134 
135 /* The minimum and maximum number of DBTs in a write forwarding message. */
136 #define	REPMGR_WF_MIN_DBTS 5
137 #define	REPMGR_WF_MAX_DBTS 6
138 
139 /* Length of a string to which to dump a hex fileid value in verbose output. */
140 #define	REPMGR_WF_FILEID_STRLEN 80
141 
/*
 * Non-zero when the REP_C_FORWARD_WRITES bit is set in the config word of the
 * replication region reached through env->rep_handle.
 */
#define	IS_USING_WRITE_FORWARDING(env)					\
	(FLD_ISSET(((env)->rep_handle->region)->config, REP_C_FORWARD_WRITES))
144 
/*
 * REPMGR_WF_DUMP_FILEID --
 *	Format the DB_FILE_ID_LEN bytes of "fileid" into "str" as lower-case
 *	hex values, each followed by a single space.
 *
 * fileid: array of at least DB_FILE_ID_LEN bytes.
 * i:	   caller-supplied integer lvalue used as the loop index; it is left
 *	   equal to DB_FILE_ID_LEN afterwards.
 * str:	   writable buffer of at least REPMGR_WF_FILEID_STRLEN bytes.
 *
 * Each value is appended at the current end of the string.  The previous
 * form passed "str" to sprintf() as both destination and "%s" source, which
 * is undefined behavior because the objects overlap.  Output is bounded by
 * REPMGR_WF_FILEID_STRLEN, and the body is wrapped in do/while(0) so the
 * macro acts as a single statement (invoke it with a trailing semicolon).
 */
#define	REPMGR_WF_DUMP_FILEID(fileid, i, str) do {			\
	size_t wf_dump_len_;						\
	memset((str), 0, REPMGR_WF_FILEID_STRLEN);			\
	for ((i) = 0; (i) < DB_FILE_ID_LEN; (i)++) {			\
		wf_dump_len_ = strlen(str);				\
		(void)snprintf((str) + wf_dump_len_,			\
		    REPMGR_WF_FILEID_STRLEN - wf_dump_len_,		\
		    "%x ", (unsigned int)(fileid)[i]);			\
	}								\
} while (0)
149 
150 /*
151  * Enable two u_int32_t numbers to share the space of a single u_int64_t.
152  * This helps the write forwarding protocol to use fewer iovec segments
153  * and stay within the bounds on systems where IOV_MAX is a small number
154  * (e.g. 16 on Solaris).
155  */
typedef union {
	u_int64_t unum64;	/* Both values viewed as one 64-bit quantity. */
	u_int32_t unum32[2];	/* The two 32-bit values sharing that space. */
} wf_uint32_pair;
160 
161 /* Detect inconsistencies between view callback and site's gmdb. */
/* db_rep->partial is set, but the site's gmdb_flags lack SITE_VIEW. */
#define	PARTICIPANT_TO_VIEW(db_rep, site)				\
	((db_rep)->partial && !FLD_ISSET((site)->gmdb_flags, SITE_VIEW))
/* db_rep->partial is not set, but the site's gmdb_flags have SITE_VIEW. */
#define	VIEW_TO_PARTICIPANT(db_rep, site)				\
	(!(db_rep)->partial && FLD_ISSET((site)->gmdb_flags, SITE_VIEW))
166 
/*
 * Forward declarations.  Each typedef also introduces its (still incomplete)
 * struct tag, so later definitions can refer to these types by pointer.
 */
typedef struct __repmgr_connection REPMGR_CONNECTION;
typedef struct __repmgr_queue REPMGR_QUEUE;
typedef struct __queued_output QUEUED_OUTPUT;
typedef struct __repmgr_response REPMGR_RESPONSE;
typedef struct __repmgr_retry REPMGR_RETRY;
typedef struct __repmgr_runnable REPMGR_RUNNABLE;
typedef struct __repmgr_site REPMGR_SITE;
typedef struct __cond_waiters_table COND_WAITERS_TABLE;
177 
178 /* Current Group Membership DB format ID. */
179 #define	REPMGR_GMDB_FMT_VERSION		2
180 #define	REPMGR_GMDB_FMT_MIN_VERSION	1
181 
/*
 * Platform-neutral names for the socket, thread, mutex, condition variable,
 * response-waiter and scatter/gather vector types: Win32 handle types when
 * DB_WIN32 is defined, pthread/BSD-socket types otherwise.
 */
#ifdef DB_WIN32
typedef SOCKET socket_t;
typedef HANDLE thread_id_t;
typedef HANDLE mgr_mutex_t;
typedef HANDLE cond_var_t;

/* Note: a pointer here, whereas the non-Windows waiter_t is held by value. */
typedef COND_WAITERS_TABLE *waiter_t;
typedef WSABUF db_iovec_t;
#else
typedef int socket_t;
typedef pthread_t thread_id_t;
typedef pthread_mutex_t mgr_mutex_t;
typedef pthread_cond_t cond_var_t;
typedef pthread_cond_t waiter_t;
typedef struct iovec db_iovec_t;
#endif
198 
199 /*
200  * The (arbitrary) maximum number of outgoing messages we're willing to hold, on
201  * a queue per connection, waiting for TCP buffer space to become available in
202  * the kernel.  Rather than exceeding this limit, we simply discard additional
203  * messages (since this is always allowed by the replication protocol).
204  *    As a special dispensation, if a message is destined for a specific remote
205  * site (i.e., it's not a broadcast), then we first try blocking the sending
206  * thread, waiting for space to become available (though we only wait a limited
207  * time).  This is so as to be able to handle the immediate flood of (a
208  * potentially large number of) outgoing messages that replication generates, in
209  * a tight loop, when handling PAGE_REQ, LOG_REQ and ALL_REQ requests.
210  */
211 #define	OUT_QUEUE_LIMIT	10
212 
213 /*
214  * The system value is available from sysconf(_SC_HOST_NAME_MAX).
215  */
216 #ifndef MAXHOSTNAMELEN
217 #define	MAXHOSTNAMELEN	1025
218 #endif
219 
220 /* A buffer big enough for the string "site host.domain.com:65535". */
221 #define	MAX_SITE_LOC_STRING (MAXHOSTNAMELEN+20)
222 typedef char SITE_STRING_BUFFER[MAX_SITE_LOC_STRING+1];
223 
224 #define	MAX_MSG_BUF	(__REPMGR_MAXMSG_SIZE + MAXHOSTNAMELEN + 1)
225 
/* Default timeout values, in microseconds (N seconds * US_PER_SEC). */
227 #define	DB_REPMGR_DEFAULT_ACK_TIMEOUT		(1 * US_PER_SEC)
228 #define	DB_REPMGR_DEFAULT_CONNECTION_RETRY	(30 * US_PER_SEC)
229 #define	DB_REPMGR_DEFAULT_ELECTION_RETRY	(10 * US_PER_SEC)
230 #define	DB_REPMGR_DEFAULT_CHANNEL_TIMEOUT	(5 * US_PER_SEC)
231 
232 /* Default preferred master automatic configuration values. */
233 #define	DB_REPMGR_PREFMAS_ELECTION_RETRY	(1 * US_PER_SEC)
234 #define	DB_REPMGR_PREFMAS_HEARTBEAT_MONITOR	(2 * US_PER_SEC)
235 #define	DB_REPMGR_PREFMAS_HEARTBEAT_SEND	(75 * (US_PER_SEC / 100))
236 #define	DB_REPMGR_PREFMAS_PRIORITY_CLIENT	75
237 #define	DB_REPMGR_PREFMAS_PRIORITY_MASTER	200
238 
/* Defaults for the (undocumented) incoming queue size limit, in bytes. */
240 #define	DB_REPMGR_DEFAULT_INQUEUE_MAX		(100 * MEGABYTE)
241 #define	DB_REPMGR_INQUEUE_REDZONE_PERCENT	85
242 
243 typedef TAILQ_HEAD(__repmgr_conn_list, __repmgr_connection) CONNECTION_LIST;
244 typedef STAILQ_HEAD(__repmgr_out_q_head, __queued_output) OUT_Q_HEADER;
245 typedef TAILQ_HEAD(__repmgr_retry_q, __repmgr_retry) RETRY_Q_HEADER;
246 
247 /* Information about threads managed by Replication Framework. */
struct __repmgr_runnable {
	ENV *env;
	thread_id_t thread_id;
	/* Function pointer with a thread start-routine style signature. */
	void *(*run) __P((void *));
	int finished;		/* Boolean: thread is exiting, may be joined. */
	int quit_requested;	/* Boolean: thread has been asked to quit. */
#ifdef DB_WIN32
	HANDLE quit_event;
#endif
	/* Per-thread-type arguments; only one member is meaningful. */
	union {

/*
 * Options governing requested behavior of election thread.
 */
#define	ELECT_F_CLIENT_RESTART	0x01 /* Do client restarts but no elections. */
#define	ELECT_F_EVENT_NOTIFY	0x02 /* Notify application of master failure. */
#define	ELECT_F_FAST		0x04 /* First election "fast" (n-1 trick). */
#define	ELECT_F_IMMED		0x08 /* Start with immediate election. */
#define	ELECT_F_INVITEE		0x10 /* Honor (remote) inviter's nsites. */
#define	ELECT_F_STARTUP		0x20 /* Observe repmgr_start() policy. */
		u_int32_t flags;	/* ELECT_F_* bits, for election thread. */

		/* For connector thread. */
		struct {
			int eid;
#define	CONNECT_F_REFRESH	0x01 /* New connection to replace old one. */
			u_int32_t flags;	/* CONNECT_F_* bits. */
		} conn_th;

		/*
		 * Args for other thread types can be added here in the future
		 * as needed.
		 */
	} args;
};
283 
284 /*
285  * Information about pending connection establishment retry operations.
286  *
287  * We keep these in order by time.  This works, under the assumption that the
288  * DB_REP_CONNECTION_RETRY never changes once we get going (though that
289  * assumption is of course wrong, so this needs to be fixed).
290  *
291  * Usually, we put things onto the tail end of the list.  But when we add a new
292  * site while threads are running, we trigger its first connection attempt by
293  * scheduling a retry for "0" microseconds from now, putting its retry element
294  * at the head of the list instead.
295  *
296  * TODO: I think this can be fixed by defining "time" to be the time the element
297  * was added (with some convention like "0" meaning immediate), rather than the
298  * deadline time.
299  */
struct __repmgr_retry {
	TAILQ_ENTRY(__repmgr_retry) entries;	/* Linkage in RETRY_Q_HEADER. */
	int eid;		/* Site whose connection is to be retried. */
	db_timespec time;	/* Deadline at which the retry is due. */
};
305 
306 /*
307  * We use scatter/gather I/O for both reading and writing.  Repmgr messages
308  * (including rep messages) use 3 segments: envelope, control and rec.
309  * Application messages can have any number of segments (the number they
310  * specify, plus 1 for our envelope).  REPMGR_IOVECS_ALLOC_SZ should (only) be
311  * used when n > 3.
312  */
/*
 * Bytes to allocate for a REPMGR_IOVECS with room for n vectors.  The base
 * struct already contains MIN_IOVEC of them, so only the excess is added.
 */
#define	REPMGR_IOVECS_ALLOC_SZ(n) \
	(sizeof(REPMGR_IOVECS) + ((n) - MIN_IOVEC) * sizeof(db_iovec_t))
typedef struct {
	/*
	 * Index of the first iovec to be used.  Initially of course this is
	 * zero.  But as we progress through partial I/O transfers, it ends up
	 * pointing to the first iovec to be used on the next operation.
	 */
	int offset;

	/*
	 * Total number of pieces defined for this message; equal to the number
	 * of times add_buffer and/or add_dbt were called to populate it.  We do
	 * *NOT* revise this as we go along.  So subsequent I/O operations must
	 * use count-offset to get the number of active vector pieces still
	 * remaining.
	 */
	int count;

	/*
	 * Total number of bytes accounted for in all the pieces of this
	 * message.  We do *NOT* revise this as we go along.
	 */
	size_t total_bytes;

	/* Number of vectors built into the struct itself. */
#define	MIN_IOVEC	3
	db_iovec_t vectors[MIN_IOVEC];	/* Variable length array. */
} REPMGR_IOVECS;
341 
/*
 * A message held in one reference-counted buffer.  The data member is
 * declared with a single element but is treated as a variable size area.
 */
typedef struct {
	size_t length;		/* number of bytes in data */
	int ref_count;		/* # of sites' send queues pointing to us */
	u_int8_t data[1];	/* variable size data area */
} REPMGR_FLAT;
347 
/* One element of an OUT_Q_HEADER list (a connection's outbound queue). */
struct __queued_output {
	STAILQ_ENTRY(__queued_output) entries;
	REPMGR_FLAT *msg;	/* The (shared, ref-counted) message. */
	size_t offset;		/* NOTE(review): presumably bytes of msg */
				/* already sent -- confirm with the writer. */
};
353 
354 /*
355  * The following is for input.  Once we know the sizes of the pieces of an
356  * incoming message, we can create this struct (and also the data areas for the
357  * pieces themselves, in the same memory allocation).  This is also the struct
358  * in which the message lives while it's waiting to be processed by message
359  * threads.
360  */
typedef struct __repmgr_message {
	STAILQ_ENTRY(__repmgr_message) entries;	/* Queue linkage. */
	size_t size;
	__repmgr_msg_hdr_args msg_hdr;
	/*
	 * NOTE(review): which variant is valid is presumably determined by the
	 * message type recorded in msg_hdr -- confirm against the reader.
	 */
	union {
		struct {
			int originating_eid;
			DBT control, rec;
		} repmsg;
		struct {
			REPMGR_CONNECTION *conn;
			DBT request;
		} gmdb_msg;
		struct {
			/*
			 * Connection from which the message arrived; NULL if
			 * generated on the local site.
			 */
			REPMGR_CONNECTION *conn;

			DBT buf; /* for reading */
			DBT segments[1]; /* expanded in msg th. before callbk */
		} appmsg;
	} v;			/* Variants */
} REPMGR_MESSAGE;
386 
/* Which part of an incoming message a connection is currently reading. */
typedef enum {
	SIZES_PHASE = 0,
	DATA_PHASE = 1
} phase_t;
391 
/* Kind of peer on a connection; unknown until it has been identified. */
typedef enum {
	APP_CONNECTION = 0,
	REP_CONNECTION = 1,
	UNKNOWN_CONN_TYPE = 2
} conn_type_t;
397 
/* SSL bookkeeping types; compiled only when HAVE_REPMGR_SSL is defined. */
#if defined(HAVE_REPMGR_SSL)
/*
 * NOTE(review): appears to record a write buffer and a pending-write flag
 * guarded by rmgr_ssl_write_mutex -- confirm against the SSL write path.
 */
typedef struct __repmgr_ssl_write_sync_info {
	char		*rmgr_ssl_wbuf_ptr;
	int		rmgr_ssl_wbuf_length;
	int		rmgr_ssl_write_pending;
	mgr_mutex_t	*rmgr_ssl_write_mutex;
} REPMGR_SSL_WRITE_INFO;

/* Per-connection SSL state, referenced from the connection struct. */
typedef struct __repmgr_ssl__conn_info {
	/* Only one thread can be interacting with the SSL object at a time. */
	mgr_mutex_t	*repmgr_ssl_conn_mutex;
	SSL *ssl;

#define	REPMGR_SSL_READ_PENDING_ON_READ		0x1
#define	REPMGR_SSL_READ_PENDING_ON_WRITE	0x2
#define	REPMGR_SSL_WRITE_PENDING_ON_READ	0x4
#define	REPMGR_SSL_WRITE_PENDING_ON_WRITE	0x8
	int ssl_io_state;	/* REPMGR_SSL_*_PENDING_ON_* bits. */

	REPMGR_SSL_WRITE_INFO	*rmgr_ssl_wrinfo;
} REPMGR_SSL_CONN_INFO;
#endif
420 
/*
 * Per-connection state: the socket, protocol negotiation state, progress of
 * the message currently being read, queued output, and (for app channels)
 * bookkeeping for pending responses.
 */
struct __repmgr_connection {
	TAILQ_ENTRY(__repmgr_connection) entries;	/* List linkage. */

	socket_t fd;		/* The connection's socket. */
#ifdef DB_WIN32
	WSAEVENT event_object;
#endif

	/*
	 * Number of other structures referring to this conn struct.  This
	 * ref_count must be reduced to zero before this conn struct can be
	 * destroyed.  Referents include:
	 *
	 * - the select() loop, which owns the right to do all reading, as well
	 *   as the exclusive right to eventually close the socket
	 *
	 * - a "channel" that owns this APP_CONNECTION (on the originating side)
	 *
	 * - a message received on this APP_CONNECTION, queued for processing
	 *
	 * - any writer blocked on waiting for the outbound queue to drain
	 */
	u_int32_t	ref_count;

	conn_type_t type;	/* App, rep, or not yet known. */
	u_int32_t version;	/* Wire protocol version on this connection. */
				/* (0 means not yet determined.) */

/*
 * When we make an outgoing connection, it starts in CONNECTED state.  When we
 * get the response to our version negotiation, we move to READY.
 *     For incoming connections that we accept, we start in NEGOTIATE, then to
 * PARAMETERS, and then to READY.
 *     CONGESTED is a hierarchical substate of READY: it's just like READY, with
 * the additional wrinkle that we don't bother waiting for the outgoing queue to
 * drain in certain circumstances.
 */
#define	CONN_CONGESTED	1	/* Long-lived full outgoing queue. */
#define	CONN_CONNECTED	2	/* Awaiting reply to our version negotiation. */
#define	CONN_DEFUNCT	3	/* Basically dead, awaiting clean-up. */
#define	CONN_NEGOTIATE	4	/* Awaiting version proposal. */
#define	CONN_PARAMETERS	5	/* Awaiting parameters handshake. */
#define	CONN_READY	6	/* Everything's fine. */
	int state;		/* One of the CONN_* values above. */
	u_int32_t auto_takeover;/* Connection to remote listener candidate. */

	/*
	 * Input: while we're reading a message, we keep track of what phase
	 * we're in.  In both phases, we use a REPMGR_IOVECS to keep track of
	 * our progress within the phase.  Depending upon the message type, we
	 * end up with either a rep_message (which is a wrapper for the control
	 * and rec DBTs), or a single generic DBT.
	 *     Any time we're in DATA_PHASE, it means we have already received
	 * the message header (consisting of msg_type and 2 sizes), and
	 * therefore we have allocated buffer space to read the data.  (This is
	 * important for resource clean-up.)
	 */
	phase_t		reading_phase;
	REPMGR_IOVECS iovecs;

	u_int8_t	msg_type;
	u_int8_t	msg_hdr_buf[__REPMGR_MSG_HDR_SIZE];

	union {
		REPMGR_MESSAGE *rep_message;
		struct {
			DBT cntrl, rec;
		} repmgr_msg;
	} input;

	/*
	 * Output: usually we just simply write messages right in line, in the
	 * send() function's thread.  But if TCP doesn't have enough network
	 * buffer space for us when we first try it, we instead allocate some
	 * memory, and copy the message, and then send it as space becomes
	 * available in our main select() thread.  In some cases, if the queue
	 * gets too long we wait until it's drained, and then append to it.
	 * This condition variable's associated mutex is the normal per-repmgr
	 * db_rep->mutex, because that mutex is always held anyway whenever the
	 * output queue is consulted.
	 */
	OUT_Q_HEADER outbound_queue;
	int out_queue_length;
	cond_var_t drained;

	/* =-=-=-=-= app-channel stuff =-=-=-=-= */
	waiter_t	response_waiters;

	/*
	 * Array of info about pending responses to requests.  This info is here
	 * (rather than on the stack of the thread calling send_request())
	 * because it provides an easy way to allocate available numbers for
	 * message tags, and also so that we can easily find the right info when
	 * we get the tag back in the msg header of the response.
	 */
	REPMGR_RESPONSE *responses;
	u_int32_t	aresp;	/* Array size. */
	u_int32_t	cur_resp; /* Index of response currently reading. */

	/* =-=-=-=-= for normal repmgr connections =-=-=-=-= */
	/*
	 * Generally on a REP_CONNECTION type, we have an associated EID (which
	 * is an index into the sites array, by the way).  When we initiate the
	 * connection ("outgoing"), we know from the start what the EID is; the
	 * connection struct is linked from the site struct.  On the other hand,
	 * when we receive an incoming connection, we don't know at first what
	 * site it may be associated with (or even whether it's an
	 * APP_CONNECTION or REP_CONNECTION, for that matter).  During that
	 * initial uncertain time, the eid is -1.  Also, when a connection
	 * becomes defunct, but the conn struct hasn't yet been destroyed, the
	 * eid also becomes -1.
	 *
	 * The eid should be -1 if and only if the connection is on the orphans
	 * list.
	 */
	int eid;

#if defined(HAVE_REPMGR_SSL)
	/* For ssl connection information. */
	REPMGR_SSL_CONN_INFO	*repmgr_ssl_info;
#endif

	ENV	*env;
};
545 
/* True for CONN_READY and for its substate CONN_CONGESTED. */
#define	IS_READY_STATE(s)	((s) == CONN_READY || (s) == CONN_CONGESTED)
547 
/*
 * ADDRINFO and ACCEPT_ADDR: aliases for the system's addrinfo and
 * sockaddr_storage when HAVE_GETADDRINFO is defined; otherwise a private
 * addrinfo look-alike and sockaddr_in.
 */
#ifdef HAVE_GETADDRINFO
typedef struct addrinfo	ADDRINFO;
typedef struct sockaddr_storage ACCEPT_ADDR;
#else
typedef struct sockaddr_in ACCEPT_ADDR;
/*
 * Some windows platforms have getaddrinfo (Windows XP), some don't.  We don't
 * support conditional compilation in our Windows build, so we always use our
 * own getaddrinfo implementation.  Rename everything so that we don't collide
 * with the system libraries.
 */
#undef	AI_PASSIVE
#define	AI_PASSIVE	0x01
#undef	AI_CANONNAME
#define	AI_CANONNAME	0x02
#undef	AI_NUMERICHOST
#define	AI_NUMERICHOST	0x04

typedef struct __addrinfo {
	int ai_flags;		/* AI_PASSIVE, AI_CANONNAME, AI_NUMERICHOST */
	int ai_family;		/* PF_xxx */
	int ai_socktype;	/* SOCK_xxx */
	int ai_protocol;	/* 0 or IPPROTO_xxx for IPv4 and IPv6 */
	size_t ai_addrlen;	/* length of ai_addr */
	char *ai_canonname;	/* canonical name for nodename */
	struct sockaddr *ai_addr;	/* binary address */
	struct __addrinfo *ai_next;	/* next structure in linked list */
} ADDRINFO;
#endif /* HAVE_GETADDRINFO */
577 
578 /*
579  * Unprocessed network address configuration.
580  */
581 typedef struct {
582 	roff_t host;		/* Separately allocated copy of string. */
583 	u_int16_t port;		/* Stored in plain old host-byte-order. */
584 } SITEADDR;
585 
586 /*
587  * Site information, as stored in shared region.
588  */
589 typedef struct {
590 	SITEADDR addr;		/* Unprocessed network address of site. */
591 	u_int32_t config;	/* Configuration flags: peer, helper, etc. */
592 	u_int32_t status;	/* Group membership status. */
593 	u_int32_t flags;	/* Group membership flags. */
594 	u_int32_t listener_cand;/* Number of listener candidates of site. */
595 } SITEINFO;
596 
597 /*
598  * A site address, as stored locally.
599  */
600 typedef struct {
601 	char *host;		/* Separately allocated copy of string. */
602 	u_int16_t port;		/* Stored in plain old host-byte-order. */
603 } repmgr_netaddr_t;
604 
605 /*
606  * We store site structs in a dynamically allocated, growable array, indexed by
607  * EID.  We allocate EID numbers for all sites simply according to their
608  * index within this array.
609  */
/*
 * EID <-> site helpers.  All but IS_VALID_EID refer to a variable named
 * "db_rep" that must be in scope at the point of use.  Every macro argument
 * is parenthesized so that any expression may be passed safely.
 */

/* Address of the site struct stored at index eid. */
#define	SITE_FROM_EID(eid)	(&db_rep->sites[(eid)])
/* Inverse of SITE_FROM_EID: the array index of site pointer s. */
#define	EID_FROM_SITE(s)	((int)((s) - (&db_rep->sites[0])))
/* Non-negative check only; does not compare against site_cnt. */
#define	IS_VALID_EID(e)		((e) >= 0)
/* e is within [0, site_cnt) and is not the local site's own EID. */
#define	IS_KNOWN_REMOTE_SITE(e)	((e) >= 0 && ((e) != db_rep->self_eid) && \
	    (((u_int)(e)) < db_rep->site_cnt))
/*
 * Loop header: steps the integer lvalue i through every index in
 * [0, site_cnt) except self_eid.  The start value skips index 0 when that is
 * self_eid; the increment steps a second time when it lands on self_eid.
 */
#define	FOR_EACH_REMOTE_SITE_INDEX(i)					\
	for ((i) = (db_rep->self_eid == 0 ? 1 : 0);			\
	     ((u_int)(i)) < db_rep->site_cnt;				\
	     (int)(++(i)) == db_rep->self_eid ? ++(i) : (i))
619 
620 /*
621  * Enable replication manager auto listener takeover.
622  */
623 #define	HAVE_REPLICATION_LISTENER_TAKEOVER	1
624 
625 /* Listener candidate, that is subordinate rep-aware process. */
/*
 * True when REP_C_AUTOTAKEOVER is configured in the region, this process is
 * a subordinate (IS_SUBORDINATE), and repmgr_status is "running".
 */
#define	IS_LISTENER_CAND(db_rep)					\
	(FLD_ISSET((db_rep)->region->config, REP_C_AUTOTAKEOVER) &&	\
	    IS_SUBORDINATE(db_rep) && (db_rep)->repmgr_status == running)
629 
630 /*
631  * The number of listener candidates for each remote site is maintained in
632  * the listener process and used in subordinate rep-aware processes.
633  */
/*
 * Apply "op" to sites[eid].listener_cand under rep->mtx_repmgr, but only if
 * REP_C_AUTOTAKEOVER is configured, this process is not a subordinate, and
 * "cond" is true.
 *
 * "op" is pasted textually after the parenthesized field, so it must be a
 * token sequence that completes an expression there.  The macro relies on
 * variables named env, rep, db_rep, sites and eid being in scope at the call
 * site, and it overwrites "sites".
 */
#define	SET_LISTENER_CAND(cond, op)					\
	do {								\
		if (FLD_ISSET(rep->config, REP_C_AUTOTAKEOVER) &&	\
		    !IS_SUBORDINATE(db_rep) && (cond)) {		\
			MUTEX_LOCK(env, rep->mtx_repmgr);		\
			sites = R_ADDR(env->reginfo, rep->siteinfo_off);\
			(sites[eid].listener_cand)op;			\
			MUTEX_UNLOCK(env, rep->mtx_repmgr);		\
		}							\
	} while (0)
644 
/*
 * If this process is a listener candidate (IS_LISTENER_CAND), test
 * sites[eid].listener_cand with "op" under rep->mtx_repmgr and assign tval or
 * fval to "val" accordingly.  Otherwise "val" is left untouched.
 *
 * "op" is pasted textually after the parenthesized field (a comparison
 * operator plus operand fits).  The macro relies on variables named env, rep,
 * db_rep, sites and eid being in scope at the call site, and it overwrites
 * "sites".
 */
#define	CHECK_LISTENER_CAND(val, op, tval, fval)			\
	do {								\
		if (IS_LISTENER_CAND(db_rep)) {				\
			MUTEX_LOCK(env, rep->mtx_repmgr);		\
			sites = R_ADDR(env->reginfo, rep->siteinfo_off);\
			val = ((sites[eid].listener_cand)op) ?		\
			    (tval) : (fval);				\
			MUTEX_UNLOCK(env, rep->mtx_repmgr);		\
		}							\
	} while (0)
655 
/* Locally held information about one site; stored in db_rep->sites[]. */
struct __repmgr_site {
	repmgr_netaddr_t net_addr;	/* Host and port of the site. */

	/*
	 * Group membership status: a copy of the status from the membership
	 * database, or the out-of-band value 0, meaning that it doesn't exist.
	 * We keep track of a "non-existent" site because the associated
	 * host/port network address is promised to be associated with the
	 * locally known EID for the life of the environment.
	 */
	u_int32_t	membership; /* Status value from GMDB. */
	u_int32_t	gmdb_flags; /* Flags from GMDB. */
	u_int32_t	config;	    /* Flags from site->set_config() */

	/*
	 * Everything below here is applicable only to remote sites.
	 */
	u_int32_t max_ack_gen;	/* Master generation for max_ack. */
	DB_LSN max_ack;		/* Best ack we've heard from this site. */
	DB_LSN max_ckp_lsn;	/* Best ckp_lsn we've heard from this site. */
	int ack_policy;		/* Or 0 if unknown. */
	u_int16_t alignment;	/* Requirements for app channel msgs. */
	db_timespec last_rcvd_timestamp;

	/* Contents depend on state (see the SITE_* state values below). */
	struct {
		struct {		 /* when CONNECTED */
			/*
			 * The only time we ever have two connections is in case
			 * of a "collision" on the "server" side.  In that case,
			 * the incoming connection either will be closed
			 * promptly by the remote "client", or it is a half-open
			 * connection due to the remote client system having
			 * crashed and rebooted, in which case KEEPALIVE will
			 * eventually clear it.
			 */
			REPMGR_CONNECTION *in; /* incoming connection */
			REPMGR_CONNECTION *out; /* outgoing connection */
		} conn;
		REPMGR_RETRY *retry; /* when PAUSING */
		/* Unused when CONNECTING. */
	} ref;

	/*
	 * Subordinate connections (connections from subordinate processes at a
	 * multi-process site).  Note that the SITE_CONNECTED state, and all the
	 * ref.retry stuff above is irrelevant to subordinate connections.  If a
	 * connection is on this list, it exists; and we never bother trying to
	 * reconnect lost connections (indeed we can't, for these are always
	 * incoming-only).
	 */
	CONNECTION_LIST	sub_conns;
	REPMGR_RUNNABLE	*connector;	/* Thread to open a connection. */

#define	SITE_CONNECTED 1	/* We have a (main) connection. */
#define	SITE_CONNECTING 2	/* Trying to establish (main) connection. */
#define	SITE_IDLE 3		/* Doing nothing. */
#define	SITE_PAUSING 4		/* Waiting til time to retry connecting. */
	int state;		/* One of the four state values above. */

#define	SITE_HAS_PRIO	0x01	/* Set if "electable" flag bit is valid. */
#define	SITE_ELECTABLE	0x02
#define	SITE_TOUCHED	0x04	/* Seen GMDB record during present scan. */
	u_int32_t flags;	/* The three flag bits above. */
};
721 
722 /*
723  * Flag values for the public DB_SITE handle.
724  */
725 #define	DB_SITE_PREOPEN	0x01	/* Provisional EID; may change at env open. */
726 
/*
 * Bookkeeping for one pending response to a request; the connection struct
 * keeps an array of these ("responses").
 */
struct __repmgr_response {
	DBT		dbt;
	int		ret;

#define	RESP_COMPLETE		0x01
#define	RESP_DUMMY_BUF		0x02
#define	RESP_IN_USE		0x04
#define	RESP_READING		0x08
#define	RESP_THREAD_WAITING	0x10
	u_int32_t	flags;	/* RESP_* bits. */
};
738 
739 /*
740  * Private structure for managing comms "channels."  This is separate from
741  * DB_CHANNEL so as to avoid dragging in other private structures (e.g.,
742  * REPMGR_CONNECTION) into db.h, similar to the relationship between DB_ENV and
743  * ENV.
744  */
struct __channel {
	DB_CHANNEL *db_channel;	/* The public handle this belongs to. */
	ENV *env;

	/* Connection(s) used by the channel; the member depends on its kind. */
	union {
		/* For simple, specific-EID channels. */
		REPMGR_CONNECTION *conn;

		/* For EID_MASTER or EID_BROADCAST channels. */
		struct {
			mgr_mutex_t *mutex;  /* For connection establishment. */
			REPMGR_CONNECTION **array;
			u_int32_t cnt;
		} conns;
	} c;
	REPMGR_MESSAGE *msg;	/* Incoming channel only; NULL otherwise. */
	int	responded;	/* Boolean flag. */
	__repmgr_msg_metadata_args *meta;

	/* Used only in send-to-self request case. */
	struct __repmgr_response	response;
};
767 
768 /*
769  * Repmgr keeps track of references to connection information (instances
770  * of struct __repmgr_connection).  There are three kinds of places
771  * connections may be found: (1) SITE->ref.conn, (2) SITE->sub_conns, and
772  * (3) db_rep->connections.
773  *
774  * 1. SITE->ref.conn points to our connection with the listener process
775  * running at the given site, if such a connection exists.  We may have
776  * initiated the connection to the site ourselves, or we may have received
777  * it as an incoming connection.  Once it is established there is very
778  * little difference between those two cases.
779  *
780  * 2. SITE->sub_conns is a list of connections we have with subordinate
781  * processes running at the given site.  There can be any number of these
782  * connections, one per subordinate process.  Note that these connections
783  * are always incoming: there's no way for us to initiate this kind of
784  * connection because subordinate processes do not "listen".
785  *
786  * 3. The db_rep->connections list contains the references to any
787  * connections that are not actively associated with any site (we
788  * sometimes call these "orphans").  There are two times when this can
789  * be:
790  *
791  *   a) When we accept an incoming connection, we don't know what site it
792  *      comes from until we read the initial handshake message.
793  *
794  *   b) When an error occurs on a connection, we first mark it as DEFUNCT
795  *      and stop using it.  Then, at a later, well-defined time, we close
796  *      the connection's file descriptor and get rid of the connection
797  *      struct.
798  *
799  * In light of the above, we can see that the following describes the
800  * rules for how connections may be moved among these three kinds of
801  * "places":
802  *
803  * - when we initiate an outgoing connection, we of course know what site
804  *   it's going to be going to, and so we immediately put the pointer to
805  *   the connection struct into SITE->ref.conn
806  *
807  * - when we accept an incoming connection, we don't immediately know
808  *   whom it's from, so we have to put it on the orphans list
809  *   (db_rep->connections).
810  *
811  * - (incoming, cont.) But as soon as we complete the initial "handshake"
812  *   message exchange, we will know which site it's from and whether it's
813  *   a subordinate or main connection.  At that point we remove it from
814  *   db_rep->connections and either point to it by SITE->ref.conn, or add
815  *   it to the SITE->sub_conns list.
816  *
817  * - (for any active connection) when an error occurs, we move the
818  *   connection to the orphans list until we have a chance to close it.
819  */
820 
821 /*
822  * Repmgr message formats.
823  *
824  * Declarative definitions of current message formats appear in repmgr.msg.
825  * (The s_message/gen_msg.awk utility generates C code.)  In general, we send
826  * the buffers marshaled from those structure formats in the "control" portion
827  * of a message.
828  *
829  * Each message is prefaced by a 9-byte message header (as described in
830  * repmgr_net.c).  Different message types use the two available 32-bit integers
831  * in different ways, as codified here:
832  */
/*
 * Accessors for the two 32-bit integers of the message header.  The argument
 * is a header struct itself (not a pointer to one); each macro expands to a
 * parenthesized member access.
 */
#define	REPMGR_HDR1(hdr)		((hdr).word1)
#define	REPMGR_HDR2(hdr)		((hdr).word2)

/*
 * The names below are per-message-type aliases for the two accessors above
 * and are invoked the same way, with the header as the argument.
 */

/* REPMGR_APP_MESSAGE */
#define	APP_MSG_BUFFER_SIZE		REPMGR_HDR1
#define	APP_MSG_SEGMENT_COUNT		REPMGR_HDR2

/* REPMGR_REP_MESSAGE and the other traditional repmgr message types. */
#define	REP_MSG_CONTROL_SIZE		REPMGR_HDR1
#define	REP_MSG_REC_SIZE		REPMGR_HDR2

/* REPMGR_APP_RESPONSE */
#define	APP_RESP_BUFFER_SIZE		REPMGR_HDR1
#define	APP_RESP_TAG			REPMGR_HDR2

/* REPMGR_RESP_ERROR.  Note that a zero-length message body is implied. */
#define	RESP_ERROR_CODE			REPMGR_HDR1
#define	RESP_ERROR_TAG			REPMGR_HDR2

/* REPMGR_OWN_MSG */
#define	REPMGR_OWN_BUF_SIZE		REPMGR_HDR1
#define	REPMGR_OWN_MSG_TYPE		REPMGR_HDR2
855 
856 /*
857  * Flags for the handshake message.  As with repmgr message types, these values
858  * are transmitted between sites, and must therefore be "frozen" permanently.
859  * Names are alphabetized here for easy reference, but values reflect historical
860  * usage.
861  */
#define	APP_CHANNEL_CONNECTION	0x02	/* Connection used for app channel. */
/* NOTE(review): presumably marks the sending site as electable; confirm. */
#define	ELECTABLE_SITE		0x04
#define	REPMGR_AUTOTAKEOVER	0x08	/* Could become main connection. */
#define	REPMGR_SUBORDINATE	0x01	/* This is a subordinate connection. */

/*
 * Flags for application-message meta-data.  Each value is a distinct bit.
 */
#define	REPMGR_MULTI_RESP	0x01
#define	REPMGR_REQUEST_MSG_TYPE	0x02
#define	REPMGR_RESPONSE_LIMIT	0x04
873 
874 /*
875  * Legacy V1 handshake message format.  This is still used in the repmgr
876  * protocol for the initial handshake sent as part of version negotiation
877  * upon connection establishment.  It is no longer needed for V1
878  * compatibility because 4.6/V1 is no longer supported.
879  */
typedef struct {
	u_int32_t version;
	u_int16_t port;
	/*
	 * NOTE(review): with natural alignment a compiler will typically
	 * insert 2 bytes of padding between the 16-bit port and the 32-bit
	 * priority.  Confirm that this struct is marshaled field by field
	 * rather than copied to the wire as raw memory.
	 */
	u_int32_t priority;
} DB_REPMGR_V1_HANDSHAKE;
885 
886 /*
887  * Storage formats.
888  *
889  * As with message formats, stored formats are defined in repmgr.msg.
890  */
891 /*
892  * Status values for the Group Membership data portion of a record.  Like
893  * message type codes, these values are frozen across releases, in order to
894  * avoid pointless churn.  These values are mutually exclusive.
895  */
#define	SITE_ADDING	0x01
#define	SITE_DELETING	0x02
#define	SITE_PRESENT	0x04

/*
 * Flags for the Group Membership data portion of a record.  These values are
 * also frozen across releases.  These values are bit fields and may be OR'ed
 * together.
 *
 * Note that the flag values below reuse the numeric values 0x01 and 0x02 of
 * the status codes above, so a status and a flag cannot be told apart by
 * value alone.
 */
#define	SITE_VIEW		0x01
#define	SITE_JOIN_ELECTABLE	0x02
906 
/*
 * True for the message types whose processing could take a long time:
 * application messages and repmgr's own messages.  We take care not to tie
 * up every message processing thread with these, so that the more important
 * rep messages are not starved.  The argument is evaluated more than once.
 */
#define	IS_DEFERRABLE(t)						\
	((t) == REPMGR_APP_MESSAGE || (t) == REPMGR_OWN_MSG)
/*
 * When using leases there are times when a thread processing a message
 * must block, waiting for leases to be refreshed.  But refreshing the
 * leases requires another thread to accept the lease grant messages.
 * This macro therefore yields 2 when leases are in use, and 1 otherwise.
 */
#define	RESERVED_MSG_TH(env) (!(IS_USING_LEASES(env)) ? 1 : 2)
919 
/*
 * True when the given db_rep has no listening socket (subordinate processes
 * do not "listen").  The argument is parenthesized so that any pointer
 * expression, such as &x or p + i, can be passed safely.
 */
#define	IS_SUBORDINATE(db_rep)	((db_rep)->listen_fd == INVALID_SOCKET)
921 
/*
 * True when ack policy p is DB_REPMGR_ACKS_ALL_PEERS, DB_REPMGR_ACKS_ONE_PEER
 * or DB_REPMGR_ACKS_QUORUM.  The argument may be evaluated up to three times,
 * so it must be free of side effects.
 */
#define	IS_PEER_POLICY(p)						\
	((p) == DB_REPMGR_ACKS_ALL_PEERS ||				\
	(p) == DB_REPMGR_ACKS_ONE_PEER ||				\
	(p) == DB_REPMGR_ACKS_QUORUM)
925 
926 /*
927  * Most of the code in repmgr runs while holding repmgr's main mutex, which
928  * resides in db_rep->mutex.  This mutex is owned by a single repmgr process,
929  * and serializes access to the (large) critical sections among threads in the
930  * process.  Unlike many other mutexes in DB, it is specifically coded as either
 * a POSIX threads mutex or a Win32 mutex.  Note that although the code run
 * under this mutex is a large fraction of the code, it accounts for a tiny
 * fraction of the time: repmgr spends most of its time in a call to
 * select(), and a bit more in calls into the Base replication API.  All of
 * those release the mutex.
 *     Access to repmgr's shared values is protected by another mutex:
 * mtx_repmgr.  And, when changing the space allocation for the site list,
 * we conform to the convention of acquiring renv->mtx_regenv.  These are
 * less frequent, of course.
939  *     When it's necessary to acquire more than one of these mutexes, the
940  * ordering priority (or "lock ordering protocol") is:
941  *        db_rep->mutex (first)
942  *        mtx_repmgr    (briefly)
943  *        mtx_regenv    (last, and most briefly)
944  *
945  * There are also mutexes for app message "channels".  Each channel has a mutex,
946  * which is used to serialize any connection re-establishment that may become
947  * necessary during its lifetime (such as when a master changes).  This never
948  * happens on a simple, specific-EID channel, but in other cases multiple app
949  * threads could be making send_xxx() calls concurrently, and it would not do to
950  * have two of them try to re-connect concurrently.
951  *     When re-establishing a connection, the channel lock is held while
952  * grabbing first the mtx_repmgr, and then the db_rep mutex (but not both
953  * together).  I.e., we have:
954  *        channel->mutex (first)
955  *        [mtx_repmgr (very briefly)] and then [db_rep->mutex (very briefly)]
956  */
957 
/*
 * Lock mutex m by calling __repmgr_lock_mutex.  Beware the hidden control
 * flow: if that call yields non-zero, this macro executes
 * "return (DB_RUNRECOVERY)" in the calling function.
 */
#define	LOCK_MUTEX(m) do {						\
	if (__repmgr_lock_mutex(m) != 0) {				\
		return (DB_RUNRECOVERY);				\
	}								\
} while (0)
962 
/*
 * Unlock mutex m by calling __repmgr_unlock_mutex.  Beware the hidden
 * control flow: if that call yields non-zero, this macro executes
 * "return (DB_RUNRECOVERY)" in the calling function.
 */
#define	UNLOCK_MUTEX(m) do {						\
	if (__repmgr_unlock_mutex(m) != 0) {				\
		return (DB_RUNRECOVERY);				\
	}								\
} while (0)
967 
/*
 * POSIX/Win32 socket (and other) portability.
 *
 * Both branches supply the same set of names (WOULDBLOCK, net_errno,
 * sockopt_t, sendsocket, threadsync_timeout_t, REPMGR_INITED) so that the
 * rest of repmgr can be written once.
 */
#ifdef DB_WIN32
#define	WOULDBLOCK		WSAEWOULDBLOCK
/* DB_REPMGR_EAGAIN is deliberately left undefined on Win32. */
#undef	DB_REPMGR_EAGAIN

#define	net_errno		WSAGetLastError()
typedef int socklen_t;
typedef char * sockopt_t;
/* Winsock's send() takes its length argument as an int, hence the cast. */
#define	sendsocket(s, buf, len, flags) send((s), (buf), (int)(len), (flags))

/*
 * NOTE(review): presumably these map the POSIX struct iovec member names
 * onto the corresponding WSABUF members (len, buf); confirm against users.
 */
#define	iov_len len
#define	iov_base buf

typedef DWORD threadsync_timeout_t;

/* The argument is parenthesized so any pointer expression may be passed. */
#define	REPMGR_INITED(db_rep) ((db_rep)->signaler != NULL)
#else

/* Supply the Winsock-style names on POSIX systems. */
#define	INVALID_SOCKET		-1
#define	SOCKET_ERROR		-1
#define	WOULDBLOCK		EWOULDBLOCK
#define	DB_REPMGR_EAGAIN	EAGAIN

#define	net_errno		errno
typedef void * sockopt_t;

#define	sendsocket(s, buf, len, flags) send((s), (buf), (len), (flags))
#define	closesocket(fd)		close(fd)

typedef struct timespec threadsync_timeout_t;

/* The argument is parenthesized so any pointer expression may be passed. */
#define	REPMGR_INITED(db_rep) ((db_rep)->read_pipe >= 0)
#endif
1001 
/* True when the given db_rep has a non-NULL selector. */
#define	SELECTOR_RUNNING(db_rep)					\
	((db_rep)->selector != NULL)

/*
 * SSL is considered enabled for repmgr unless REP_C_DISABLE_SSL is set in
 * the config field of the environment's replication region.
 */
#define	IS_REPMGR_SSL_ENABLED(env)					\
	(FLD_ISSET((env)->rep_handle->region->config,			\
	    REP_C_DISABLE_SSL) == 0)
1006 
1007 /* Macros for SSL Diagnostic messages. */
1008 #if defined(HAVE_REPMGR_SSL)
1009 
/*
 * Each macro emits a diagnostic through VPRINT, but only when SSL is enabled
 * for repmgr.  The SHUTDOWN, CONNECT and ACCEPT variants use the
 * DB_VERB_REPMGR_SSL_CONN verbose flag; WRITE and READ use
 * DB_VERB_REPMGR_SSL_IO.  All of them also include DB_VERB_REPMGR_SSL_ALL.
 *
 * Each body is wrapped in do { } while (0) so that an invocation is exactly
 * one statement.  With a bare "if" in the expansion, a caller's "else"
 * following the macro would bind to the macro's hidden "if".
 */
#define	SSL_DEBUG_SHUTDOWN(env, format, ...) do {			\
	if (IS_REPMGR_SSL_ENABLED(env))					\
		VPRINT(env, (env,					\
		    DB_VERB_REPMGR_SSL_CONN|DB_VERB_REPMGR_SSL_ALL,	\
		    format, ##__VA_ARGS__));				\
} while (0)
#define	SSL_DEBUG_CONNECT(env, format, ...) do {			\
	if (IS_REPMGR_SSL_ENABLED(env))					\
		VPRINT(env, (env,					\
		    DB_VERB_REPMGR_SSL_CONN|DB_VERB_REPMGR_SSL_ALL,	\
		    format, ##__VA_ARGS__));				\
} while (0)
#define	SSL_DEBUG_ACCEPT(env, format, ...) do {				\
	if (IS_REPMGR_SSL_ENABLED(env))					\
		VPRINT(env, (env,					\
		    DB_VERB_REPMGR_SSL_CONN|DB_VERB_REPMGR_SSL_ALL,	\
		    format, ##__VA_ARGS__));				\
} while (0)
#define	SSL_DEBUG_WRITE(env, format, ...) do {				\
	if (IS_REPMGR_SSL_ENABLED(env))					\
		VPRINT(env, (env,					\
		    DB_VERB_REPMGR_SSL_IO|DB_VERB_REPMGR_SSL_ALL,	\
		    format, ##__VA_ARGS__));				\
} while (0)
#define	SSL_DEBUG_READ(env, format, ...) do {				\
	if (IS_REPMGR_SSL_ENABLED(env))					\
		VPRINT(env, (env,					\
		    DB_VERB_REPMGR_SSL_IO|DB_VERB_REPMGR_SSL_ALL,	\
		    format, ##__VA_ARGS__));				\
} while (0)
1035 
1036 #else
1037 
/*
 * No-op variants, used when repmgr is built without SSL support.  Each one
 * expands to an empty do { } while (0) rather than to nothing, so that an
 * invocation is still a single well-formed statement (no empty "if" body)
 * and its arguments are discarded without being evaluated.
 */
#define	SSL_DEBUG_SHUTDOWN(env, format, ...)	do { } while (0)
#define	SSL_DEBUG_CONNECT(env, format, ...)	do { } while (0)
#define	SSL_DEBUG_ACCEPT(env, format, ...)	do { } while (0)
#define	SSL_DEBUG_WRITE(env, format, ...)	do { } while (0)
#define	SSL_DEBUG_READ(env, format, ...)	do { } while (0)
1043 
1044 #endif
1045 
/*
 * Generic definition of some action to be performed on each connection, in the
 * form of a call-back function.
 *
 * The call-back receives the environment, the connection to act on, and an
 * untyped pointer, and returns an int.
 * NOTE(review): presumably the void * is caller-supplied context and the
 * return value is 0 or an error code; confirm against the callers.
 */
typedef int (*CONNECTION_ACTION) __P((ENV *, REPMGR_CONNECTION *, void *));
1051 
/*
 * Generic predicate to test a condition that a thread is waiting for.
 *
 * The predicate receives the environment and an untyped pointer, and returns
 * an int.
 * NOTE(review): presumably non-zero means the awaited condition holds and the
 * void * is caller-supplied context; confirm against the callers.
 */
typedef int (*PREDICATE) __P((ENV *, void *));
1056 
1057 #include "dbinc_auto/repmgr_ext.h"
1058 
1059 #if defined(__cplusplus)
1060 }
1061 #endif
1062 #endif /* !_DB_REPMGR_H_ */
1063