/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 2006, 2013 Oracle and/or its affiliates.  All rights reserved.
 *
 * $Id$
 */

#ifndef _DB_REPMGR_H_
#define	_DB_REPMGR_H_

#include "dbinc_auto/repmgr_automsg.h"

#if defined(__cplusplus)
extern "C" {
#endif

/*
 * Replication Manager message format types.  These few format codes identify
 * enough information to describe, at the lowest level, how a message should be
 * read from the wire, including how much memory should be allocated to hold the
 * result.  (Often we want to allocate more than just enough to hold the
 * received bytes, if we know that we will need more during processing.)
 *
 * These values are transmitted between sites, even sites running differing BDB
 * versions.  Therefore, once assigned, the values are permanently "frozen".
 *
 * For example, in repmgr wire protocol version 1 the highest assigned message
 * type value was 3, for REPMGR_REP_MESSAGE.  Wire protocol version 2 added the
 * HEARTBEAT message type (4).
 *
 * New message types added in later versions always get new (higher) values.  We
 * still list them in alphabetical order, for ease of reference.  But this
 * generally does not correspond to numerical order.
 */
#define	REPMGR_APP_MESSAGE	5	/* Msg sent from app. on DB_CHANNEL. */
#define	REPMGR_APP_RESPONSE	6	/* Response to a channel request. */
#define	REPMGR_HANDSHAKE	2	/* Connection establishment sequence. */
#define	REPMGR_HEARTBEAT	4	/* Monitor connection health. */
#define	REPMGR_OWN_MSG		8	/* Repmgr's own messages, to peers. */
#define	REPMGR_PERMLSN		1	/* My perm LSN. */
#define	REPMGR_REP_MESSAGE	3	/* Normal replication message. */
#define	REPMGR_RESP_ERROR	7	/* Sys-gen'd error resp to request. */

/*
 * Largest message type code known in each protocol version we support.  In
 * protocol version one there were only three message types: 1, 2, and 3; so
 * 3 was the max.  In protocol version 2 we introduced heartbeats, type 4.
 * (Protocol version 3 did not introduce any new message types.)  In version 4
 * we introduced a few more new message types, the largest of which had value 8.
 */
#define	REPMGR_MAX_V1_MSG_TYPE	3
#define	REPMGR_MAX_V2_MSG_TYPE	4
#define	REPMGR_MAX_V3_MSG_TYPE	4
#define	REPMGR_MAX_V4_MSG_TYPE	8
#define	HEARTBEAT_MIN_VERSION	2
#define	CHANNEL_MIN_VERSION	4
#define	CONN_COLLISION_VERSION	4
#define	GM_MIN_VERSION		4
#define	OWN_MIN_VERSION		4

/* The range of protocol versions we're willing to support. */
#define	DB_REPMGR_VERSION	4
#define	DB_REPMGR_MIN_VERSION	1
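
/*
 * Illustrative sketch (not part of the API): how a reader might bound the
 * acceptable message type codes by the wire protocol version negotiated on a
 * connection.  The helper name max_msg_type_for_version() is hypothetical.
 *
 *	static u_int32_t
 *	max_msg_type_for_version(u_int32_t version)
 *	{
 *		switch (version) {
 *		case 1:
 *			return (REPMGR_MAX_V1_MSG_TYPE);
 *		case 2:
 *			return (REPMGR_MAX_V2_MSG_TYPE);
 *		case 3:
 *			return (REPMGR_MAX_V3_MSG_TYPE);
 *		default:
 *			return (REPMGR_MAX_V4_MSG_TYPE);
 *		}
 *	}
 *
 * A message whose type code exceeds this bound for the connection's version
 * would be rejected rather than dispatched.
 */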

/*
 * For messages with the "REPMGR_OWN_MSG" format code, a message type (see
 * REPMGR_OWN_MSG_TYPE, below) is included in the header.  While at the lowest
 * level, the format codes identify only enough to read and allocate memory, at
 * the next higher level the following message type codes identify the content
 * of the message: how to unmarshal and dispatch it.
 *
 * Like the message format types, these message type values should be
 * permanently frozen.
 */
#define	REPMGR_CONNECT_REJECT	1
#define	REPMGR_GM_FAILURE	2
#define	REPMGR_GM_FORWARD	3
#define	REPMGR_JOIN_REQUEST	4
#define	REPMGR_JOIN_SUCCESS	5
#define	REPMGR_PARM_REFRESH	6
#define	REPMGR_REJOIN		7
#define	REPMGR_REMOVE_REQUEST	8
#define	REPMGR_REMOVE_SUCCESS	9
#define	REPMGR_RESOLVE_LIMBO	10
#define	REPMGR_SHARING		11

struct __repmgr_connection;
    typedef struct __repmgr_connection REPMGR_CONNECTION;
struct __repmgr_queue; typedef struct __repmgr_queue REPMGR_QUEUE;
struct __queued_output; typedef struct __queued_output QUEUED_OUTPUT;
struct __repmgr_response; typedef struct __repmgr_response REPMGR_RESPONSE;
struct __repmgr_retry; typedef struct __repmgr_retry REPMGR_RETRY;
struct __repmgr_runnable; typedef struct __repmgr_runnable REPMGR_RUNNABLE;
struct __repmgr_site; typedef struct __repmgr_site REPMGR_SITE;
struct __cond_waiters_table;
    typedef struct __cond_waiters_table COND_WAITERS_TABLE;

/* Current Group Membership DB format ID. */
#define	REPMGR_GMDB_FMT_VERSION	1

#ifdef DB_WIN32
typedef SOCKET socket_t;
typedef HANDLE thread_id_t;
typedef HANDLE mgr_mutex_t;
typedef HANDLE cond_var_t;

typedef COND_WAITERS_TABLE *waiter_t;
typedef WSABUF db_iovec_t;
#else
typedef int socket_t;
typedef pthread_t thread_id_t;
typedef pthread_mutex_t mgr_mutex_t;
typedef pthread_cond_t cond_var_t;
typedef pthread_cond_t waiter_t;
typedef struct iovec db_iovec_t;
#endif

/*
 * The (arbitrary) maximum number of outgoing messages we're willing to hold, on
 * a queue per connection, waiting for TCP buffer space to become available in
 * the kernel.  Rather than exceeding this limit, we simply discard additional
 * messages (since this is always allowed by the replication protocol).
 *    As a special dispensation, if a message is destined for a specific remote
 * site (i.e., it's not a broadcast), then we first try blocking the sending
 * thread, waiting for space to become available (though we only wait a limited
 * time).  This is so as to be able to handle the immediate flood of (a
 * potentially large number of) outgoing messages that replication generates, in
 * a tight loop, when handling PAGE_REQ, LOG_REQ and ALL_REQ requests.
 */
#define	OUT_QUEUE_LIMIT	10
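
/*
 * A minimal sketch of the queueing decision described above.  The helper
 * names enqueue_msg() and wait_for_drain() are hypothetical; the real code
 * also has to cope with connection failure while waiting.
 *
 *	if (conn->out_queue_length < OUT_QUEUE_LIMIT)
 *		ret = enqueue_msg(env, conn, msg);
 *	else if (!is_broadcast)
 *		ret = wait_for_drain(env, conn);
 *	else
 *		ret = 0;
 *
 * The last arm simply drops the message, which the replication protocol
 * tolerates; the middle arm waits only a bounded time for "drained".
 */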

/*
 * The system value is available from sysconf(_SC_HOST_NAME_MAX).
 * Historically, the maximum host name was 256.
 */
#ifndef MAXHOSTNAMELEN
#define	MAXHOSTNAMELEN	256
#endif

/* A buffer big enough for the string "site host.domain.com:65535". */
#define	MAX_SITE_LOC_STRING (MAXHOSTNAMELEN+20)
typedef char SITE_STRING_BUFFER[MAX_SITE_LOC_STRING+1];
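
/*
 * For example (illustrative only), a site location string might be formatted
 * into such a buffer like this, assuming "host" and "port" variables:
 *
 *	SITE_STRING_BUFFER buffer;
 *
 *	(void)snprintf(buffer, sizeof(buffer),
 *	    "site %s:%u", host, (u_int)port);
 */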

#define	MAX_MSG_BUF	(__REPMGR_MAXMSG_SIZE + MAXHOSTNAMELEN + 1)

/* Default timeout values, in microseconds. */
#define	DB_REPMGR_DEFAULT_ACK_TIMEOUT		(1 * US_PER_SEC)
#define	DB_REPMGR_DEFAULT_CONNECTION_RETRY	(30 * US_PER_SEC)
#define	DB_REPMGR_DEFAULT_ELECTION_RETRY	(10 * US_PER_SEC)
#define	DB_REPMGR_DEFAULT_CHANNEL_TIMEOUT	(5 * US_PER_SEC)

typedef TAILQ_HEAD(__repmgr_conn_list, __repmgr_connection) CONNECTION_LIST;
typedef STAILQ_HEAD(__repmgr_out_q_head, __queued_output) OUT_Q_HEADER;
typedef TAILQ_HEAD(__repmgr_retry_q, __repmgr_retry) RETRY_Q_HEADER;
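
/*
 * These list heads use the BSD-style TAILQ/STAILQ macros used throughout DB,
 * so a traversal follows the usual queue(3) idiom.  A sketch (the "site"
 * variable is assumed to be a REPMGR_SITE *):
 *
 *	REPMGR_CONNECTION *conn;
 *
 *	TAILQ_FOREACH(conn, &site->sub_conns, entries) {
 *		... examine or act on each subordinate connection ...
 *	}
 */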

/* Information about threads managed by Replication Framework. */
struct __repmgr_runnable {
	ENV *env;
	thread_id_t thread_id;
	void *(*run) __P((void *));
	int finished;		/* Boolean: thread is exiting, may be joined. */
	int quit_requested;	/* Boolean: thread has been asked to quit. */
#ifdef DB_WIN32
	HANDLE quit_event;
#endif
	union {

/*
 * Options governing requested behavior of election thread.
 */
#define	ELECT_F_EVENT_NOTIFY	0x01 /* Notify application of master failure. */
#define	ELECT_F_FAST		0x02 /* First election "fast" (n-1 trick). */
#define	ELECT_F_IMMED		0x04 /* Start with immediate election. */
#define	ELECT_F_INVITEE		0x08 /* Honor (remote) inviter's nsites. */
#define	ELECT_F_STARTUP		0x10 /* Observe repmgr_start() policy. */
		u_int32_t flags;

		int eid;	/* For Connector thread. */

		/*
		 * Args for other thread types can be added here in the future
		 * as needed.
		 */
	} args;
};

/*
 * Information about pending connection establishment retry operations.
 *
 * We keep these in order by time.  This works, under the assumption that the
 * DB_REP_CONNECTION_RETRY never changes once we get going (though that
 * assumption is of course wrong, so this needs to be fixed).
 *
 * Usually, we put things onto the tail end of the list.  But when we add a new
 * site while threads are running, we trigger its first connection attempt by
 * scheduling a retry for "0" microseconds from now, putting its retry element
 * at the head of the list instead.
 *
 * TODO: I think this can be fixed by defining "time" to be the time the element
 * was added (with some convention like "0" meaning immediate), rather than the
 * deadline time.
 */
struct __repmgr_retry {
	TAILQ_ENTRY(__repmgr_retry) entries;
	int eid;
	db_timespec time;
};

/*
 * We use scatter/gather I/O for both reading and writing.  Repmgr messages
 * (including rep messages) use 3 segments: envelope, control and rec.
 * Application messages can have any number of segments (the number they
 * specify, plus 1 for our envelope).  REPMGR_IOVECS_ALLOC_SZ should (only) be
 * used when n > 3.
 */
#define	REPMGR_IOVECS_ALLOC_SZ(n) \
	(sizeof(REPMGR_IOVECS) + ((n) - MIN_IOVEC) * sizeof(db_iovec_t))
typedef struct {
	/*
	 * Index of the first iovec to be used.  Initially of course this is
	 * zero.  But as we progress through partial I/O transfers, it ends up
	 * pointing to the first iovec to be used on the next operation.
	 */
	int offset;

	/*
	 * Total number of pieces defined for this message; equal to the number
	 * of times add_buffer and/or add_dbt were called to populate it.  We do
	 * *NOT* revise this as we go along.  So subsequent I/O operations must
	 * use count-offset to get the number of active vector pieces still
	 * remaining.
	 */
	int count;

	/*
	 * Total number of bytes accounted for in all the pieces of this
	 * message.  We do *NOT* revise this as we go along.
	 */
	size_t total_bytes;

#define	MIN_IOVEC	3
	db_iovec_t vectors[MIN_IOVEC];	/* Variable length array. */
} REPMGR_IOVECS;
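
/*
 * Allocation sketch (illustrative; __os_malloc is DB's internal allocator,
 * and the segment count of 5 is just an example): an application message
 * carrying 4 user segments plus our envelope needs 5 iovecs, which exceeds
 * the MIN_IOVEC slots built into the struct.
 *
 *	REPMGR_IOVECS *iovecs;
 *	int nseg, ret;
 *
 *	nseg = 5;
 *	if ((ret = __os_malloc(env,
 *	    nseg <= MIN_IOVEC ? sizeof(REPMGR_IOVECS) :
 *	    REPMGR_IOVECS_ALLOC_SZ(nseg), &iovecs)) != 0)
 *		return (ret);
 */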

typedef struct {
	size_t length;		/* number of bytes in data */
	int ref_count;		/* # of sites' send queues pointing to us */
	u_int8_t data[1];	/* variable size data area */
} REPMGR_FLAT;

struct __queued_output {
	STAILQ_ENTRY(__queued_output) entries;
	REPMGR_FLAT *msg;
	size_t offset;
};

/*
 * The following is for input.  Once we know the sizes of the pieces of an
 * incoming message, we can create this struct (and also the data areas for the
 * pieces themselves, in the same memory allocation).  This is also the struct
 * in which the message lives while it's waiting to be processed by message
 * threads.
 */
typedef struct __repmgr_message {
	STAILQ_ENTRY(__repmgr_message) entries;
	__repmgr_msg_hdr_args msg_hdr;
	union {
		struct {
			int originating_eid;
			DBT control, rec;
		} repmsg;
		struct {
			REPMGR_CONNECTION *conn;
			DBT request;
		} gmdb_msg;
		struct {
			/*
			 * Connection from which the message arrived; NULL if
			 * generated on the local site.
			 */
			REPMGR_CONNECTION *conn;

			DBT buf; /* for reading */
			DBT segments[1]; /* expanded in msg th. before callbk */
		} appmsg;
	} v;			/* Variants */
} REPMGR_MESSAGE;

typedef enum {
	SIZES_PHASE,
	DATA_PHASE
} phase_t;

typedef enum {
	APP_CONNECTION,
	REP_CONNECTION,
	UNKNOWN_CONN_TYPE
} conn_type_t;

struct __repmgr_connection {
	TAILQ_ENTRY(__repmgr_connection) entries;

	socket_t fd;
#ifdef DB_WIN32
	WSAEVENT event_object;
#endif

	/*
	 * Number of other structures referring to this conn struct.  This
	 * ref_count must be reduced to zero before this conn struct can be
	 * destroyed.  Referents include:
	 *
	 * - the select() loop, which owns the right to do all reading, as well
	 *   as the exclusive right to eventually close the socket
	 *
	 * - a "channel" that owns this APP_CONNECTION (on the originating side)
	 *
	 * - a message received on this APP_CONNECTION, queued for processing
	 *
	 * - any writer blocked on waiting for the outbound queue to drain
	 */
	u_int32_t	ref_count;

	conn_type_t type;
	u_int32_t version;	/* Wire protocol version on this connection. */
				/* (0 means not yet determined.) */

/*
 * When we make an outgoing connection, it starts in CONNECTED state.  When we
 * get the response to our version negotiation, we move to READY.
 *     For incoming connections that we accept, we start in NEGOTIATE, then to
 * PARAMETERS, and then to READY.
 *     CONGESTED is a hierarchical substate of READY: it's just like READY, with
 * the additional wrinkle that we don't bother waiting for the outgoing queue to
 * drain in certain circumstances.
 */
#define	CONN_CONGESTED	1	/* Long-lived full outgoing queue. */
#define	CONN_CONNECTED	2	/* Awaiting reply to our version negotiation. */
#define	CONN_DEFUNCT	3	/* Basically dead, awaiting clean-up. */
#define	CONN_NEGOTIATE	4	/* Awaiting version proposal. */
#define	CONN_PARAMETERS	5	/* Awaiting parameters handshake. */
#define	CONN_READY	6	/* Everything's fine. */
	int state;

	/*
	 * Input: while we're reading a message, we keep track of what phase
	 * we're in.  In both phases, we use a REPMGR_IOVECS to keep track of
	 * our progress within the phase.  Depending upon the message type, we
	 * end up with either a rep_message (which is a wrapper for the control
	 * and rec DBTs), or a single generic DBT.
	 *     Any time we're in DATA_PHASE, it means we have already received
	 * the message header (consisting of msg_type and 2 sizes), and
	 * therefore we have allocated buffer space to read the data.  (This is
	 * important for resource clean-up.)
	 */
	phase_t		reading_phase;
	REPMGR_IOVECS iovecs;

	u_int8_t	msg_type;
	u_int8_t	msg_hdr_buf[__REPMGR_MSG_HDR_SIZE];

	union {
		REPMGR_MESSAGE *rep_message;
		struct {
			DBT cntrl, rec;
		} repmgr_msg;
	} input;

	/*
	 * Output: usually we simply write messages right in line, in the
	 * send() function's thread.  But if TCP doesn't have enough network
	 * buffer space for us when we first try it, we instead allocate some
	 * memory, copy the message, and then send it as space becomes
	 * available in our main select() thread.  In some cases, if the queue
	 * gets too long we wait until it's drained, and then append to it.
	 * This condition variable's associated mutex is the normal per-repmgr
	 * db_rep->mutex, because that mutex is always held anyway whenever the
	 * output queue is consulted.
	 */
	OUT_Q_HEADER outbound_queue;
	int out_queue_length;
	cond_var_t drained;

	/* =-=-=-=-= app-channel stuff =-=-=-=-= */
	waiter_t	response_waiters;

	/*
	 * Array of info about pending responses to requests.  This info is here
	 * (rather than on the stack of the thread calling send_request())
	 * because it provides an easy way to allocate available numbers for
	 * message tags, and also so that we can easily find the right info when
	 * we get the tag back in the msg header of the response.
	 */
	REPMGR_RESPONSE *responses;
	u_int32_t	aresp;	/* Array size. */
	u_int32_t	cur_resp; /* Index of response currently being read. */

	/* =-=-=-=-= for normal repmgr connections =-=-=-=-= */
	/*
	 * Generally on a REP_CONNECTION type, we have an associated EID (which
	 * is an index into the sites array, by the way).  When we initiate the
	 * connection ("outgoing"), we know from the start what the EID is; the
	 * connection struct is linked from the site struct.  On the other hand,
	 * when we receive an incoming connection, we don't know at first what
	 * site it may be associated with (or even whether it's an
	 * APP_CONNECTION or REP_CONNECTION, for that matter).  During that
	 * initial uncertain time, the eid is -1.  Also, when a connection
	 * becomes defunct, but the conn struct hasn't yet been destroyed, the
	 * eid also becomes -1.
	 *
	 * The eid should be -1 if and only if the connection is on the orphans
	 * list.
	 */
	int eid;
};

#define	IS_READY_STATE(s)	((s) == CONN_READY || (s) == CONN_CONGESTED)

#ifdef HAVE_GETADDRINFO
typedef struct addrinfo	ADDRINFO;
typedef struct sockaddr_storage ACCEPT_ADDR;
#else
typedef struct sockaddr_in ACCEPT_ADDR;
/*
 * Some Windows platforms have getaddrinfo (Windows XP), some don't.  We don't
 * support conditional compilation in our Windows build, so we always use our
 * own getaddrinfo implementation.  Rename everything so that we don't collide
 * with the system libraries.
 */
#undef	AI_PASSIVE
#define	AI_PASSIVE	0x01
#undef	AI_CANONNAME
#define	AI_CANONNAME	0x02
#undef	AI_NUMERICHOST
#define	AI_NUMERICHOST	0x04

typedef struct __addrinfo {
	int ai_flags;		/* AI_PASSIVE, AI_CANONNAME, AI_NUMERICHOST */
	int ai_family;		/* PF_xxx */
	int ai_socktype;	/* SOCK_xxx */
	int ai_protocol;	/* 0 or IPPROTO_xxx for IPv4 and IPv6 */
	size_t ai_addrlen;	/* length of ai_addr */
	char *ai_canonname;	/* canonical name for nodename */
	struct sockaddr *ai_addr;	/* binary address */
	struct __addrinfo *ai_next;	/* next structure in linked list */
} ADDRINFO;
#endif /* HAVE_GETADDRINFO */

/*
 * Unprocessed network address configuration.
 */
typedef struct {
	roff_t host;		/* Separately allocated copy of string. */
	u_int16_t port;		/* Stored in plain old host-byte-order. */
} SITEADDR;

/*
 * Site information, as stored in shared region.
 */
typedef struct {
	SITEADDR addr;		/* Unprocessed network address of site. */
	u_int32_t config;	/* Configuration flags: peer, helper, etc. */
	u_int32_t status;	/* Group membership status. */
} SITEINFO;

/*
 * A site address, as stored locally.
 */
typedef struct {
	char *host;		/* Separately allocated copy of string. */
	u_int16_t port;		/* Stored in plain old host-byte-order. */
} repmgr_netaddr_t;

/*
 * We store site structs in a dynamically allocated, growable array, indexed by
 * EID.  We allocate EID numbers for all sites simply according to their
 * index within this array.
 */
#define	SITE_FROM_EID(eid)	(&db_rep->sites[eid])
#define	EID_FROM_SITE(s)	((int)((s) - (&db_rep->sites[0])))
#define	IS_VALID_EID(e)		((e) >= 0)
#define	IS_KNOWN_REMOTE_SITE(e)	((e) >= 0 && ((e) != db_rep->self_eid) && \
	    (((u_int)(e)) < db_rep->site_cnt))
#define	FOR_EACH_REMOTE_SITE_INDEX(i)			\
	for ((i) = (db_rep->self_eid == 0 ? 1 : 0);	\
	     ((u_int)i) < db_rep->site_cnt;		\
	     (int)(++(i)) == db_rep->self_eid ? ++(i) : i)
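
/*
 * Typical traversal (sketch only; assumes the usual "db_rep" local and a
 * REPMGR_SITE *site variable):
 *
 *	u_int i;
 *
 *	FOR_EACH_REMOTE_SITE_INDEX(i) {
 *		site = SITE_FROM_EID(i);
 *		... do something with each remote site ...
 *	}
 */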

struct __repmgr_site {
	repmgr_netaddr_t net_addr;

	/*
	 * Group membership status: a copy of the status from the membership
	 * database, or the out-of-band value 0, meaning that it doesn't exist.
	 * We keep track of a "non-existent" site because the associated
	 * host/port network address is promised to be associated with the
	 * locally known EID for the life of the environment.
	 */
	u_int32_t	membership; /* Status flags from GMDB. */
	u_int32_t	config;	    /* Flags from site->set_config() */

	/*
	 * Everything below here is applicable only to remote sites.
	 */
	DB_LSN max_ack;		/* Best ack we've heard from this site. */
	int ack_policy;		/* Or 0 if unknown. */
	u_int16_t alignment;	/* Requirements for app channel msgs. */
	db_timespec last_rcvd_timestamp;

	/* Contents depend on state. */
	struct {
		struct {		 /* when CONNECTED */
			/*
			 * The only time we ever have two connections is in case
			 * of a "collision" on the "server" side.  In that case,
			 * the incoming connection either will be closed
			 * promptly by the remote "client", or it is a half-open
			 * connection due to the remote client system having
			 * crashed and rebooted, in which case KEEPALIVE will
			 * eventually clear it.
			 */
			REPMGR_CONNECTION *in; /* incoming connection */
			REPMGR_CONNECTION *out; /* outgoing connection */
		} conn;
		REPMGR_RETRY *retry; /* when PAUSING */
		/* Unused when CONNECTING. */
	} ref;

	/*
	 * Subordinate connections (connections from subordinate processes at a
	 * multi-process site).  Note that the SITE_CONNECTED state, and all the
	 * ref.retry stuff above is irrelevant to subordinate connections.  If a
	 * connection is on this list, it exists; and we never bother trying to
	 * reconnect lost connections (indeed we can't, for these are always
	 * incoming-only).
	 */
	CONNECTION_LIST	sub_conns;
	REPMGR_RUNNABLE	*connector;	/* Thread to open a connection. */

#define	SITE_CONNECTED 1	/* We have a (main) connection. */
#define	SITE_CONNECTING 2	/* Trying to establish (main) connection. */
#define	SITE_IDLE 3		/* Doing nothing. */
#define	SITE_PAUSING 4		/* Waiting until time to retry connecting. */
	int state;

#define	SITE_HAS_PRIO	0x01	/* Set if "electable" flag bit is valid. */
#define	SITE_ELECTABLE	0x02
#define	SITE_TOUCHED	0x04	/* Seen GMDB record during present scan. */
	u_int32_t flags;
};

/*
 * Flag values for the public DB_SITE handle.
 */
#define	DB_SITE_PREOPEN	0x01	/* Provisional EID; may change at env open. */

struct __repmgr_response {
	DBT		dbt;
	int		ret;

#define	RESP_COMPLETE		0x01
#define	RESP_DUMMY_BUF		0x02
#define	RESP_IN_USE		0x04
#define	RESP_READING		0x08
#define	RESP_THREAD_WAITING	0x10
	u_int32_t	flags;
};

/*
 * Private structure for managing comms "channels."  This is separate from
 * DB_CHANNEL so as to avoid dragging other private structures (e.g.,
 * REPMGR_CONNECTION) into db.h, similar to the relationship between DB_ENV and
 * ENV.
 */
struct __channel {
	DB_CHANNEL *db_channel;
	ENV *env;

	union {
		/* For simple, specific-EID channels. */
		REPMGR_CONNECTION *conn;

		/* For EID_MASTER or EID_BROADCAST channels. */
		struct {
			mgr_mutex_t *mutex;  /* For connection establishment. */
			REPMGR_CONNECTION **array;
			u_int32_t cnt;
		} conns;
	} c;
	REPMGR_MESSAGE *msg;	/* Incoming channel only; NULL otherwise. */
	int	responded;	/* Boolean flag. */
	__repmgr_msg_metadata_args *meta;

	/* Used only in send-to-self request case. */
	struct __repmgr_response	response;
};

/*
 * Repmgr keeps track of references to connection information (instances
 * of struct __repmgr_connection).  There are three kinds of places
 * connections may be found: (1) SITE->ref.conn, (2) SITE->sub_conns, and
 * (3) db_rep->connections.
 *
 * 1. SITE->ref.conn points to our connection with the main process running
 * at the given site, if such a connection exists.  We may have initiated
 * the connection to the site ourselves, or we may have received it as an
 * incoming connection.  Once it is established there is very little
 * difference between those two cases.
 *
 * 2. SITE->sub_conns is a list of connections we have with subordinate
 * processes running at the given site.  There can be any number of these
 * connections, one per subordinate process.  Note that these connections
 * are always incoming: there's no way for us to initiate this kind of
 * connection because subordinate processes do not "listen".
 *
 * 3. The db_rep->connections list contains the references to any
 * connections that are not actively associated with any site (we
 * sometimes call these "orphans").  There are two situations in which
 * this can happen:
 *
 *   a) When we accept an incoming connection, we don't know what site it
 *      comes from until we read the initial handshake message.
 *
 *   b) When an error occurs on a connection, we first mark it as DEFUNCT
 *      and stop using it.  Then, at a later, well-defined time, we close
 *      the connection's file descriptor and get rid of the connection
 *      struct.
 *
 * In light of the above, we can see that the following describes the
 * rules for how connections may be moved among these three kinds of
 * "places":
 *
 * - when we initiate an outgoing connection, we of course know what site
 *   it's going to, and so we immediately put the pointer to the
 *   connection struct into SITE->ref.conn
 *
 * - when we accept an incoming connection, we don't immediately know
 *   whom it's from, so we have to put it on the orphans list
 *   (db_rep->connections).
 *
 * - (incoming, cont.) But as soon as we complete the initial "handshake"
 *   message exchange, we will know which site it's from and whether it's
 *   a subordinate or main connection.  At that point we remove it from
 *   db_rep->connections and either point to it by SITE->ref.conn, or add
 *   it to the SITE->sub_conns list.
 *
 * - (for any active connection) when an error occurs, we move the
 *   connection to the orphans list until we have a chance to close it.
 */
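
/*
 * For instance (sketch only, error handling omitted), once an incoming
 * handshake identifies the peer as a site's main process, the move from the
 * orphans list into place looks roughly like:
 *
 *	TAILQ_REMOVE(&db_rep->connections, conn, entries);
 *	site = SITE_FROM_EID(eid);
 *	conn->eid = eid;
 *	site->ref.conn.in = conn;
 */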

/*
 * Repmgr message formats.
 *
 * Declarative definitions of current message formats appear in repmgr.msg.
 * (The s_message/gen_msg.awk utility generates C code.)  In general, we send
 * the buffers marshaled from those structure formats in the "control" portion
 * of a message.
 *
 * Each message is prefaced by a 9-byte message header (as described in
 * repmgr_net.c).  Different message types use the two available 32-bit integers
 * in different ways, as codified here:
 */
#define	REPMGR_HDR1(hdr)		((hdr).word1)
#define	REPMGR_HDR2(hdr)		((hdr).word2)

/* REPMGR_APP_MESSAGE */
#define	APP_MSG_BUFFER_SIZE		REPMGR_HDR1
#define	APP_MSG_SEGMENT_COUNT		REPMGR_HDR2

/* REPMGR_REP_MESSAGE and the other traditional repmgr message types. */
#define	REP_MSG_CONTROL_SIZE		REPMGR_HDR1
#define	REP_MSG_REC_SIZE		REPMGR_HDR2

/* REPMGR_APP_RESPONSE */
#define	APP_RESP_BUFFER_SIZE		REPMGR_HDR1
#define	APP_RESP_TAG			REPMGR_HDR2

/* REPMGR_RESP_ERROR.  Note that a zero-length message body is implied. */
#define	RESP_ERROR_CODE			REPMGR_HDR1
#define	RESP_ERROR_TAG			REPMGR_HDR2

/* REPMGR_OWN_MSG */
#define	REPMGR_OWN_BUF_SIZE		REPMGR_HDR1
#define	REPMGR_OWN_MSG_TYPE		REPMGR_HDR2
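
/*
 * These aliases simply select word1 or word2 of the header for the message
 * type at hand.  For example (illustrative only), filling in the two header
 * words for an application message might look like:
 *
 *	__repmgr_msg_hdr_args msg_hdr;
 *
 *	APP_MSG_BUFFER_SIZE(msg_hdr) = (u_int32_t)bytes_needed;
 *	APP_MSG_SEGMENT_COUNT(msg_hdr) = (u_int32_t)nsegments;
 *
 * where "bytes_needed" and "nsegments" stand in for values computed by the
 * caller.
 */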

/*
 * Flags for the handshake message.  As with repmgr message types, these values
 * are transmitted between sites, and must therefore be "frozen" permanently.
 * Names are alphabetized here for easy reference, but values reflect historical
 * usage.
 */
#define	APP_CHANNEL_CONNECTION	0x02	/* Connection used for app channel. */
#define	ELECTABLE_SITE		0x04
#define	REPMGR_SUBORDINATE	0x01	/* This is a subordinate connection. */

/*
 * Flags for application-message meta-data.
 */
#define	REPMGR_MULTI_RESP	0x01
#define	REPMGR_REQUEST_MSG_TYPE	0x02
#define	REPMGR_RESPONSE_LIMIT	0x04

/*
 * Legacy V1 handshake message format.  For compatibility, we send this as part
 * of version negotiation upon connection establishment.
 */
typedef struct {
	u_int32_t version;
	u_int16_t port;
	u_int32_t priority;
} DB_REPMGR_V1_HANDSHAKE;

/*
 * Storage formats.
 *
 * As with message formats, stored formats are defined in repmgr.msg.
 */
/*
 * Flags for the Group Membership data portion of a record.  Like message type
 * codes, these values are frozen across releases, in order to avoid pointless
 * churn.
 */
#define	SITE_ADDING	0x01
#define	SITE_DELETING	0x02
#define	SITE_PRESENT	0x04

/*
 * Message types whose processing could take a long time.  We're careful to
 * avoid using up all our message processing threads on these message types, so
 * that we don't starve out the more important rep messages.
 */
#define	IS_DEFERRABLE(t) ((t) == REPMGR_OWN_MSG || (t) == REPMGR_APP_MESSAGE)
/*
 * When using leases there are times when a thread processing a message
 * must block, waiting for leases to be refreshed.  But refreshing the
 * leases requires another thread to accept the lease grant messages.
 */
#define	RESERVED_MSG_TH(env) (IS_USING_LEASES(env) ? 2 : 1)
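
/*
 * A sketch of how these two macros could cooperate in a dispatching decision.
 * The "avail" count of idle message threads and the helper names
 * defer_processing() and dispatch_now() are hypothetical, not part of this
 * header.
 *
 *	if (IS_DEFERRABLE(msg_type) && avail <= RESERVED_MSG_TH(env))
 *		ret = defer_processing(env, msg);
 *	else
 *		ret = dispatch_now(env, msg);
 */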

#define	IS_SUBORDINATE(db_rep)	(db_rep->listen_fd == INVALID_SOCKET)

#define	IS_PEER_POLICY(p) ((p) == DB_REPMGR_ACKS_ALL_PEERS ||		\
    (p) == DB_REPMGR_ACKS_QUORUM ||		\
    (p) == DB_REPMGR_ACKS_ONE_PEER)

/*
 * Most of the code in repmgr runs while holding repmgr's main mutex, which
 * resides in db_rep->mutex.  This mutex is owned by a single repmgr process,
 * and serializes access to the (large) critical sections among threads in the
 * process.  Unlike many other mutexes in DB, it is specifically coded as either
 * a POSIX threads mutex or a Win32 mutex.  Note that although it's a large
 * fraction of the code, it's a tiny fraction of the time: repmgr spends most of
 * its time in a call to select(), and as well a bit in calls into the Base
 * replication API.  All of those release the mutex.
 *     Access to repmgr's shared list of site addresses is protected by
 * another mutex: mtx_repmgr.  And, when changing space allocation for that site
 * list we conform to the convention of acquiring renv->mtx_regenv.  These are
 * less frequent of course.
 *     When it's necessary to acquire more than one of these mutexes, the
 * ordering priority (or "lock ordering protocol") is:
 *        db_rep->mutex (first)
 *        mtx_repmgr    (briefly)
 *        mtx_regenv    (last, and most briefly)
 *
 * There are also mutexes for app message "channels".  Each channel has a mutex,
 * which is used to serialize any connection re-establishment that may become
 * necessary during its lifetime (such as when a master changes).  This never
 * happens on a simple, specific-EID channel, but in other cases multiple app
 * threads could be making send_xxx() calls concurrently, and it would not do to
 * have two of them try to re-connect concurrently.
 *     When re-establishing a connection, the channel lock is held while
 * grabbing first the mtx_repmgr, and then the db_rep mutex (but not both
 * together).  I.e., we have:
 *        channel->mutex (first)
 *        [mtx_repmgr (very briefly)] and then [db_rep->mutex (very briefly)]
 */

#define	LOCK_MUTEX(m) do {						\
	if (__repmgr_lock_mutex(m) != 0)				\
		return (DB_RUNRECOVERY);				\
} while (0)

#define	UNLOCK_MUTEX(m) do {						\
	if (__repmgr_unlock_mutex(m) != 0)			\
		return (DB_RUNRECOVERY);				\
} while (0)
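
/*
 * Because these macros "return" on failure, they can only appear in functions
 * whose return type is int.  A minimal usage sketch (the function name is
 * illustrative only; db_rep->mutex is the main repmgr mutex described above):
 *
 *	static int
 *	__repmgr_example_op(ENV *env)
 *	{
 *		DB_REP *db_rep;
 *		int ret;
 *
 *		db_rep = env->rep_handle;
 *		ret = 0;
 *		LOCK_MUTEX(db_rep->mutex);
 *		... critical section, observing the lock ordering above ...
 *		UNLOCK_MUTEX(db_rep->mutex);
 *		return (ret);
 *	}
 */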

/* POSIX/Win32 socket (and other) portability. */
#ifdef DB_WIN32
#define	WOULDBLOCK		WSAEWOULDBLOCK
#undef	DB_REPMGR_EAGAIN

#define	net_errno		WSAGetLastError()
typedef int socklen_t;
typedef char * sockopt_t;
#define	sendsocket(s, buf, len, flags) send((s), (buf), (int)(len), (flags))

#define	iov_len len
#define	iov_base buf

typedef DWORD threadsync_timeout_t;

#define	REPMGR_INITED(db_rep) (db_rep->signaler != NULL)
#else

#define	INVALID_SOCKET		-1
#define	SOCKET_ERROR		-1
#define	WOULDBLOCK		EWOULDBLOCK
#define	DB_REPMGR_EAGAIN	EAGAIN

#define	net_errno		errno
typedef void * sockopt_t;

#define	sendsocket(s, buf, len, flags) send((s), (buf), (len), (flags))
#define	closesocket(fd)		close(fd)

typedef struct timespec threadsync_timeout_t;

#define	REPMGR_INITED(db_rep) (db_rep->read_pipe >= 0)
#endif
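
/*
 * These wrappers let the networking code be written once for both platforms.
 * A sketch of a portable non-blocking send check (variable names are
 * illustrative; on Windows SOCKET_ERROR comes from the system headers):
 *
 *	if (sendsocket(fd, buf, len, 0) == SOCKET_ERROR &&
 *	    net_errno == WOULDBLOCK) {
 *		... queue the unsent bytes for the select() thread ...
 *	}
 */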

#define	SELECTOR_RUNNING(db_rep)	((db_rep)->selector != NULL)

/*
 * Generic definition of some action to be performed on each connection, in the
 * form of a call-back function.
 */
typedef int (*CONNECTION_ACTION) __P((ENV *, REPMGR_CONNECTION *, void *));

/*
 * Generic predicate to test a condition that a thread is waiting for.
 */
typedef int (*PREDICATE) __P((ENV *, void *));

#include "dbinc_auto/repmgr_ext.h"

#if defined(__cplusplus)
}
#endif
#endif /* !_DB_REPMGR_H_ */