1 /*-
2  * Copyright (c) 2001, 2020 Oracle and/or its affiliates.  All rights reserved.
3  *
4  * See the file LICENSE for license information.
5  *
6  * $Id$
7  */
8 
9 #ifndef _DB_REP_H_
10 #define	_DB_REP_H_
11 
12 #include "dbinc_auto/rep_automsg.h"
13 
14 #if defined(__cplusplus)
15 extern "C" {
16 #endif
17 
/*
 * Names of client temp databases.
 *
 * All three database names below begin with the REPFILEPREFIX string.
 * NOTE(review): the blob/log-record/page databases presumably correspond
 * to the repdb_t values REP_BLOB, REP_DB and REP_PG -- confirm against
 * __rep_client_dbinit.
 */
#define	REPFILEPREFIX	"__db.rep"
#define	REPBLOBNAME	"__db.rep.blob.db"
#define	REPDBNAME	"__db.rep.db"
#define	REPPAGENAME     "__db.reppg.db"
25 
/*
 * REPSYSDBNAME is the name of the replicated system database file and
 * REPLSNHIST is the LSN history subdatabase within it.  If the INMEM
 * config flag is set, the database is created in memory under the
 * REPLSNHIST name, which is why that name also follows the __db naming
 * convention.
 *
 * IS_REP_FILE(name) is non-zero only when name compares equal (strcmp)
 * to REPSYSDBNAME.
 */
#define	REPSYSDBNAME	"__db.rep.system"
#define	REPLSNHIST	"__db.lsn.history"
#define	REPMEMBERSHIP	"__db.membership"
#define	REPSYSDBPGSZ	1024
#define	IS_REP_FILE(name)	(0 == strcmp(name, REPSYSDBNAME))
37 
/*
 * Current version of commit token format, and LSN history database format.
 * Each format carries its own version number; both are currently 1.
 */
#define	REP_COMMIT_TOKEN_FMT_VERSION	1
#define	REP_LSN_HISTORY_FMT_VERSION	1
41 
/*
 * Message types
 *
 * The values are consecutive integers starting at 0 (REP_INVALID) and
 * ending at REP_VOTE2, which must equal REP_MAX_MSG below.
 */
#define	REP_INVALID		0	/* Invalid message type. */
#define	REP_ALIVE		1	/* I am alive message. */
#define	REP_ALIVE_REQ		2	/* Request for alive messages. */
#define	REP_ALL_REQ		3	/* Request all log records greater than
					 * LSN. */
#define	REP_BLOB_ALL_REQ	4	/* Request all the given blob files. */
#define	REP_BLOB_CHUNK		5	/* A piece of data contained in a blob
					 * file. */
#define	REP_BLOB_CHUNK_REQ	6	/* Request a piece of data from a blob
					 * file. */
#define	REP_BLOB_UPDATE		7	/* A list of blob files for a
					 * database. */
#define	REP_BLOB_UPDATE_REQ	8	/* Request blob files. */
#define	REP_BULK_LOG		9	/* Bulk transfer of log records. */
#define	REP_BULK_PAGE		10	/* Bulk transfer of pages. */
#define	REP_DUPMASTER		11	/* Duplicate master detected;
					 * propagate. */
#define	REP_FILE		12	/* Page of a database file. NOTUSED */
#define	REP_FILE_FAIL		13	/* File requested does not exist. */
#define	REP_FILE_REQ		14	/* Request for a database file.
					 * NOTUSED */
#define	REP_LEASE_GRANT		15	/* Client grants a lease to a master. */
#define	REP_LOG			16	/* Log record. */
#define	REP_LOG_MORE		17	/* There are more log records to
					 * request. */
#define	REP_LOG_REQ		18	/* Request for a log record. */
#define	REP_MASTER_REQ		19	/* Who is the master */
#define	REP_NEWCLIENT		20	/* Announces the presence of a new
					 * client. */
#define	REP_NEWFILE		21	/* Announce a log file change. */
#define	REP_NEWMASTER		22	/* Announces who the master is. */
#define	REP_NEWSITE		23	/* Announces that a site has heard from
					 * a new site; like NEWCLIENT, but
					 * indirect.  A NEWCLIENT message comes
					 * directly from the new client while a
					 * NEWSITE comes indirectly from
					 * someone who heard about a NEWSITE. */
#define	REP_PAGE		24	/* Database page. */
#define	REP_PAGE_FAIL		25	/* Requested page does not exist. */
#define	REP_PAGE_MORE		26	/* There are more pages to request. */
#define	REP_PAGE_REQ		27	/* Request for a database page. */
#define	REP_REREQUEST		28	/* Force rerequest. */
#define	REP_START_SYNC		29	/* Tell client to begin syncing a ckp. */
#define	REP_UPDATE		30	/* Environment hotcopy information. */
#define	REP_UPDATE_REQ		31	/* Request for hotcopy information. */
#define	REP_VERIFY		32	/* A log record for verification. */
#define	REP_VERIFY_FAIL		33	/* The client is outdated. */
#define	REP_VERIFY_REQ		34	/* Request for a log record to
					 * verify. */
#define	REP_VOTE1		35	/* Send out your information for an
					 * election. */
#define	REP_VOTE2		36	/* Send a "you are master" vote. */
/*
 * Maximum message number for conversion tables.  Update this
 * value as the largest message number above increases.
 * It might make processing messages more straightforward if
 * the *_MORE and BULK* messages were flags within the regular
 * message type instead of separate message types themselves.
 *
 * !!!
 * NOTE: When changing messages above, the two tables for upgrade support
 * need adjusting.  They are in rep_util.c.
 */
#define	REP_MAX_MSG	36
109 
/*
 * This is the list of client-to-client request messages.
 * We use this to decide if we're doing client-to-client and
 * might need to send a rerequest.
 *
 * Evaluates to non-zero when rectype is one of REP_ALL_REQ,
 * REP_BLOB_ALL_REQ, REP_BLOB_CHUNK_REQ, REP_LOG_REQ, REP_PAGE_REQ or
 * REP_VERIFY_REQ, and to zero otherwise.
 *
 * The argument is parenthesized at every use so that an expression
 * argument (for example "type & mask" or a conditional expression) is
 * compared as a whole.  It is still evaluated more than once, so it
 * must not have side effects.
 */
#define	REP_MSG_REQ(rectype)			\
    ((rectype) == REP_ALL_REQ ||		\
    (rectype) == REP_BLOB_ALL_REQ ||		\
    (rectype) == REP_BLOB_CHUNK_REQ ||		\
    (rectype) == REP_LOG_REQ ||			\
    (rectype) == REP_PAGE_REQ ||		\
    (rectype) == REP_VERIFY_REQ)
122 
/*
 * This is the default value for the FD_SET_SIZE.  It is used when the
 * client doesn't provide a value, or when the client-provided value is
 * smaller than this default.
 */
#define	REPMGR_FD_SET_DEFAULT_SIZE	1024
128 
/*
 * Note that the version information should be at the beginning of the
 * structure, so that we can rearrange the rest of it while letting the
 * version checks continue to work.  DB_REPVERSION should be revved any time
 * the rest of the structure changes or when the message numbers change.
 *
 * Define also, the corresponding log versions that are tied to the
 * replication/release versions.  These are only needed in replication
 * and that is why they're defined here. db_printlog takes notice as well.
 *
 * Several releases share a version number (for example DB_LOGVERSION_50
 * and DB_LOGVERSION_51 are both 17, and DB_REPVERSION_47, _48 and _51 are
 * all 5), so a version value does not always identify a single release.
 * DB_LOGVERSION_MIN and DB_REPVERSION_MIN both name the 4.7 values;
 * DB_REPVERSION is the current replication version.
 */
#define	DB_LOGVERSION_42	8
#define	DB_LOGVERSION_43	10
#define	DB_LOGVERSION_44	11
#define	DB_LOGVERSION_45	12
#define	DB_LOGVERSION_46	13
#define	DB_LOGVERSION_47	14
#define	DB_LOGVERSION_48	15
#define	DB_LOGVERSION_48p2	16
#define	DB_LOGVERSION_50	17
#define	DB_LOGVERSION_51	17
#define	DB_LOGVERSION_52	18
#define	DB_LOGVERSION_53	19
#define	DB_LOGVERSION_60	20
#define	DB_LOGVERSION_60p1	21
#define	DB_LOGVERSION_61	22
#define	DB_LOGVERSION_62	23
#define	DB_LOGVERSION_MIN	DB_LOGVERSION_47
#define	DB_REPVERSION_INVALID	0
#define	DB_REPVERSION_44	3
#define	DB_REPVERSION_45	3
#define	DB_REPVERSION_46	4
#define	DB_REPVERSION_47	5
#define	DB_REPVERSION_48	5
#define	DB_REPVERSION_51	5
#define	DB_REPVERSION_52	6
#define	DB_REPVERSION_53	7
#define	DB_REPVERSION_60	7
#define	DB_REPVERSION_61	8
#define	DB_REPVERSION_62	9
#define	DB_REPVERSION		DB_REPVERSION_62
#define	DB_REPVERSION_MIN	DB_REPVERSION_47
170 
171 /*
172  * RPRINT - Replication diagnostic output
173  * VPRINT - Replication verbose output (superset of RPRINT).
174  * REP_PRINT_MESSAGE
175  *	Macros for verbose replication messages.
176  *
177  * Everything using RPRINT will go to the system diag file (if it
178  * is configured) and also to the user's verbose output if
179  * they have that verbose level configured.
180  * Messages using VPRINT do not ever go to the system diag file,
181  * but will go to the user's verbose output if configured.
182  *
183  * Use VPRINT for anything that might be printed on a standard,
184  * successful transaction.  Use RPRINT for error paths, rep
185  * state changes, elections, etc.
186  */
/*
 * Diagnostic file name template and size.  The "%02d" conversion in the
 * template takes an integer (presumably the REP diag_index value -- TODO
 * confirm).  MEGABYTE is defined outside this header.
 */
#define	REP_DIAGNAME	"__db.rep.diag%02d"
#define	REP_DIAGSIZE	MEGABYTE
/*
 * RPRINT(env, x): when the environment's verbose setting is non-zero,
 * call __rep_print_system with the parenthesized argument list x and
 * discard its result.  Otherwise do nothing.
 */
#define	RPRINT(env, x) do {						\
	if (0 != (env)->dbenv->verbose) {				\
		(void)__rep_print_system x;				\
	}								\
} while (0)
/*
 * VPRINT(env, x): when the environment's verbose setting is non-zero,
 * call __rep_print with the parenthesized argument list x and discard
 * its result.  Otherwise do nothing.
 */
#define	VPRINT(env, x) do {						\
	if (0 != (env)->dbenv->verbose) {				\
		(void)__rep_print x;					\
	}								\
} while (0)
/*
 * REP_PRINT_MESSAGE(env, eid, rp, str, fl): when the environment's
 * verbose setting is non-zero, forward all five arguments unchanged to
 * __rep_print_message.  Otherwise do nothing.
 */
#define	REP_PRINT_MESSAGE(env, eid, rp, str, fl) do {			\
	if (0 != (env)->dbenv->verbose) {				\
		__rep_print_message(env, eid, rp, str, fl);		\
	}								\
} while (0)
201 
/*
 * Election gen file name
 * The file contains an egen number for an election this client has NOT
 * participated in.  I.e. it is the number of a future election.  We
 * create it when we create the rep region, if it doesn't already exist
 * and initialize egen to 1.  If it does exist, we read it when we create
 * the rep region.  We write it immediately before sending our VOTE1 in
 * an election.  That way, if a client has ever sent a vote for any
 * election, the file is already going to be updated to reflect a future
 * election, should it crash.
 *
 * NOTE(review): the description above covers REP_EGENNAME.  REP_GENNAME
 * is presumably the corresponding file for the replication generation
 * number (gen) -- confirm against the code that reads and writes it.
 */
#define	REP_EGENNAME	"__db.rep.egen"
#define	REP_GENNAME	"__db.rep.gen"
215 
/*
 * Internal init flag file name:
 * The existence of this file serves as an indication that the client is in the
 * process of Internal Initialization, in case it crashes before completing.
 * During internal init the client's partially reconstructed database pages and
 * logs may be in an inconsistent state, so much so that running recovery must
 * be avoided.  Furthermore, there is no other way to reliably recognize this
 * condition.  Therefore, when we open an environment, and we're just about to
 * run recovery, we check for this file first.  If it exists we must discard all
 * logs and databases.  This avoids the recovery problems, and leads to a fresh
 * attempt at internal init if the environment becomes a replication client and
 * finds a master.  The list of databases which may need to be removed is stored
 * in this file.
 *
 * REP_INITVERSION is the current version of the file (3); the _46 and _47
 * constants name the two earlier versions.
 */
#define	REP_INITNAME	"__db.rep.init"
#define	REP_INITVERSION_46	1
#define	REP_INITVERSION_47	2
#define	REP_INITVERSION		3
234 
/*
 * View/partial replication file name.
 * The file is empty.  Its existence is a permanent indicator that this
 * environment can never be master.
 *
 * IS_VIEW_SITE(env) is true when replication is on for env (REP_ON) and
 * the st_view statistic in the rep region is non-zero.
 */
#define	REPVIEW		"__db.rep.view"
#define	IS_VIEW_SITE(env)						\
	(REP_ON(env) && 0 != (env)->rep_handle->region->stat.st_view)
244 
/*
 * Database types for __rep_client_dbinit.
 * The enumerator values are spelled out; they are the same values the
 * compiler assigns by default.
 */
typedef enum {
	REP_BLOB = 0,	/* Blob file database. */
	REP_DB = 1,	/* Log record database. */
	REP_PG = 2	/* Pg database. */
} repdb_t;
253 
/*
 * Macros to lock/unlock the replication region as a whole.
 * Both operate on the mtx_region mutex stored in the rep region reached
 * through env->rep_handle->region.
 */
#define	REP_SYSTEM_LOCK(env)						\
	MUTEX_LOCK(env, (env)->rep_handle->region->mtx_region)
#define	REP_SYSTEM_UNLOCK(env)						\
	MUTEX_UNLOCK(env, (env)->rep_handle->region->mtx_region)
259 
/*
 * Macros for manipulating the event synchronization.  We use a separate mutex
 * so that an application's call-back function can be invoked without locking
 * the whole region.
 *
 * Both operate on the mtx_event mutex stored in the rep region reached
 * through env->rep_handle->region.
 */
#define	REP_EVENT_LOCK(env)						\
	MUTEX_LOCK(env, (env)->rep_handle->region->mtx_event)
#define	REP_EVENT_UNLOCK(env)						\
	MUTEX_UNLOCK(env, (env)->rep_handle->region->mtx_event)
269 
/*
 * Synchronization states
 * Please change __rep_syncstate_to_string (rep_stat.c) to track any changes
 * made to these states.
 *
 * Apart from OFF, which comes first, the states are listed alphabetically.
 * A full internal init usually progresses through them in this order:
 * VERIFY, UPDATE, PAGE, LOG (then back to OFF when we're done).
 *
 * The enumerator values are spelled out; they are the same values the
 * compiler assigns by default.
 */
typedef enum {
	SYNC_OFF = 0,		/* No recovery. */
	SYNC_LOG = 1,		/* Recovery - log. */
	SYNC_PAGE = 2,		/* Recovery - pages and blobs. */
	SYNC_UPDATE = 3,	/* Recovery - update. */
	SYNC_VERIFY = 4		/* Recovery - verify. */
} repsync_t;
286 
/*
 * A record of the contents of the VOTE1 msg we sent out at current egen, in
 * case we need to send out a duplicate VOTE1 to a late-joining client in a full
 * election.  The nsites, nvotes, and priority fields of the REP struct can't be
 * used, because those could change.  It's only safe to send out a dup if we
 * send out the exact same info.
 *
 * Every field is therefore the value as it was sent in that VOTE1.
 */
typedef struct {
	DB_LSN lsn;		/* LSN as sent. */
	u_int32_t nsites;	/* nsites as sent (REP.nsites may change). */
	u_int32_t nvotes;	/* nvotes as sent (REP.nvotes may change). */
	u_int32_t priority;	/* priority as sent (REP.priority may change). */
	u_int32_t tiebreaker;	/* Tiebreaker value as sent. */
	u_int32_t ctlflags;	/* ctlflags as sent. */
	u_int32_t data_gen;	/* data_gen as sent. */
} VOTE1_CONTENT;
303 
/*
 * In a 2 node scenario, it is important that the newly joining node wait
 * for the peer connection to be fully established before proceeding to an
 * election. Otherwise, it might end up electing itself as MASTER with its
 * own vote (in spite of the fact that a fully functional master exists).
 * Normally it takes about 5-20ms for the connection to go through. Setting
 * the timeout to 500ms to account for connection time variations associated
 * with SSL.
 *
 * The value is expressed in microseconds (500000 us == 500 ms).
 */
#define DB_REP_PEER_WAIT_TIMEOUT 500000
314 
/*
 * REP --
 * Shared replication structure.
 *
 * Holds the region mutexes, election, lease and internal-init state,
 * statistics, and four flag words (config, elect_flags, lockout_flags
 * and flags).  The bit values for each flag word are defined inline,
 * immediately above the field that stores them.
 */
typedef struct __rep { /* SHARED */
	db_mutex_t	mtx_region;	/* Region mutex. */
	db_mutex_t	mtx_clientdb;	/* Client database mutex. */
	db_mutex_t	mtx_ckp;	/* Checkpoint mutex. */
	db_mutex_t	mtx_diag;	/* Diagnostic message mutex. */
	db_mutex_t	mtx_repstart;	/* Role change mutex. */
	int		diag_index;	/* Diagnostic file index. */
	off_t		diag_off;	/* Diagnostic message offset. */
	roff_t		lease_off;	/* Offset of the lease table. */
	roff_t		tally_off;	/* Offset of the tally region. */
	roff_t		v2tally_off;	/* Offset of the vote2 tally region. */
	int		eid;		/* Environment id. */
	int		master_id;	/* ID of the master site. */
	u_int32_t	version;	/* Current replication version. */
	u_int32_t	egen;		/* Replication election generation. */
	u_int32_t	spent_egen;	/* Egen satisfied by rep_elect call. */
	u_int32_t	gen;		/* Replication generation number. */
	u_int32_t	mgen;		/* Master gen seen by client. */
	u_int32_t	asites;		/* Space allocated for sites. */
	u_int32_t	nsites;		/* Number of sites in group. */
	u_int32_t	nvotes;		/* Number of votes needed. */
	u_int32_t	priority;	/* My priority in an election. */
	u_int32_t	config_nsites;

	db_timeout_t	elect_timeout;	/* Normal/full election timeouts. */
	db_timeout_t	full_elect_timeout;

	db_timeout_t	chkpt_delay;	/* Master checkpoint delay. */

#define	REP_DEFAULT_THROTTLE	(10 * MEGABYTE) /* Default value is < 1Gig. */
	u_int32_t	gbytes;		/* Limit on data sent in single... */
	u_int32_t	bytes;		/* __rep_process_message call. */
#define	DB_REP_REQUEST_GAP	40000	/* 40 msecs */
#define	DB_REP_MAX_GAP		1280000	/* 1.28 seconds */
	db_timespec	request_gap;	/* Minimum time to wait before we
					 * request a missing log record. */
	db_timespec	max_gap;	/* Maximum time to wait before
					 * requesting a missing log record. */
	u_int32_t	initsites;	/* Number of sites for
					 * set_memory_init(). */
	/* Status change information */
	u_int32_t	apply_th;	/* Number of callers in rep_apply. */
	u_int32_t	arch_th;	/* Number of callers in log_archive. */
	u_int32_t	elect_th;	/* Elect threads in lock-out. */
	u_int32_t	msg_th;		/* Number of callers in rep_proc_msg.*/
	u_int32_t	handle_cnt;	/* Count of handles in library. */
	u_int32_t	op_cnt;		/* Multi-step operation count.*/
	DB_LSN		ckp_lsn;	/* LSN for syncing a checkpoint. */
	DB_LSN		max_prep_lsn;	/* Max LSN of txn_prepare record. */

	/*
	 * Event notification synchronization: the mtx_event and associated
	 * fields which it protects govern event notification to the
	 * application.  They form a guarantee that no matter how crazy the
	 * thread scheduling gets, the application sees a sensible, orderly
	 * progression of events.
	 */
	db_mutex_t	mtx_event;	/* Serializes event notification. */
	/*
	 * Latest generation whose NEWMASTER event the application has been
	 * notified of.  Also serves to force STARTUPDONE to occur after
	 * NEWMASTER.
	 */
	u_int32_t	newmaster_event_gen;
	/*
	 * Latest local victory of an election that the application has been
	 * notified of, expressed as the election generation number.  This
	 * ensures we notify the application exactly once when it wins an
	 * election.
	 */
	u_int32_t	notified_egen;

	/* Internal init information. */
	u_int32_t	nfiles;		/* Number of files we have info on. */
	u_int32_t	curfile;	/* Cur file we're getting (0-based). */
	roff_t		originfo_off;	/* Offset of original file info. */
	u_int32_t	infolen;	/* Remaining length file info buffer. */
	u_int32_t	originfolen;	/* Original length file info buffer. */
	u_int32_t	infoversion;	/* Original file info version. */
	DB_LSN		first_lsn;	/* Earliest LSN we need. */
	u_int32_t	first_vers;	/* Log version of first log file. */
	DB_LSN		last_lsn;	/* Latest LSN we need. */
	/* These are protected by mtx_clientdb. */
	db_seq_t	gap_bl_hi_id;	/* Last id in the blob gap. */
	db_seq_t	gap_bl_hi_sid;	/* Last sid in the blob gap. */
	off_t		gap_bl_hi_off;	/* Last offset in the blob gap. */
	db_seq_t	last_blob_id;	/* Last id on the list to process. */
	db_seq_t	last_blob_sid;	/* Last sid on the list to process. */
	db_seq_t	prev_blob_id;	/* Previous last id on list. */
	db_seq_t	prev_blob_sid;	/* Previous last sid on list. */
	db_seq_t	highest_id;	/* Highest file id to request. */
	u_int32_t	blob_more_files;/* More blob files to be processed. */
	int		blob_sync;	/* Currently handling blobs. */
	db_timespec	last_pg_ts;	/* Last page stored timestamp. */
	db_pgno_t	ready_pg;	/* Next pg expected. */
	db_pgno_t	waiting_pg;	/* First pg after gap. */
	db_pgno_t	max_wait_pg;	/* Maximum pg requested. */
	u_int32_t	npages;		/* Num of pages rcvd for this file. */
	roff_t		curinfo_off;	/* Offset of current file info. */
					/* Always access with GET_CURINFO(). */

	/* Vote tallying information. */
	u_int32_t	sites;		/* Sites heard from. */
	int		winner;		/* Current winner EID. */
	u_int32_t	w_priority;	/* Winner priority. */
	u_int32_t	w_gen;		/* Winner generation. */
	u_int32_t	w_datagen;	/* Winner data generation. */
	DB_LSN		w_lsn;		/* Winner LSN. */
	u_int32_t	w_tiebreaker;	/* Winner tiebreaking value. */
	u_int32_t	votes;		/* Number of votes for this site. */

	VOTE1_CONTENT	vote1;		/* Valid until rep->egen changes. */

	db_timespec	etime;		/* Election start timestamp. */
	int		full_elect;	/* Is current election a "full" one? */

	/* Leases. */
	db_timeout_t	lease_timeout;	/* Lease timeout. */
	db_timespec	lease_duration;	/* Lease timeout with clock skew. */
	u_int32_t	clock_skew;	/* Clock skew. */
	u_int32_t	clock_base;	/* Clock scale factor base. */
	db_timespec	grant_expire;	/* Local grant expiration time. */

	/* Cached LSN history, matching current gen. */
	DB_LSN		gen_base_lsn;	/* Base LSN of current generation. */
	u_int32_t	master_envid;	/* Current master's "unique" env ID. */

	SH_TAILQ_HEAD(__wait) waiters;	/* List of threads in txn_applied(). */
	SH_TAILQ_HEAD(__wfree) free_waiters;/* Free list of waiter structs. */

#ifdef HAVE_REPLICATION_THREADS
	/*
	 * Replication Framework (repmgr) shared config information.
	 */
	db_mutex_t	mtx_repmgr;	/* Region mutex. */
	roff_t		siteinfo_off;	/* Offset of site array region. */
	u_int		site_cnt;	/* Array slots in use. */
	u_int		site_max;	/* Total array slots allocated. */
	u_int		sites_avail;	/* Total number of available sites. */
	int		self_eid;	/* Where to find the local site. */
	u_int		siteinfo_seq;	/* Number of updates to this info. */
	u_int32_t	min_log_file;	/* Earliest log needed by repgroup. */
	DB_LSN		last_ckp_lsn;	/* The last checkpoint's ckp_lsn. */

	pid_t		listener;
	u_int		listener_nthreads; /* # of msg threads in listener. */

	int		perm_policy;
	db_timeout_t	ack_timeout;
	db_timeout_t	election_retry_wait;
	db_timeout_t	connection_retry_wait;
	db_timeout_t	heartbeat_frequency; /* Max period between msgs. */
	db_timeout_t	heartbeat_monitor_timeout;
	db_timeout_t	write_forward_timeout;
	u_int32_t	inqueue_max_gbytes;
	u_int32_t	inqueue_max_bytes;
	u_int32_t	inqueue_rz_gbytes;
	u_int32_t	inqueue_rz_bytes;
	u_int32_t	inqueue_full_event_on;
#endif  /* HAVE_REPLICATION_THREADS */

	/* Statistics. */
	DB_REP_STAT	stat;
#if defined(HAVE_REPLICATION_THREADS) && defined(HAVE_STATISTICS)
	DB_REPMGR_STAT	mstat;
#endif

	/*
	 * Please change __rep_print_all (rep_stat.c) to track any changes made
	 * to all these flag families below.
	 */
	/* Configuration. */
#define	REP_C_2SITE_STRICT	0x00001		/* Don't cheat on elections. */
#define	REP_C_AUTOINIT		0x00002		/* Auto initialization. */
#define	REP_C_AUTOROLLBACK	0x00004		/* Discard client txns: sync. */
#define	REP_C_AUTOTAKEOVER	0x00008		/* Auto listener take over. */
#define	REP_C_BULK		0x00010		/* Bulk transfer. */
#define	REP_C_DELAYCLIENT	0x00020		/* Delay client sync-up. */
#define	REP_C_DISABLE_POLL	0x00040		/* Avoid POLL for network io */
#define	REP_C_DISABLE_SSL	0x00080		/* Disable SSL -- TODO confirm;
						 * the previous comment was a
						 * copy of DISABLE_POLL's. */
#define	REP_C_ELECT_LOGLENGTH	0x00100		/* Log length wins election. */
#define	REP_C_ELECTIONS		0x00200		/* Repmgr to use elections. */
#define	REP_C_ENABLE_EPOLL	0x00400		/* Use EPOLL for network io */
#define	REP_C_FORWARD_WRITES	0x00800		/* Repmgr write forwarding. */
#define	REP_C_INMEM		0x01000		/* In-memory replication. */
#define	REP_C_LEASE		0x02000		/* Leases configured. */
#define	REP_C_NOWAIT		0x04000		/* Immediate error return. */
#define	REP_C_PREFMAS_CLIENT	0x08000		/* Preferred master client. */
#define	REP_C_PREFMAS_MASTER	0x10000		/* Preferred master site. */
	u_int32_t	config;		/* Configuration flags. */

	/* Election. */
#define	REP_E_PHASE0		0x00000001	/* In phase 0 of election. */
#define	REP_E_PHASE1		0x00000002	/* In phase 1 of election. */
#define	REP_E_PHASE2		0x00000004	/* In phase 2 of election. */
#define	REP_E_TALLY		0x00000008	/* Tallied vote before elect. */
#define	REP_E_PEER_CONN_WAIT	0x00000010	/* Election waiting for peer. */
	u_int32_t	elect_flags;	/* Election flags. */

	/* Lockout. */
#define	REP_LOCKOUT_API		0x00000001	/* BDB API - handle_cnt. */
#define	REP_LOCKOUT_APPLY	0x00000002	/* apply msgs - apply_th. */
#define	REP_LOCKOUT_ARCHIVE	0x00000004	/* log_archive. */
#define	REP_LOCKOUT_MSG		0x00000008	/* Message process - msg_th. */
#define	REP_LOCKOUT_OP		0x00000010	/* BDB ops txn,curs - op_cnt. */
	u_int32_t	lockout_flags;	/* Lockout flags. */

	/* See above for enumerated sync states. */
	repsync_t	sync_state;	/* Recovery/synchronization flags. */

	/*
	 * When adding a new flag value, consider whether it should be
	 * cleared in rep_start() when starting as a master or a client.
	 */
#define	REP_F_ABBREVIATED	0x00000001	/* Recover NIMDB pages only. */
#define	REP_F_APP_BASEAPI	0x00000002	/* Base API application. */
#define	REP_F_APP_REPMGR	0x00000004	/* repmgr application. */
#define	REP_F_CLIENT		0x00000008	/* Client replica. */
#define	REP_F_DELAY		0x00000010	/* Delaying client sync-up. */
#define	REP_F_GROUP_ESTD	0x00000020	/* Rep group is established. */
#define	REP_F_HOLD_GEN		0x00000040	/* PrefMas startup hold gen. */
#define	REP_F_INUPDREQ		0x00000080	/* Thread in rep_update_req. */
#define	REP_F_LEASE_EXPIRED	0x00000100	/* Leases guaranteed expired. */
#define	REP_F_MASTER		0x00000200	/* Master replica. */
#define	REP_F_MASTERELECT	0x00000400	/* Master elect. */
#define	REP_F_NEWFILE		0x00000800	/* Newfile in progress. */
#define	REP_F_NIMDBS_LOADED	0x00001000	/* NIMDBs are materialized. */
#define	REP_F_READONLY_MASTER	0x00002000	/* PrefMas readonly master. */
#define	REP_F_SKIPPED_APPLY	0x00004000	/* Skipped applying a record. */
#define	REP_F_START_CALLED	0x00008000	/* Rep_start called. */
#define	REP_F_SYS_DB_OP		0x00010000	/* Operation in progress. */
	u_int32_t	flags;		/* REP_F_* flags defined above. */
} REP;
552 
/*
 * Why a thread is waiting in txn_applied().
 * The enumerator values are spelled out; they are the same values the
 * compiler assigns by default.
 */
typedef enum {
	AWAIT_GEN = 0,		/* Client's gen is behind token gen. */
	AWAIT_HISTORY = 1,	/* Haven't received master's LSN db update. */
	AWAIT_LSN = 2,		/* Awaiting replication of user txn. */
	AWAIT_NIMDB = 3,	/* LSN db missing: maybe it's INMEM. */
	LOCKOUT = 4		/* Thread awoken due to pending lockout. */
} rep_waitreason_t;
561 
/*
 * What a waiting thread is waiting for: the reason, plus the value that
 * goes with it.  Which union member is meaningful depends on "why"; the
 * member comments below list the reasons each one serves.
 */
struct rep_waitgoal {
	rep_waitreason_t	why;	/* Reason for the wait. */
	union {
		DB_LSN	lsn;	/* For AWAIT_LSN and AWAIT_HISTORY. */
		u_int32_t gen;	/* AWAIT_GEN */
	} u;
};
569 
/*
 * One waiter record.  The REP structure keeps two lists of these:
 * "waiters" (threads in txn_applied()) and "free_waiters" (the free list).
 */
struct __rep_waiter {
	db_mutex_t	mtx_repwait; /* Self-blocking mutex. */
	struct rep_waitgoal	goal; /* What this waiter is waiting for. */
	SH_TAILQ_ENTRY	links;	     /* On either free or waiting list. */

#define	REP_F_PENDING_LOCKOUT	0x00000001
#define	REP_F_WOKEN		0x00000002
	u_int32_t	flags;	     /* REP_F_PENDING_LOCKOUT, REP_F_WOKEN. */
};
579 
/*
 * Macros to check and clear the BDB lockouts.  Currently they are
 * locked out/set individually because they pertain to different pieces of
 * the BDB API; they are otherwise always checked and cleared together.
 *
 * "The BDB lockouts" are the REP_LOCKOUT_API and REP_LOCKOUT_OP bits of
 * the lockout_flags field.  R is a pointer to the REP structure.
 */
#define	ISSET_LOCKOUT_BDB(R)						\
    (FLD_ISSET((R)->lockout_flags, (REP_LOCKOUT_API | REP_LOCKOUT_OP)))

#define	CLR_LOCKOUT_BDB(R)						\
    (FLD_CLR((R)->lockout_flags, (REP_LOCKOUT_API | REP_LOCKOUT_OP)))
590 
/*
 * Reset all recovery settings in one step: the BDB lockouts
 * (REP_LOCKOUT_API and REP_LOCKOUT_OP) are cleared and sync_state goes
 * back to SYNC_OFF.  This must change if the set of recovery-related
 * flags or states changes.  NOTE:  REP_LOCKOUT_MSG is deliberately not
 * touched here because it is used frequently in non-recovery related
 * areas and we want to manipulate it separately (see especially
 * __rep_new_master).
 */
#define	CLR_RECOVERY_SETTINGS(R)					\
do {									\
	CLR_LOCKOUT_BDB(R);						\
	(R)->sync_state = SYNC_OFF;					\
} while (0)
604 
/*
 * True when the rep region is in any sync state other than SYNC_OFF, or
 * when either BDB lockout is set.
 */
#define	IS_REP_RECOVERING(R)						\
    (SYNC_OFF != (R)->sync_state || ISSET_LOCKOUT_BDB(R))
607 
/*
 * REP_E_PHASE0 is not a *real* election phase.  It is used for
 * master leases and allowing the client to find the master or
 * expire its lease.  That is why IN_ELECTION tests only PHASE1 and
 * PHASE2; IN_ELECTION_TALLY additionally accepts REP_E_TALLY.
 *
 * ELECTION_MAJORITY(n) is one more than half of n, using integer
 * division.
 */
#define	IN_ELECTION(R)							\
	FLD_ISSET((R)->elect_flags, REP_E_PHASE1 | REP_E_PHASE2)
#define	IN_ELECTION_TALLY(R) \
	FLD_ISSET((R)->elect_flags, REP_E_PHASE1 | REP_E_PHASE2 | REP_E_TALLY)
#define	ELECTION_MAJORITY(n) (1 + ((n) / 2))
618 
/* True when the sync state is SYNC_LOG or SYNC_PAGE. */
#define	IN_INTERNAL_INIT(R) \
	(SYNC_LOG == (R)->sync_state || SYNC_PAGE == (R)->sync_state)
621 
/*
 * Role and configuration predicates.  Each one first requires REP_ON(env)
 * and only then looks at the rep region:
 *   IS_REP_MASTER	- REP_F_MASTER is set in the region flags.
 *   IS_REP_CLIENT	- REP_F_CLIENT is set in the region flags.
 *   IS_REP_STARTED	- REP_F_START_CALLED is set in the region flags.
 *   IS_USING_LEASES	- REP_C_LEASE is set in the region config.
 *   IS_CLIENT_PGRECOVER - IS_REP_CLIENT and sync_state is SYNC_PAGE.
 */
#define	IS_REP_MASTER(env)						\
	(REP_ON(env) && F_ISSET((env)->rep_handle->region, REP_F_MASTER))

#define	IS_REP_CLIENT(env)						\
	(REP_ON(env) && F_ISSET((env)->rep_handle->region, REP_F_CLIENT))

#define	IS_REP_STARTED(env)						\
	(REP_ON(env) &&							\
	    F_ISSET((env)->rep_handle->region, REP_F_START_CALLED))

#define	IS_USING_LEASES(env)						\
	(REP_ON(env) &&							\
	    FLD_ISSET((env)->rep_handle->region->config, REP_C_LEASE))

#define	IS_CLIENT_PGRECOVER(env)					\
	(IS_REP_CLIENT(env) &&						\
	    SYNC_PAGE == (env)->rep_handle->region->sync_state)
641 
/*
 * Macros to figure out if we need to do replication pre/post-amble processing.
 * Skip for specific DB handles owned by the replication layer, either because
 * replication is running recovery or because it's a handle entirely owned by
 * the replication code (replication opens its own databases to track state).
 *
 * REP_FLAGS_SET is true when any of the region's flags, elect_flags or
 * lockout_flags words is non-zero; IS_ENV_REPLICATED additionally
 * requires REP_ON(env), which is tested first.
 */
#define	REP_FLAGS_SET(env)						\
	(0 != (env)->rep_handle->region->flags ||			\
	    0 != (env)->rep_handle->region->elect_flags ||		\
	    0 != (env)->rep_handle->region->lockout_flags)

#define	IS_ENV_REPLICATED(env)						\
	(REP_ON(env) && REP_FLAGS_SET(env))
655 
/*
 * Update the temporary log archive block timer.
 *
 * While holding the rep system lock, set DB_REGENV_REPLOCKED on renv and
 * store the current time (time()) in renv->op_timestamp.  The return
 * value of time() is ignored.
 */
#define	MASTER_UPDATE(env, renv) do {					\
	REP_SYSTEM_LOCK(env);						\
	F_SET((renv), DB_REGENV_REPLOCKED);				\
	(void)time(&(renv)->op_timestamp);				\
	REP_SYSTEM_UNLOCK(env);						\
} while (0)
665 
/*
 * Macro to set a new generation number.  Cached values from the LSN history
 * database are associated with the current gen, so when the gen changes we must
 * invalidate the cache.  Use this macro for all gen changes, to avoid
 * forgetting to do so.  This macro should be used while holding the rep system
 * mutex (unless we know we're single-threaded for some other reason, like at
 * region create time).
 *
 * NOTE: the macro takes only the new generation value; it refers to a
 * variable named "rep" that must be in scope at the point of use.  It
 * assigns rep->gen and zeroes rep->gen_base_lsn.
 */
#define	SET_GEN(g) do {							\
	rep->gen = (g);							\
	ZERO_LSN(rep->gen_base_lsn);					\
} while (0)
678 
/*
 * Macros to determine current replication configuration options.
 *
 * REP_CONFIG_IS_SET tests "flags" against the config word in the rep
 * region when REP_ON(env) is true, and against the config word in the
 * rep_handle itself otherwise.
 */
#define	REP_CONFIG_IS_SET(env, flags)					\
	(REP_ON(env) ?							\
	FLD_ISSET(((env)->rep_handle->region)->config, flags) :	\
	FLD_ISSET(((env)->rep_handle)->config, flags))
/*
 * PREFMAS_IS_SET is true when either preferred master config flag is set.
 * Without HAVE_REPLICATION_THREADS it is the constant 0.
 */
#ifdef HAVE_REPLICATION_THREADS
#define	PREFMAS_IS_SET(env)						\
	(REP_CONFIG_IS_SET(env,					\
	(REP_C_PREFMAS_MASTER | REP_C_PREFMAS_CLIENT)))
#else
#define	PREFMAS_IS_SET(env)	0
#endif
/*
 * IS_PREFMAS_MODE requires REP_ON(env), a preferred master config flag,
 * and a config_nsites value below 3 in the rep region.
 */
#define	IS_PREFMAS_MODE(env)						\
	(REP_ON(env) && PREFMAS_IS_SET(env) &&				\
	((env)->rep_handle->region)->config_nsites < 3)
694 
/*
 * Gap processing flags.  These provide control over the basic
 * gap processing algorithm for some special cases.
 * The two values are distinct bits.
 */
#define	REP_GAP_FORCE		0x001	/* Force a request for a gap. */
#define	REP_GAP_REREQUEST	0x002	/* Gap request is a forced rerequest. */
					/* REREQUEST is a superset of FORCE. */

/*
 * Internal options for rep_start_int().  These are used by preferred master
 * mode to help coordinate between the sites during changes of master.
 * The three values are distinct bits.
 */
#define	REP_START_FORCE_ROLECHG	0x001	/* Force role change to advance gen. */
#define	REP_START_HOLD_CLIGEN	0x002	/* Hold client gen before doing
					 * lsnhist match. */
#define	REP_START_WAIT_LOCKMSG	0x004	/* Wait for REP_LOCKOUT_MSG. */

/*
 * Flags indicating what kind of record we want to back up to, in the log.
 * The three values are distinct bits.
 */
#define	REP_REC_COMMIT		0x001   /* Most recent commit record. */
#define	REP_REC_PERM		0x002   /* Most recent perm record. */
					/* PERM is a superset of COMMIT. */
#define	REP_REC_PERM_DEL	0x004   /* Most recent PERM, or fail if a
					 * file delete is found first. */
720 
/*
 * Permanent record types: true when rectype is DB___txn_regop or
 * DB___txn_ckp.
 */
#define	IS_PERM_RECTYPE(rectype)					\
    (DB___txn_regop == (rectype) || DB___txn_ckp == (rectype))
726 
/*
 * Basic pre/post-amble processing.
 *
 * If the environment is replicated, call __env_rep_enter first; a
 * non-zero result is stored in ret and func_call is not evaluated.
 * Otherwise ret receives the value of func_call.  In the replicated
 * case __env_db_rep_exit is then called, and its result replaces ret
 * only if it is non-zero and ret is still zero.
 */
#define	REPLICATION_WRAP(env, func_call, checklock, ret) do {		\
	int __rep_check, __t_ret;					\
									\
	__rep_check = IS_ENV_REPLICATED(env) ? 1 : 0;			\
	if (__rep_check)						\
		(ret) = __env_rep_enter(env, checklock);		\
	else								\
		(ret) = 0;						\
	if ((ret) == 0) {						\
		(ret) = func_call;					\
		if (__rep_check) {					\
			__t_ret = __env_db_rep_exit(env);		\
			if (__t_ret != 0 && (ret) == 0)			\
				(ret) = __t_ret;			\
		}							\
	}								\
} while (0)
741 
/*
 * Macro to safely access curinfo and its internal DBT pointers from
 * any process.  This should always be used to access curinfo.  If
 * the internal DBT pointers are to be used, mtx_clientdb must be held
 * between the time of this call and the use of the pointers.
 *
 * The current file information (curinfo) is stored in shared region
 * memory and accessed via an offset.  It contains DBTs that themselves
 * point to allocated data.  __rep_nextfile() manages this information in a
 * single chunk of shared memory.
 *
 * If different processes access curinfo, they may have different shared
 * region addresses.  This means that curinfo and its pointers to DBT data
 * must be recalculated for each process starting with the offset.
 *
 * Arguments:
 *	rep	- pointer to the REP structure; only curinfo_off is read.
 *	infop	- region info passed through to R_ADDR.
 *	curinfo	- lvalue that receives the address of the file info.
 *
 * The uid, info and dir data follow the __rep_fileinfo_args structure in
 * that order, each occupying "size" bytes; a DBT whose size is zero gets
 * a NULL data pointer.  Every macro argument is parenthesized at each
 * use, so an expression such as "&rep_struct" may be passed for rep.
 * The arguments are evaluated more than once and must not have side
 * effects.
 */
#define	GET_CURINFO(rep, infop, curinfo)				\
do {									\
	(curinfo) = R_ADDR(infop, (rep)->curinfo_off);			\
	if ((curinfo)->uid.size > 0)					\
		(curinfo)->uid.data = R_ADDR(infop,			\
		    (rep)->curinfo_off + sizeof(__rep_fileinfo_args));	\
	else								\
		(curinfo)->uid.data = NULL;				\
	if ((curinfo)->info.size > 0)					\
		(curinfo)->info.data = R_ADDR(infop,			\
		    (rep)->curinfo_off +				\
		    sizeof(__rep_fileinfo_args) + (curinfo)->uid.size);	\
	else								\
		(curinfo)->info.data = NULL;				\
	if ((curinfo)->dir.size > 0)					\
		(curinfo)->dir.data = R_ADDR(infop,			\
		    (rep)->curinfo_off +				\
		    sizeof(__rep_fileinfo_args) + (curinfo)->uid.size +	\
		    (curinfo)->info.size);				\
	else								\
		(curinfo)->dir.data = NULL;				\
} while (0)
777 
#if defined(HAVE_REPMGR_SSL)

/*
 * SSL settings for repmgr.  The type exists only when HAVE_REPMGR_SSL is
 * defined.
 *
 * NOTE(review): the field descriptions below are inferred from the field
 * names only; confirm them against the code that fills in this structure.
 */
typedef struct __repmgr_ssl_config {
	/* Presumably the path of the CA certificate file. */
	char	*repmgr_ca_cert_file;
	/* Presumably a directory of CA certificates. */
	char	*repmgr_ca_dir;
	/* Presumably the path of this site's certificate file. */
	char	*repmgr_cert_file;
	/* Presumably the path of this site's private key file. */
	char	*repmgr_key_file;
	/* Presumably the password protecting the private key file. */
	char	*repmgr_key_file_passwd;
	/* Presumably the maximum certificate chain verification depth. */
	u_int	repmgr_ssl_verify_depth;
} REPMGR_SSL_CONFIG;

#endif
790 
791 /*
792  * Per-process replication structure.
793  *
794  * There are 2 mutexes used in the Base replication API.  (See LOCK_MUTEX in
795  * repmgr.h for a discussion of repmgr.)
796  * 1.  mtx_region - This protects the fields of the rep region above.
797  * 2.  mtx_clientdb - This protects the per-process flags, and bookkeeping
798  * database and all of the components that maintain it.  Those
799  * components include the following fields in the log region (see log.h):
800  *	a. ready_lsn
801  *	b. waiting_lsn
802  *	c. verify_lsn
803  *	d. wait_recs
804  *	e. rcvd_recs
805  *	f. max_wait_lsn
806  * These fields in the log region are NOT protected by the log region lock at
807  * all.
808  *
 * Note that the per-process flags should truly be protected by a special
 * per-process thread mutex, but they are currently set in so isolated a manner
 * that it didn't make sense to do so, and in most cases we're already holding
 * the mtx_clientdb anyway.
813  *
814  * The lock ordering protocol is that mtx_clientdb must be acquired first and
815  * then either REP->mtx_region, or the LOG->mtx_region mutex may be acquired if
816  * necessary.
817  *
818  * Note that the appropriate mutex is needed any time one or more related
819  * values are read or written that could possibly use more than one atomic
820  * machine instruction.  A single 32-bit integer value is safe without a
821  * mutex, but most other types of value should use a mutex.
822  *
823  * Use of a db_mutex_t mutex must be inside a matched pair of ENV_ENTER() and
824  * ENV_LEAVE() macros.  This ensures that if a thread dies while holding
825  * a lock (i.e. a mutex), recovery can clean it up so that it does not
826  * indefinitely block other threads.
827  */
struct __db_rep {
	/*
	 * Shared configuration information -- copied to and maintained in the
	 * shared region as soon as the shared region is created.
	 */
	int		eid;		/* Environment ID. */

	u_int32_t	gbytes;		/* Limit on data sent in single... */
	u_int32_t	bytes;		/* __rep_process_message call. */

	db_timespec	request_gap;	/* Minimum time to wait before we
					 * request a missing log record. */
	db_timespec	max_gap;	/* Maximum time to wait before
					 * requesting a missing log record. */

	u_int32_t	clock_skew;	/* Clock skew factor. */
	u_int32_t	clock_base;	/* Clock skew base. */
	u_int32_t	config;		/* Configuration flags. */
	/* NOTE(review): presumably the configured number of sites; confirm. */
	u_int32_t	config_nsites;

	db_timeout_t	elect_timeout;	/* Normal/full election timeouts. */
	db_timeout_t	full_elect_timeout;

	db_timeout_t	chkpt_delay;	/* Master checkpoint delay. */

	/* NOTE(review): presumably this site's election priority; confirm. */
	u_int32_t	my_priority;
	db_timeout_t	lease_timeout;	/* Master leases. */
	/*
	 * End of shared configuration information.
	 */
	int		(*partial)	/* View/partial replication function. */
			    __P((DB_ENV *, const char *, int *, u_int32_t));

	int		(*send)		/* Send function. */
			    __P((DB_ENV *, const DBT *, const DBT *,
			    const DB_LSN *, int, u_int32_t));

	DB		*rep_db;	/* Bookkeeping database. */
	DB		*lsn_db;	/* (Replicated) LSN history database. */
	db_mutex_t	mtx_lsnhist;	/* LSN history database mutex. */

	REP		*region;	/* In memory structure. */
	u_int8_t	*bulk;		/* Shared memory bulk area. */

	/* Number of diagnostic file handles kept in diagfile[]. */
#define	DBREP_DIAG_FILES	2
	DB_FH		*diagfile[DBREP_DIAG_FILES];	/* Diag files fhp. */
	off_t		diag_off;	/* Current diag file offset. */

	/* These are protected by mtx_clientdb. */
	DB_MPOOLFILE	*file_mpf;	/* Mpoolfile for current database. */
	DB		*file_dbp;	/* This file's page info. */
	DBC		*queue_dbc;	/* Dbc for a queue file. */
	DB		*blob_dbp;	/* Blob file database. */

	/*
	 * Please change __rep_print_all (rep_stat.c) to track any changes made
	 * to these flags.
	 */
#define	DBREP_APP_BASEAPI	0x0001	/* Base API application. */
#define	DBREP_APP_REPMGR	0x0002	/* repmgr application. */
#define	DBREP_OPENFILES		0x0004	/* This handle has opened files. */
	u_int32_t	flags;		/* per-process flags. */

#ifdef HAVE_REPLICATION_THREADS
	/*
	 * Replication Framework (repmgr) per-process information.
	 *
	 * Everything from here to the matching #endif exists only when
	 * HAVE_REPLICATION_THREADS is defined.
	 */
	int		config_nthreads;/* Configured msg processing threads. */
	u_int		nthreads;	/* Msg processing threads. */
	u_int		athreads;	/* Space allocated for msg threads. */
	u_int		non_rep_th;	/* Threads in GMDB or channel msgs. */
	u_int		aelect_threads; /* Space allocated for elect threads. */
	/*
	 * NOTE(review): the policy, timeout and input-queue limit fields
	 * below that carry no comment appear, from their names, to be repmgr
	 * configuration values; confirm against the code that sets them.
	 */
	u_int32_t	init_policy;
	int		perm_policy;
	DB_LSN		perm_lsn; /* Last perm LSN we've announced. */
	db_timeout_t	ack_timeout;
	db_timeout_t	election_retry_wait;
	db_timeout_t	connection_retry_wait;
	db_timeout_t	heartbeat_frequency; /* Max period between msgs. */
	db_timeout_t	heartbeat_monitor_timeout;
	db_timeout_t	write_forward_timeout;
	u_int32_t	inqueue_max_gbytes;
	u_int32_t	inqueue_max_bytes;

	/* Thread synchronization. */
	REPMGR_RUNNABLE *selector, **messengers, **elect_threads;
	REPMGR_RUNNABLE	*preferred_elect_thr;
	REPMGR_RUNNABLE	*takeover_thread;
	db_timespec	repstart_time;
	mgr_mutex_t	*mutex;
	cond_var_t	check_election, gmdb_idle, msg_avail;
	waiter_t	ack_waiters; /* For threads awaiting PERM acks. */
	/*
	 * Platform-specific members: Windows builds (DB_WIN32) hold a single
	 * HANDLE, all other builds hold a pair of pipe descriptors.
	 * NOTE(review): presumably used to wake up a waiting thread; confirm.
	 */
#ifdef DB_WIN32
	HANDLE		signaler;
#else
	int		read_pipe, write_pipe;
#endif

	/* Operational stuff. */
	REPMGR_SITE	*sites;		/* Array of known sites. */
	u_int		site_cnt;	/* Array slots in use. */
	u_int		site_max;	/* Total array slots allocated. */
	int		self_eid;	/* Where to find the local site. */
	u_int		siteinfo_seq;	/* Last known update to this list. */

	/*
	 * The connections list contains only those connections not actively
	 * associated with a known site (see repmgr.h).
	 */
	CONNECTION_LIST	connections;
	RETRY_Q_HEADER	retries;	/* Sites needing connection retry. */
	/*
	 * Input queue: the head of a tail queue of __repmgr_message entries,
	 * together with a gbytes/bytes pair.
	 * NOTE(review): the pair presumably tracks the amount of data
	 * currently queued, for comparison with inqueue_max_*; confirm.
	 */
	struct {
		u_int32_t gbytes;
		u_int32_t bytes;
		STAILQ_HEAD(__repmgr_q_header, __repmgr_message) header;
	} input_queue;

	socket_t	listen_fd;
	db_timespec	last_bcast;	/* Time of last broadcast msg. */
	db_timespec	last_hbeat;	/* Time of last heartbeat (prefmas). */
	db_timespec	l_listener_chk; /* Time to check local listener. */
	db_timeout_t	l_listener_wait;/* Timeout to check local listener. */
	db_timespec	m_listener_chk; /* Time to check master listener. */
	db_timeout_t	m_listener_wait;/* Timeout to check master listener. */

#if defined(HAVE_REPMGR_SSL)
	REPMGR_SSL_CONFIG	repmgr_ssl_conf;
	SSL_CTX		*repmgr_ssl_ctx;
#endif

	/*
	 * Status of repmgr.  It is ready when repmgr is not yet started.  It
	 * is running after repmgr is (re)started.  It is stopped if the env
	 * of the running repmgr is closed, or the site is removed.
	 */
	enum { ready, running, stopped } repmgr_status;
	int		new_connection;	  /* Since last master seek attempt. */
	int		demotion_pending; /* We're being demoted to a view. */
	int		takeover_pending; /* We've been elected master. */
	int		rejoin_pending; /* Join group retry after rejection. */
	int		gmdb_busy;
	int		client_intent;	/* Will relinquish master role. */
	int		gmdb_dirty;
	int		have_gmdb;
	int		seen_repmsg;
	int		view_mismatch; /* View callback and gmdb don't match. */
	int		abbrev_init;	/* For mixed version gmdb upgrade. */

	/*
	 * Flag to show what kind of transaction is currently in progress.
	 * Primary means we're doing the first (critical) phase of a membership
	 * DB update, where we care about perm failures.  In the secondary phase
	 * we don't care.  Usually the value is "none", when normal user
	 * transactions are happening.  We need to use this global flag because
	 * we don't have a more proper direct channel to communicate information
	 * between the originator of a transaction and the replication send()
	 * function that has to wait for acks and decide what to do about them.
	 */
	enum { none, gmdb_primary, gmdb_secondary } active_gmdb_update;
	int		limbo_resolution_needed;

	/*
	 * GMDB update sequence count.  On creation we write version 1; so, once
	 * repmgr has started and tried to read, a 0 here can be taken to mean
	 * that the DB doesn't exist yet.
	 */
	u_int32_t	membership_version;
	u_int32_t	member_version_gen;

	/* LSN of GMDB txn that got a perm failure. */
	DB_LSN		limbo_failure;
	/* EID whose membership status is therefore unresolved. */
	int		limbo_victim;
	/* LSN of a later txn that achieves perm success. */
	DB_LSN		durable_lsn;
	DB		*gmdb;	/* Membership database handle. */
	/*
	 * Membership list restored from init file after crash
	 * during internal init.
	 */
	u_int8_t	*restored_list;
	/* Length of restored_list.  NOTE(review): presumably in bytes. */
	size_t		restored_list_length;

	/*
	 * Preferred master mode indicator for a pending action.  A
	 * master_switch is initiated when the preferred master site is
	 * ready to take over as master.  A start_temp_master is initiated
	 * when the client site needs to start as the temporary master.
	 */
	enum { no_action, master_switch, start_temp_master } prefmas_pending;
	/* The LSN at the very beginning of preferred master site startup. */
	DB_LSN prefmas_init_lsn;

	/* Application's message dispatch call-back function. */
	void  (*msg_dispatch) __P((DB_ENV *, DB_CHANNEL *,
		DBT *, u_int32_t, u_int32_t));

	/* Socket approval callback function. */
	int	(*approval) __P((DB_ENV *, DB_REPMGR_SOCKET, int *, u_int32_t));
#endif  /* HAVE_REPLICATION_THREADS */
};
1029 
1030 /*
1031  * Determine whether application is repmgr or base replication API.  If
1032  * repmgr was configured, base the test on internal replication flags for
1033  * APP_REPMGR and APP_BASEAPI.  These flags get set by the appropriate parts
1034  * of the various replication APIs.
1035  */
1036 #ifdef HAVE_REPLICATION_THREADS
1037 /*
1038  * Application type is set to be repmgr when:
1039  *   1. A local site is defined.
1040  *   2. A remote site is defined.
1041  *   3. An acknowledgement policy is configured.
1042  *   4. A repmgr flag is configured.
1043  *   5. A timeout value is configured for one of the repmgr timeouts.
1044  */
1045 #define	APP_IS_REPMGR(env)						\
1046 	(REP_ON(env) ?							\
1047 	    F_ISSET((env)->rep_handle->region, REP_F_APP_REPMGR) :	\
1048 	    F_ISSET((env)->rep_handle, DBREP_APP_REPMGR))
1049 
1050 /*
1051  * Application type is set to be base replication API when:
1052  *   1. Transport send function is defined and is not the repmgr send
1053  *      function.
1054  */
1055 #define	APP_IS_BASEAPI(env)						\
1056 	(REP_ON(env) ?							\
1057 	    F_ISSET((env)->rep_handle->region, REP_F_APP_BASEAPI) :	\
1058 	    F_ISSET((env)->rep_handle, DBREP_APP_BASEAPI))
1059 
1060 /*
1061  * Set application type.  These macros do extra checking to guarantee that
1062  * only one application type is ever set.
1063  */
1064 #define	APP_SET_REPMGR(env) do {					\
1065 	if (REP_ON(env)) {						\
1066 		ENV_ENTER(env, ip);					\
1067 		REP_SYSTEM_LOCK(env);					\
1068 		if (!F_ISSET((env)->rep_handle->region,			\
1069 		    REP_F_APP_BASEAPI))					\
1070 			F_SET((env)->rep_handle->region,		\
1071 			    REP_F_APP_REPMGR);				\
1072 		REP_SYSTEM_UNLOCK(env);					\
1073 		ENV_LEAVE(env, ip);					\
1074 	} else if (!F_ISSET((env)->rep_handle, DBREP_APP_BASEAPI))	\
1075 		F_SET((env)->rep_handle, DBREP_APP_REPMGR);		\
1076 } while (0)
/*
 * APP_SET_BASEAPI --
 *	Mark the application as base replication API unless it is already
 *	marked as repmgr.  While replication is not yet on (REP_ON is false)
 *	the mark is kept in the rep_handle's own flags.  Once it is on, the
 *	mark is kept in the rep_handle's region flags and is tested and set
 *	between REP_SYSTEM_LOCK and REP_SYSTEM_UNLOCK, inside
 *	ENV_ENTER/ENV_LEAVE.
 *
 *	The expansion hands an identifier named "ip" to ENV_ENTER and
 *	ENV_LEAVE; it is not a macro parameter, so a suitable variable of
 *	that name must be in scope wherever this macro is used.
 */
#define	APP_SET_BASEAPI(env) do {					\
	if (!REP_ON(env)) {						\
		if (!F_ISSET((env)->rep_handle, DBREP_APP_REPMGR))	\
			F_SET((env)->rep_handle, DBREP_APP_BASEAPI);	\
	} else {							\
		ENV_ENTER(env, ip);					\
		REP_SYSTEM_LOCK(env);					\
		if (!F_ISSET((env)->rep_handle->region,			\
		    REP_F_APP_REPMGR))					\
			F_SET((env)->rep_handle->region,		\
			    REP_F_APP_BASEAPI);				\
		REP_SYSTEM_UNLOCK(env);					\
		ENV_LEAVE(env, ip);					\
	}								\
} while (0)
/*
 * ADJUST_AUTOTAKEOVER_WAITS --
 *	Derive both listener-check waits on db_rep from one timeout value:
 *	l_listener_wait is set to the timeout itself and m_listener_wait to
 *	three times the timeout.
 *
 *	The timeout argument is parenthesized in the expansion so that an
 *	expression such as "a + b" is multiplied as a whole.  It is evaluated
 *	twice, so it must not have side effects.
 */
#define	ADJUST_AUTOTAKEOVER_WAITS(db_rep, timeout) do {			\
	(db_rep)->l_listener_wait = (timeout);				\
	(db_rep)->m_listener_wait = 3 * (timeout);			\
} while (0)
1094 
1095 #else
1096 /*
1097  * We did not configure repmgr, application must be base replication API.
1098  * The APP_SET_* macros are noops in this case, but they must be defined
1099  * with a null body to avoid compiler warnings on some platforms.
1100  */
1101 #define	APP_IS_REPMGR(env) 0
1102 #define	APP_SET_REPMGR(env) do {					\
1103 	;								\
1104 } while (0)
1105 #define	APP_IS_BASEAPI(env) 1
1106 #define	APP_SET_BASEAPI(env) do {					\
1107 	;								\
1108 } while (0)
1109 #define	ADJUST_AUTOTAKEOVER_WAITS(db_rep, timeout) do {			\
1110 	;								\
1111 } while (0)
1112 #endif  /* HAVE_REPLICATION_THREADS */
1113 
1114 /*
1115  * Control structure flags for replication communication infrastructure.
1116  */
1117 /*
1118  * Add new REPCTL flags to the end of this list to preserve compatibility
1119  * with old versions.
1120  */
1121 #define	REPCTL_ELECTABLE	0x001	/* Upgraded client is electable. */
1122 #define	REPCTL_FLUSH		0x002	/* Record should be flushed. */
1123 #define	REPCTL_GROUP_ESTD	0x004	/* Message from site in a group. */
1124 #define	REPCTL_INIT		0x008	/* Internal init message. */
1125 #define	REPCTL_LEASE		0x010	/* Lease related message. */
1126 #define	REPCTL_PERM		0x020	/* Permanent message (e.g. commit). */
1127 #define	REPCTL_RESEND		0x040	/* Rerequested message. */
1128 #define	REPCTL_LOG_END		0x080	/* Approximate end of group-wide log. */
1129 #define	REPCTL_INMEM_ONLY	0x100	/* In-memory databases only. */
1130 #define	REPCTL_ENCRYPTED	0x200	/* Master is using encryption. */
1131 
1132 /*
1133  * File info flags for internal init.  The per-database (i.e., file) flag
1134  * represents the on-disk format of the file, and is conveyed from the master to
1135  * the initializing client in the UPDATE message, so that the client can know
1136  * how to create the file.  The per-page flag is conveyed along with each PAGE
1137  * message, describing the format of the page image being transmitted; it is of
1138  * course set by the site serving the PAGE_REQ.  The serving site gets the page
1139  * image from its own mpool, and thus the page is in the native format of the
1140  * serving site.  This format may be different (i.e., opposite) from the on-disk
1141  * format, and in fact can vary per-page, since with client-to-client sync it is
1142  * possible for various different sites to serve the various PAGE_REQ requests.
1143  */
1144 #define	REPINFO_DB_LITTLEENDIAN	0x0001	/* File is little-endian lorder. */
1145 #define	REPINFO_PG_LITTLEENDIAN	0x0002	/* Page is little-endian lorder. */
1146 
/*
 * Lease refresh retry parameters.  LEASE_REFRESH_USEC is 50000 microseconds,
 * i.e. 50 milliseconds between tries.
 */
#define	LEASE_REFRESH_MIN	30     /* Minimum number of refresh retries. */
#define	LEASE_REFRESH_USEC	50000  /* Microseconds between refresh tries. */
1149 
/*
 * Master granted lease information.
 *
 * One entry records, for the client identified by eid, the start and end
 * times of its lease and the LSN the lease applies to.
 */
typedef struct __rep_lease_entry {
	int		eid;		/* EID of client grantor. */
	db_timespec	start_time;	/* Start time clients echo back. */
	db_timespec	end_time;	/* Master lease expiration time. */
	DB_LSN		lease_lsn;	/* Durable LSN lease applies to. */
} REP_LEASE_ENTRY;
1157 
/*
 * Vote tally entry: pairs a voter's ID with the election generation the
 * voter reported.
 */
typedef struct {
	u_int32_t	egen;		/* Voter's election generation. */
	int		eid;		/* Voter's ID. */
} REP_VTALLY;
1162 
1163 /*
1164  * The REP_THROTTLE_ONLY flag is used to do throttle processing only.
1165  * If set, it will only allow sending the REP_*_MORE message, but not
1166  * the normal, non-throttled message.  It is used to support throttling
1167  * with bulk transfer.
1168  */
1169 /* Flags for __rep_send_throttle. */
1170 #define	REP_THROTTLE_ONLY	0x0001	/* Send _MORE message only. */
1171 
1172 /* Throttled message processing information. */
1173 typedef struct {
1174 	DB_LSN		lsn;		/* LSN of this record. */
1175 	DBT		*data_dbt;	/* DBT of this record. */
1176 	u_int32_t	gbytes;		/* This call's max gbytes sent. */
1177 	u_int32_t	bytes;		/* This call's max bytes sent. */
1178 	u_int32_t	type;		/* Record type. */
1179 } REP_THROTTLE;
1180 
/* Bulk processing information. */
/*
 * !!!
 * We use a roff_t for the offset.  We'd really like to use a ptrdiff_t
 * since that really is what it is.  But ptrdiff_t is not portable and
 * doesn't exist everywhere.
 *
 * The current offset and the flags are held through pointers (offp and
 * flagsp) rather than by value.
 * NOTE(review): presumably the pointed-to values live in shared memory so
 * that every user of the buffer sees the same state; confirm.
 */
typedef struct {
	u_int8_t	*addr;		/* Address of bulk buffer. */
	roff_t		*offp;		/* Ptr to current offset into buffer. */
	u_int32_t	len;		/* Bulk buffer length. */
	u_int32_t	type;		/* Item type in buffer (log, page). */
	DB_LSN		lsn;		/* First LSN in buffer. */
	int		eid;		/* ID of potential recipients. */
	/* NOTE(review): BULK_XMIT is presumably a bit in *flagsp; confirm. */
#define	BULK_XMIT	0x001		/* Buffer in transit. */
	u_int32_t	*flagsp;	/* Buffer flags. */
} REP_BULK;
1198 
1199 /*
1200  * This structure takes care of representing a transaction.
1201  * It holds all the records, sorted by page number so that
1202  * we can obtain locks and apply updates in a deadlock free
1203  * order.
1204  */
1205 typedef struct {
1206 	u_int nlsns;
1207 	u_int nalloc;
1208 	DB_LSN *array;
1209 } LSN_COLLECTION;
1210 
1211 /*
1212  * This is used by the page-prep routines to do the lock_vec call to
1213  * apply the updates for a single transaction or a collection of
1214  * transactions.
1215  */
1216 typedef struct {
1217 	int		n;
1218 	DB_LOCKREQ	*reqs;
1219 	DBT		*objs;
1220 } linfo_t;
1221 
1222 /*
1223  * Used to store information on the child transaction that opens a blob meta
1224  * database.  In partial replication processing the child transaction of the
1225  * blob meta database must be delayed until after processing the child
1226  * transaction that opens the database that owns the BMD.
1227  */
1228 typedef struct {
1229 	db_seq_t blob_file_id;
1230 	DB_LSN lsn;
1231 	u_int32_t child;
1232 	void *next;
1233 	void *prev;
1234 } DELAYED_BLOB_LIST;
1235 
1236 #if defined(__cplusplus)
1237 }
1238 #endif
1239 
1240 #include "dbinc_auto/rep_ext.h"
1241 #endif	/* !_DB_REP_H_ */
1242