1 /*- 2 * Copyright (c) 2001, 2020 Oracle and/or its affiliates. All rights reserved. 3 * 4 * See the file LICENSE for license information. 5 * 6 * $Id$ 7 */ 8 9 #ifndef _DB_REP_H_ 10 #define _DB_REP_H_ 11 12 #include "dbinc_auto/rep_automsg.h" 13 14 #if defined(__cplusplus) 15 extern "C" { 16 #endif 17 18 /* 19 * Names of client temp databases. 20 */ 21 #define REPFILEPREFIX "__db.rep" 22 #define REPBLOBNAME "__db.rep.blob.db" 23 #define REPDBNAME "__db.rep.db" 24 #define REPPAGENAME "__db.reppg.db" 25 26 /* 27 * Name of replicated system database file, and LSN history subdatabase within 28 * it. If the INMEM config flag is set, we create the database in memory, with 29 * the REPLSNHIST name (so that is why it also follows the __db naming 30 * convention). 31 */ 32 #define REPSYSDBNAME "__db.rep.system" 33 #define REPLSNHIST "__db.lsn.history" 34 #define REPMEMBERSHIP "__db.membership" 35 #define REPSYSDBPGSZ 1024 36 #define IS_REP_FILE(name) (strcmp(name, REPSYSDBNAME) == 0) 37 38 /* Current version of commit token format, and LSN history database format. */ 39 #define REP_COMMIT_TOKEN_FMT_VERSION 1 40 #define REP_LSN_HISTORY_FMT_VERSION 1 41 42 /* 43 * Message types 44 */ 45 #define REP_INVALID 0 /* Invalid message type. */ 46 #define REP_ALIVE 1 /* I am alive message. */ 47 #define REP_ALIVE_REQ 2 /* Request for alive messages. */ 48 #define REP_ALL_REQ 3 /* Request all log records greater than 49 * LSN. */ 50 #define REP_BLOB_ALL_REQ 4 /* Request all the given blob files. */ 51 #define REP_BLOB_CHUNK 5 /* A piece of data contained in a blob 52 * file. */ 53 #define REP_BLOB_CHUNK_REQ 6 /* Request a piece of data from a blob 54 * file. */ 55 #define REP_BLOB_UPDATE 7 /* A list of blob files for a 56 * database. */ 57 #define REP_BLOB_UPDATE_REQ 8 /* Request blob files. */ 58 #define REP_BULK_LOG 9 /* Bulk transfer of log records. */ 59 #define REP_BULK_PAGE 10 /* Bulk transfer of pages. */ 60 #define REP_DUPMASTER 11 /* Duplicate master detected; 61 * propagate. */ 62 #define REP_FILE 12 /* Page of a database file. NOTUSED */ 63 #define REP_FILE_FAIL 13 /* File requested does not exist. */ 64 #define REP_FILE_REQ 14 /* Request for a database file. 65 * NOTUSED */ 66 #define REP_LEASE_GRANT 15 /* Client grants a lease to a master. */ 67 #define REP_LOG 16 /* Log record. */ 68 #define REP_LOG_MORE 17 /* There are more log records to 69 * request. */ 70 #define REP_LOG_REQ 18 /* Request for a log record. */ 71 #define REP_MASTER_REQ 19 /* Who is the master */ 72 #define REP_NEWCLIENT 20 /* Announces the presence of a new 73 * client. */ 74 #define REP_NEWFILE 21 /* Announce a log file change. */ 75 #define REP_NEWMASTER 22 /* Announces who the master is. */ 76 #define REP_NEWSITE 23 /* Announces that a site has heard from 77 * a new site; like NEWCLIENT, but 78 * indirect. A NEWCLIENT message comes 79 * directly from the new client while a 80 * NEWSITE comes indirectly from 81 * someone who heard about a NEWSITE.*/ 82 #define REP_PAGE 24 /* Database page. */ 83 #define REP_PAGE_FAIL 25 /* Requested page does not exist. */ 84 #define REP_PAGE_MORE 26 /* There are more pages to request. */ 85 #define REP_PAGE_REQ 27 /* Request for a database page. */ 86 #define REP_REREQUEST 28 /* Force rerequest. */ 87 #define REP_START_SYNC 29 /* Tell client to begin syncing a ckp.*/ 88 #define REP_UPDATE 30 /* Environment hotcopy information. */ 89 #define REP_UPDATE_REQ 31 /* Request for hotcopy information. */ 90 #define REP_VERIFY 32 /* A log record for verification. */ 91 #define REP_VERIFY_FAIL 33 /* The client is outdated. */ 92 #define REP_VERIFY_REQ 34 /* Request for a log record to 93 * verify. */ 94 #define REP_VOTE1 35 /* Send out your information for an 95 * election. */ 96 #define REP_VOTE2 36 /* Send a "you are master" vote. */ 97 /* 98 * Maximum message number for conversion tables. Update this 99 * value as the largest message number above increases. 100 * It might make processing messages more straightforward if 101 * the *_MORE and BULK* messages were flags within the regular 102 * message type instead of separate message types themselves. 103 * 104 * !!! 105 * NOTE: When changing messages above, the two tables for upgrade support 106 * need adjusting. They are in rep_util.c. 107 */ 108 #define REP_MAX_MSG 36 109 110 /* 111 * This is the list of client-to-client requests messages. 112 * We use this to decide if we're doing client-to-client and 113 * might need to send a rerequest. 114 */ 115 #define REP_MSG_REQ(rectype) \ 116 (rectype == REP_ALL_REQ || \ 117 rectype == REP_BLOB_ALL_REQ || \ 118 rectype == REP_BLOB_CHUNK_REQ || \ 119 rectype == REP_LOG_REQ || \ 120 rectype == REP_PAGE_REQ || \ 121 rectype == REP_VERIFY_REQ) 122 123 /* 124 * This is the default value for the FD_SET_SIZE. It is used when client 125 * doesn't provide any value or the client provided value is smaller 126 */ 127 #define REPMGR_FD_SET_DEFAULT_SIZE 1024 128 129 /* 130 * Note that the version information should be at the beginning of the 131 * structure, so that we can rearrange the rest of it while letting the 132 * version checks continue to work. DB_REPVERSION should be revved any time 133 * the rest of the structure changes or when the message numbers change. 134 * 135 * Define also, the corresponding log versions that are tied to the 136 * replication/release versions. These are only needed in replication 137 * and that is why they're defined here. db_printlog takes notice as well. 138 */ 139 #define DB_LOGVERSION_42 8 140 #define DB_LOGVERSION_43 10 141 #define DB_LOGVERSION_44 11 142 #define DB_LOGVERSION_45 12 143 #define DB_LOGVERSION_46 13 144 #define DB_LOGVERSION_47 14 145 #define DB_LOGVERSION_48 15 146 #define DB_LOGVERSION_48p2 16 147 #define DB_LOGVERSION_50 17 148 #define DB_LOGVERSION_51 17 149 #define DB_LOGVERSION_52 18 150 #define DB_LOGVERSION_53 19 151 #define DB_LOGVERSION_60 20 152 #define DB_LOGVERSION_60p1 21 153 #define DB_LOGVERSION_61 22 154 #define DB_LOGVERSION_62 23 155 #define DB_LOGVERSION_MIN DB_LOGVERSION_47 156 #define DB_REPVERSION_INVALID 0 157 #define DB_REPVERSION_44 3 158 #define DB_REPVERSION_45 3 159 #define DB_REPVERSION_46 4 160 #define DB_REPVERSION_47 5 161 #define DB_REPVERSION_48 5 162 #define DB_REPVERSION_51 5 163 #define DB_REPVERSION_52 6 164 #define DB_REPVERSION_53 7 165 #define DB_REPVERSION_60 7 166 #define DB_REPVERSION_61 8 167 #define DB_REPVERSION_62 9 168 #define DB_REPVERSION DB_REPVERSION_62 169 #define DB_REPVERSION_MIN DB_REPVERSION_47 170 171 /* 172 * RPRINT - Replication diagnostic output 173 * VPRINT - Replication verbose output (superset of RPRINT). 174 * REP_PRINT_MESSAGE 175 * Macros for verbose replication messages. 176 * 177 * Everything using RPRINT will go to the system diag file (if it 178 * is configured) and also to the user's verbose output if 179 * they have that verbose level configured. 180 * Messages using VPRINT do not ever go to the system diag file, 181 * but will go to the user's verbose output if configured. 182 * 183 * Use VPRINT for anything that might be printed on a standard, 184 * successful transaction. Use RPRINT for error paths, rep 185 * state changes, elections, etc. 186 */ 187 #define REP_DIAGNAME "__db.rep.diag%02d" 188 #define REP_DIAGSIZE MEGABYTE 189 #define RPRINT(env, x) do { \ 190 if ((env)->dbenv->verbose != 0) \ 191 (void)__rep_print_system x; \ 192 } while (0) 193 #define VPRINT(env, x) do { \ 194 if ((env)->dbenv->verbose != 0) \ 195 (void)__rep_print x; \ 196 } while (0) 197 #define REP_PRINT_MESSAGE(env, eid, rp, str, fl) do { \ 198 if ((env)->dbenv->verbose != 0) \ 199 __rep_print_message(env, eid, rp, str, fl); \ 200 } while (0) 201 202 /* 203 * Election gen file name 204 * The file contains an egen number for an election this client has NOT 205 * participated in. I.e. it is the number of a future election. We 206 * create it when we create the rep region, if it doesn't already exist 207 * and initialize egen to 1. If it does exist, we read it when we create 208 * the rep region. We write it immediately before sending our VOTE1 in 209 * an election. That way, if a client has ever sent a vote for any 210 * election, the file is already going to be updated to reflect a future 211 * election, should it crash. 212 */ 213 #define REP_EGENNAME "__db.rep.egen" 214 #define REP_GENNAME "__db.rep.gen" 215 216 /* 217 * Internal init flag file name: 218 * The existence of this file serves as an indication that the client is in the 219 * process of Internal Initialization, in case it crashes before completing. 220 * During internal init the client's partially reconstructed database pages and 221 * logs may be in an inconsistent state, so much so that running recovery must 222 * be avoided. Furthermore, there is no other way to reliably recognize this 223 * condition. Therefore, when we open an environment, and we're just about to 224 * run recovery, we check for this file first. If it exists we must discard all 225 * logs and databases. This avoids the recovery problems, and leads to a fresh 226 * attempt at internal init if the environment becomes a replication client and 227 * finds a master. The list of databases which may need to be removed is stored 228 * in this file. 229 */ 230 #define REP_INITNAME "__db.rep.init" 231 #define REP_INITVERSION_46 1 232 #define REP_INITVERSION_47 2 233 #define REP_INITVERSION 3 234 235 /* 236 * View/partial replication file name. 237 * The file is empty. It exists as a permanent indicator that this 238 * environment can never be master. 239 */ 240 #define REPVIEW "__db.rep.view" 241 #define IS_VIEW_SITE(env) \ 242 (REP_ON(env) && \ 243 ((env)->rep_handle->region->stat.st_view != 0)) 244 245 /* 246 * Database types for __rep_client_dbinit 247 */ 248 typedef enum { 249 REP_BLOB, /* Blob file database. */ 250 REP_DB, /* Log record database. */ 251 REP_PG /* Pg database. */ 252 } repdb_t; 253 254 /* Macros to lock/unlock the replication region as a whole. */ 255 #define REP_SYSTEM_LOCK(env) \ 256 MUTEX_LOCK(env, (env)->rep_handle->region->mtx_region) 257 #define REP_SYSTEM_UNLOCK(env) \ 258 MUTEX_UNLOCK(env, (env)->rep_handle->region->mtx_region) 259 260 /* 261 * Macros for manipulating the event synchronization. We use a separate mutex 262 * so that an application's call-back function can be invoked without locking 263 * the whole region. 264 */ 265 #define REP_EVENT_LOCK(env) \ 266 MUTEX_LOCK(env, (env)->rep_handle->region->mtx_event) 267 #define REP_EVENT_UNLOCK(env) \ 268 MUTEX_UNLOCK(env, (env)->rep_handle->region->mtx_event) 269 270 /* 271 * Synchronization states 272 * Please change __rep_syncstate_to_string (rep_stat.c) to track any changes 273 * made to these states. 274 * 275 * The states are in alphabetical order (except for OFF). The usual 276 * order of progression for a full internal init is: 277 * VERIFY, UPDATE, PAGE, LOG (then back to OFF when we're done). 278 */ 279 typedef enum { 280 SYNC_OFF, /* No recovery. */ 281 SYNC_LOG, /* Recovery - log. */ 282 SYNC_PAGE, /* Recovery - pages and blobs. */ 283 SYNC_UPDATE, /* Recovery - update. */ 284 SYNC_VERIFY /* Recovery - verify. */ 285 } repsync_t; 286 287 /* 288 * A record of the contents of the VOTE1 msg we sent out at current egen, in 289 * case we need to send out a duplicate VOTE1 to a late-joining client in a full 290 * election. The nsites, nvotes, and priority fields of the REP struct can't be 291 * used, because those could change. It's only safe to send out a dup if we 292 * send out the exact same info. 293 */ 294 typedef struct { 295 DB_LSN lsn; 296 u_int32_t nsites; 297 u_int32_t nvotes; 298 u_int32_t priority; 299 u_int32_t tiebreaker; 300 u_int32_t ctlflags; 301 u_int32_t data_gen; 302 } VOTE1_CONTENT; 303 304 /* 305 * In a 2 node scenario, it is important that the newly joining node wait 306 * for the peer connection to be fully established before proceeding for 307 * election. Otherwise, it might end up electing itself as MASTER with its 308 * own vote (inspite of the fact that a fully functional master exists). 309 * Normally it takes about 5-20ms for the connection to go through. Setting 310 * the timeout to 500ms to account for connection time variations associated 311 * with SSL. 312 */ 313 #define DB_REP_PEER_WAIT_TIMEOUT 500000 314 315 /* 316 * REP -- 317 * Shared replication structure. 318 */ 319 typedef struct __rep { /* SHARED */ 320 db_mutex_t mtx_region; /* Region mutex. */ 321 db_mutex_t mtx_clientdb; /* Client database mutex. */ 322 db_mutex_t mtx_ckp; /* Checkpoint mutex. */ 323 db_mutex_t mtx_diag; /* Diagnostic message mutex. */ 324 db_mutex_t mtx_repstart; /* Role change mutex. */ 325 int diag_index; /* Diagnostic file index. */ 326 off_t diag_off; /* Diagnostic message offset. */ 327 roff_t lease_off; /* Offset of the lease table. */ 328 roff_t tally_off; /* Offset of the tally region. */ 329 roff_t v2tally_off; /* Offset of the vote2 tally region. */ 330 int eid; /* Environment id. */ 331 int master_id; /* ID of the master site. */ 332 u_int32_t version; /* Current replication version. */ 333 u_int32_t egen; /* Replication election generation. */ 334 u_int32_t spent_egen; /* Egen satisfied by rep_elect call. */ 335 u_int32_t gen; /* Replication generation number. */ 336 u_int32_t mgen; /* Master gen seen by client. */ 337 u_int32_t asites; /* Space allocated for sites. */ 338 u_int32_t nsites; /* Number of sites in group. */ 339 u_int32_t nvotes; /* Number of votes needed. */ 340 u_int32_t priority; /* My priority in an election. */ 341 u_int32_t config_nsites; 342 343 db_timeout_t elect_timeout; /* Normal/full election timeouts. */ 344 db_timeout_t full_elect_timeout; 345 346 db_timeout_t chkpt_delay; /* Master checkpoint delay. */ 347 348 #define REP_DEFAULT_THROTTLE (10 * MEGABYTE) /* Default value is < 1Gig. */ 349 u_int32_t gbytes; /* Limit on data sent in single... */ 350 u_int32_t bytes; /* __rep_process_message call. */ 351 #define DB_REP_REQUEST_GAP 40000 /* 40 msecs */ 352 #define DB_REP_MAX_GAP 1280000 /* 1.28 seconds */ 353 db_timespec request_gap; /* Minimum time to wait before we 354 * request a missing log record. */ 355 db_timespec max_gap; /* Maximum time to wait before 356 * requesting a missing log record. */ 357 u_int32_t initsites; /* Number of sites for 358 * set_memory_init(). */ 359 /* Status change information */ 360 u_int32_t apply_th; /* Number of callers in rep_apply. */ 361 u_int32_t arch_th; /* Number of callers in log_archive. */ 362 u_int32_t elect_th; /* Elect threads in lock-out. */ 363 u_int32_t msg_th; /* Number of callers in rep_proc_msg.*/ 364 u_int32_t handle_cnt; /* Count of handles in library. */ 365 u_int32_t op_cnt; /* Multi-step operation count.*/ 366 DB_LSN ckp_lsn; /* LSN for syncing a checkpoint. */ 367 DB_LSN max_prep_lsn; /* Max LSN of txn_prepare record. */ 368 369 /* 370 * Event notification synchronization: the mtx_event and associate 371 * fields which it protects govern event notification to the 372 * application. They form a guarantee that no matter how crazy the 373 * thread scheduling gets, the application sees a sensible, orderly 374 * progression of events. 375 */ 376 db_mutex_t mtx_event; /* Serializes event notification. */ 377 /* 378 * Latest generation whose NEWMASTER event the application has been 379 * notified of. Also serves to force STARTUPDONE to occur after 380 * NEWMASTER. 381 */ 382 u_int32_t newmaster_event_gen; 383 /* 384 * Latest local victory of an election that the application has been 385 * notified of, expressed as the election generation number. This 386 * ensures we notify the application exactly once when it wins an 387 * election. 388 */ 389 u_int32_t notified_egen; 390 391 /* Internal init information. */ 392 u_int32_t nfiles; /* Number of files we have info on. */ 393 u_int32_t curfile; /* Cur file we're getting (0-based). */ 394 roff_t originfo_off; /* Offset of original file info. */ 395 u_int32_t infolen; /* Remaining length file info buffer. */ 396 u_int32_t originfolen; /* Original length file info buffer. */ 397 u_int32_t infoversion; /* Original file info version. */ 398 DB_LSN first_lsn; /* Earliest LSN we need. */ 399 u_int32_t first_vers; /* Log version of first log file. */ 400 DB_LSN last_lsn; /* Latest LSN we need. */ 401 /* These are protected by mtx_clientdb. */ 402 db_seq_t gap_bl_hi_id; /* Last id in the blob gap. */ 403 db_seq_t gap_bl_hi_sid; /* Last sid in the blob gap. */ 404 off_t gap_bl_hi_off; /* Last offset in the blob gap. */ 405 db_seq_t last_blob_id; /* Last id on the list to process. */ 406 db_seq_t last_blob_sid; /* Last sid on the list to process. */ 407 db_seq_t prev_blob_id; /* Previous last id on list. */ 408 db_seq_t prev_blob_sid; /* Previous last sid on list. */ 409 db_seq_t highest_id; /* Highest file id to request. */ 410 u_int32_t blob_more_files;/* More blob files to be processed. */ 411 int blob_sync; /* Currently handling blobs. */ 412 db_timespec last_pg_ts; /* Last page stored timestamp. */ 413 db_pgno_t ready_pg; /* Next pg expected. */ 414 db_pgno_t waiting_pg; /* First pg after gap. */ 415 db_pgno_t max_wait_pg; /* Maximum pg requested. */ 416 u_int32_t npages; /* Num of pages rcvd for this file. */ 417 roff_t curinfo_off; /* Offset of current file info. */ 418 /* Always access with GET_CURINFO(). */ 419 420 /* Vote tallying information. */ 421 u_int32_t sites; /* Sites heard from. */ 422 int winner; /* Current winner EID. */ 423 u_int32_t w_priority; /* Winner priority. */ 424 u_int32_t w_gen; /* Winner generation. */ 425 u_int32_t w_datagen; /* Winner data generation. */ 426 DB_LSN w_lsn; /* Winner LSN. */ 427 u_int32_t w_tiebreaker; /* Winner tiebreaking value. */ 428 u_int32_t votes; /* Number of votes for this site. */ 429 430 VOTE1_CONTENT vote1; /* Valid until rep->egen changes. */ 431 432 db_timespec etime; /* Election start timestamp. */ 433 int full_elect; /* Is current election a "full" one? */ 434 435 /* Leases. */ 436 db_timeout_t lease_timeout; /* Lease timeout. */ 437 db_timespec lease_duration; /* Lease timeout with clock skew. */ 438 u_int32_t clock_skew; /* Clock skew. */ 439 u_int32_t clock_base; /* Clock scale factor base. */ 440 db_timespec grant_expire; /* Local grant expiration time. */ 441 442 /* Cached LSN history, matching current gen. */ 443 DB_LSN gen_base_lsn; /* Base LSN of current generation. */ 444 u_int32_t master_envid; /* Current master's "unique" env ID. */ 445 446 SH_TAILQ_HEAD(__wait) waiters; /* List of threads in txn_applied(). */ 447 SH_TAILQ_HEAD(__wfree) free_waiters;/* Free list of waiter structs. */ 448 449 #ifdef HAVE_REPLICATION_THREADS 450 /* 451 * Replication Framework (repmgr) shared config information. 452 */ 453 db_mutex_t mtx_repmgr; /* Region mutex. */ 454 roff_t siteinfo_off; /* Offset of site array region. */ 455 u_int site_cnt; /* Array slots in use. */ 456 u_int site_max; /* Total array slots allocated. */ 457 u_int sites_avail; /* Total number of available sites. */ 458 int self_eid; /* Where to find the local site. */ 459 u_int siteinfo_seq; /* Number of updates to this info. */ 460 u_int32_t min_log_file; /* Earliest log needed by repgroup. */ 461 DB_LSN last_ckp_lsn; /* The last checkpoint's ckp_lsn. */ 462 463 pid_t listener; 464 u_int listener_nthreads; /* # of msg threads in listener. */ 465 466 int perm_policy; 467 db_timeout_t ack_timeout; 468 db_timeout_t election_retry_wait; 469 db_timeout_t connection_retry_wait; 470 db_timeout_t heartbeat_frequency; /* Max period between msgs. */ 471 db_timeout_t heartbeat_monitor_timeout; 472 db_timeout_t write_forward_timeout; 473 u_int32_t inqueue_max_gbytes; 474 u_int32_t inqueue_max_bytes; 475 u_int32_t inqueue_rz_gbytes; 476 u_int32_t inqueue_rz_bytes; 477 u_int32_t inqueue_full_event_on; 478 #endif /* HAVE_REPLICATION_THREADS */ 479 480 /* Statistics. */ 481 DB_REP_STAT stat; 482 #if defined(HAVE_REPLICATION_THREADS) && defined(HAVE_STATISTICS) 483 DB_REPMGR_STAT mstat; 484 #endif 485 486 /* 487 * Please change __rep_print_all (rep_stat.c) to track any changes made 488 * to all these flag families below. 489 */ 490 /* Configuration. */ 491 #define REP_C_2SITE_STRICT 0x00001 /* Don't cheat on elections. */ 492 #define REP_C_AUTOINIT 0x00002 /* Auto initialization. */ 493 #define REP_C_AUTOROLLBACK 0x00004 /* Discard client txns: sync. */ 494 #define REP_C_AUTOTAKEOVER 0x00008 /* Auto listener take over. */ 495 #define REP_C_BULK 0x00010 /* Bulk transfer. */ 496 #define REP_C_DELAYCLIENT 0x00020 /* Delay client sync-up. */ 497 #define REP_C_DISABLE_POLL 0x00040 /* Avoid POLL for network io */ 498 #define REP_C_DISABLE_SSL 0x00080 /* Avoid POLL for network io */ 499 #define REP_C_ELECT_LOGLENGTH 0x00100 /* Log length wins election. */ 500 #define REP_C_ELECTIONS 0x00200 /* Repmgr to use elections. */ 501 #define REP_C_ENABLE_EPOLL 0x00400 /* Use EPOLL for network io */ 502 #define REP_C_FORWARD_WRITES 0x00800 /* Repmgr write forwarding. */ 503 #define REP_C_INMEM 0x01000 /* In-memory replication. */ 504 #define REP_C_LEASE 0x02000 /* Leases configured. */ 505 #define REP_C_NOWAIT 0x04000 /* Immediate error return. */ 506 #define REP_C_PREFMAS_CLIENT 0x08000 /* Preferred master client. */ 507 #define REP_C_PREFMAS_MASTER 0x10000 /* Preferred master site. */ 508 u_int32_t config; /* Configuration flags. */ 509 510 /* Election. */ 511 #define REP_E_PHASE0 0x00000001 /* In phase 0 of election. */ 512 #define REP_E_PHASE1 0x00000002 /* In phase 1 of election. */ 513 #define REP_E_PHASE2 0x00000004 /* In phase 2 of election. */ 514 #define REP_E_TALLY 0x00000008 /* Tallied vote before elect. */ 515 #define REP_E_PEER_CONN_WAIT 0x00000010 /* Election waiting for peer. */ 516 u_int32_t elect_flags; /* Election flags. */ 517 518 /* Lockout. */ 519 #define REP_LOCKOUT_API 0x00000001 /* BDB API - handle_cnt. */ 520 #define REP_LOCKOUT_APPLY 0x00000002 /* apply msgs - apply_th. */ 521 #define REP_LOCKOUT_ARCHIVE 0x00000004 /* log_archive. */ 522 #define REP_LOCKOUT_MSG 0x00000008 /* Message process - msg_th. */ 523 #define REP_LOCKOUT_OP 0x00000010 /* BDB ops txn,curs - op_cnt. */ 524 u_int32_t lockout_flags; /* Lockout flags. */ 525 526 /* See above for enumerated sync states. */ 527 repsync_t sync_state; /* Recovery/synchronization flags. */ 528 529 /* 530 * When adding a new flag value, consider whether it should be 531 * cleared in rep_start() when starting as a master or a client. 532 */ 533 #define REP_F_ABBREVIATED 0x00000001 /* Recover NIMDB pages only. */ 534 #define REP_F_APP_BASEAPI 0x00000002 /* Base API application. */ 535 #define REP_F_APP_REPMGR 0x00000004 /* repmgr application. */ 536 #define REP_F_CLIENT 0x00000008 /* Client replica. */ 537 #define REP_F_DELAY 0x00000010 /* Delaying client sync-up. */ 538 #define REP_F_GROUP_ESTD 0x00000020 /* Rep group is established. */ 539 #define REP_F_HOLD_GEN 0x00000040 /* PrefMas startup hold gen. */ 540 #define REP_F_INUPDREQ 0x00000080 /* Thread in rep_update_req. */ 541 #define REP_F_LEASE_EXPIRED 0x00000100 /* Leases guaranteed expired. */ 542 #define REP_F_MASTER 0x00000200 /* Master replica. */ 543 #define REP_F_MASTERELECT 0x00000400 /* Master elect. */ 544 #define REP_F_NEWFILE 0x00000800 /* Newfile in progress. */ 545 #define REP_F_NIMDBS_LOADED 0x00001000 /* NIMDBs are materialized. */ 546 #define REP_F_READONLY_MASTER 0x00002000 /* PrefMas readonly master. */ 547 #define REP_F_SKIPPED_APPLY 0x00004000 /* Skipped applying a record. */ 548 #define REP_F_START_CALLED 0x00008000 /* Rep_start called. */ 549 #define REP_F_SYS_DB_OP 0x00010000 /* Operation in progress. */ 550 u_int32_t flags; 551 } REP; 552 553 /* Information about a thread waiting in txn_applied(). */ 554 typedef enum { 555 AWAIT_GEN, /* Client's gen is behind token gen. */ 556 AWAIT_HISTORY, /* Haven't received master's LSN db update. */ 557 AWAIT_LSN, /* Awaiting replication of user txn. */ 558 AWAIT_NIMDB, /* LSN db missing: maybe it's INMEM. */ 559 LOCKOUT /* Thread awoken due to pending lockout. */ 560 } rep_waitreason_t; 561 562 struct rep_waitgoal { 563 rep_waitreason_t why; 564 union { 565 DB_LSN lsn; /* For AWAIT_LSN and AWAIT_HISTORY. */ 566 u_int32_t gen; /* AWAIT_GEN */ 567 } u; 568 }; 569 570 struct __rep_waiter { 571 db_mutex_t mtx_repwait; /* Self-blocking mutex. */ 572 struct rep_waitgoal goal; 573 SH_TAILQ_ENTRY links; /* On either free or waiting list. */ 574 575 #define REP_F_PENDING_LOCKOUT 0x00000001 576 #define REP_F_WOKEN 0x00000002 577 u_int32_t flags; 578 }; 579 580 /* 581 * Macros to check and clear the BDB lockouts. Currently they are 582 * locked out/set individually because they pertain to different pieces of 583 * the BDB API, they are otherwise always checked and cleared together. 584 */ 585 #define ISSET_LOCKOUT_BDB(R) \ 586 (FLD_ISSET((R)->lockout_flags, (REP_LOCKOUT_API | REP_LOCKOUT_OP))) 587 588 #define CLR_LOCKOUT_BDB(R) \ 589 (FLD_CLR((R)->lockout_flags, (REP_LOCKOUT_API | REP_LOCKOUT_OP))) 590 591 /* 592 * Recovery flag mask to easily check any/all recovery bits. That is 593 * REP_LOCKOUT_{API|OP} and most REP_S_*. This must change if the values 594 * of the flags change. NOTE: We do not include REP_LOCKOUT_MSG in 595 * this mask because it is used frequently in non-recovery related 596 * areas and we want to manipulate it separately (see especially 597 * in __rep_new_master). 598 */ 599 #define CLR_RECOVERY_SETTINGS(R) \ 600 do { \ 601 (R)->sync_state = SYNC_OFF; \ 602 CLR_LOCKOUT_BDB(R); \ 603 } while (0) 604 605 #define IS_REP_RECOVERING(R) \ 606 ((R)->sync_state != SYNC_OFF || ISSET_LOCKOUT_BDB(R)) 607 608 /* 609 * REP_F_EPHASE0 is not a *real* election phase. It is used for 610 * master leases and allowing the client to find the master or 611 * expire its lease. 612 */ 613 #define IN_ELECTION(R) \ 614 FLD_ISSET((R)->elect_flags, REP_E_PHASE1 | REP_E_PHASE2) 615 #define IN_ELECTION_TALLY(R) \ 616 FLD_ISSET((R)->elect_flags, REP_E_PHASE1 | REP_E_PHASE2 | REP_E_TALLY) 617 #define ELECTION_MAJORITY(n) (((n) / 2) + 1) 618 619 #define IN_INTERNAL_INIT(R) \ 620 ((R)->sync_state == SYNC_LOG || (R)->sync_state == SYNC_PAGE) 621 622 #define IS_REP_MASTER(env) \ 623 (REP_ON(env) && \ 624 F_ISSET(((env)->rep_handle->region), REP_F_MASTER)) 625 626 #define IS_REP_CLIENT(env) \ 627 (REP_ON(env) && \ 628 F_ISSET(((env)->rep_handle->region), REP_F_CLIENT)) 629 630 #define IS_REP_STARTED(env) \ 631 (REP_ON(env) && \ 632 F_ISSET(((env)->rep_handle->region), REP_F_START_CALLED)) 633 634 #define IS_USING_LEASES(env) \ 635 (REP_ON(env) && \ 636 FLD_ISSET(((env)->rep_handle->region)->config, REP_C_LEASE)) 637 638 #define IS_CLIENT_PGRECOVER(env) \ 639 (IS_REP_CLIENT(env) && \ 640 (((env)->rep_handle->region)->sync_state == SYNC_PAGE)) 641 642 /* 643 * Macros to figure out if we need to do replication pre/post-amble processing. 644 * Skip for specific DB handles owned by the replication layer, either because 645 * replication is running recovery or because it's a handle entirely owned by 646 * the replication code (replication opens its own databases to track state). 647 */ 648 #define REP_FLAGS_SET(env) \ 649 ((env)->rep_handle->region->flags != 0 || \ 650 (env)->rep_handle->region->elect_flags != 0 || \ 651 (env)->rep_handle->region->lockout_flags != 0) 652 653 #define IS_ENV_REPLICATED(env) \ 654 (REP_ON(env) && REP_FLAGS_SET(env)) 655 656 /* 657 * Update the temporary log archive block timer. 658 */ 659 #define MASTER_UPDATE(env, renv) do { \ 660 REP_SYSTEM_LOCK(env); \ 661 F_SET((renv), DB_REGENV_REPLOCKED); \ 662 (void)time(&(renv)->op_timestamp); \ 663 REP_SYSTEM_UNLOCK(env); \ 664 } while (0) 665 666 /* 667 * Macro to set a new generation number. Cached values from the LSN history 668 * database are associated with the current gen, so when the gen changes we must 669 * invalidate the cache. Use this macro for all gen changes, to avoid 670 * forgetting to do so. This macro should be used while holding the rep system 671 * mutex (unless we know we're single-threaded for some other reason, like at 672 * region create time). 673 */ 674 #define SET_GEN(g) do { \ 675 rep->gen = (g); \ 676 ZERO_LSN(rep->gen_base_lsn); \ 677 } while (0) 678 679 /* Macros to determine current replication configuration options. */ 680 #define REP_CONFIG_IS_SET(env, flags) \ 681 (REP_ON(env) ? \ 682 FLD_ISSET(((env)->rep_handle->region)->config, flags) : \ 683 FLD_ISSET(((env)->rep_handle)->config, flags)) 684 #ifdef HAVE_REPLICATION_THREADS 685 #define PREFMAS_IS_SET(env) \ 686 (REP_CONFIG_IS_SET(env, \ 687 (REP_C_PREFMAS_MASTER | REP_C_PREFMAS_CLIENT))) 688 #else 689 #define PREFMAS_IS_SET(env) 0 690 #endif 691 #define IS_PREFMAS_MODE(env) \ 692 (REP_ON(env) && PREFMAS_IS_SET(env) && \ 693 ((env)->rep_handle->region)->config_nsites < 3) 694 695 /* 696 * Gap processing flags. These provide control over the basic 697 * gap processing algorithm for some special cases. 698 */ 699 #define REP_GAP_FORCE 0x001 /* Force a request for a gap. */ 700 #define REP_GAP_REREQUEST 0x002 /* Gap request is a forced rerequest. */ 701 /* REREQUEST is a superset of FORCE. */ 702 703 /* 704 * Internal options for rep_start_int(). These are used by preferred master 705 * mode to help coordinate between the sites during changes of master. 706 */ 707 #define REP_START_FORCE_ROLECHG 0x001 /* Force role change to advance gen. */ 708 #define REP_START_HOLD_CLIGEN 0x002 /* Hold client gen before doing 709 * lsnhist match. */ 710 #define REP_START_WAIT_LOCKMSG 0x004 /* Wait for REP_LOCKOUT_MSG. */ 711 712 /* 713 * Flags indicating what kind of record we want to back up to, in the log. 714 */ 715 #define REP_REC_COMMIT 0x001 /* Most recent commit record. */ 716 #define REP_REC_PERM 0x002 /* Most recent perm record. */ 717 /* PERM is a superset of COMMIT. */ 718 #define REP_REC_PERM_DEL 0x004 /* Most recent PERM, or fail if a 719 * file delete is found first. */ 720 721 /* 722 * Permanent record types. 723 */ 724 #define IS_PERM_RECTYPE(rectype) \ 725 ((rectype) == DB___txn_regop || (rectype) == DB___txn_ckp) 726 727 /* 728 * Basic pre/post-amble processing. 729 */ 730 #define REPLICATION_WRAP(env, func_call, checklock, ret) do { \ 731 int __rep_check, __t_ret; \ 732 __rep_check = IS_ENV_REPLICATED(env) ? 1 : 0; \ 733 (ret) = __rep_check ? __env_rep_enter(env, checklock) : 0; \ 734 if ((ret) == 0) { \ 735 (ret) = func_call; \ 736 if (__rep_check && (__t_ret = \ 737 __env_db_rep_exit(env)) != 0 && (ret) == 0) \ 738 (ret) = __t_ret; \ 739 } \ 740 } while (0) 741 742 /* 743 * Macro to safely access curinfo and its internal DBT pointers from 744 * any process. This should always be used to access curinfo. If 745 * the internal DBT pointers are to be used, mtx_clientdb must be held 746 * between the time of this call and the use of the pointers. 747 * 748 * The current file information (curinfo) is stored in shared region 749 * memory and accessed via an offset. It contains DBTs that themselves 750 * point to allocated data. __rep_nextfile() manages this information in a 751 * single chunk of shared memory. 752 * 753 * If different processes access curinfo, they may have different shared 754 * region addresses. This means that curinfo and its pointers to DBT data 755 * must be recalculated for each process starting with the offset. 756 */ 757 #define GET_CURINFO(rep, infop, curinfo) \ 758 do { \ 759 curinfo = R_ADDR(infop, rep->curinfo_off); \ 760 if ((curinfo)->uid.size > 0) \ 761 (curinfo)->uid.data = R_ADDR(infop, \ 762 rep->curinfo_off + sizeof(__rep_fileinfo_args)); \ 763 else \ 764 (curinfo)->uid.data = NULL; \ 765 if ((curinfo)->info.size > 0) \ 766 (curinfo)->info.data = R_ADDR(infop, rep->curinfo_off + \ 767 sizeof(__rep_fileinfo_args) + (curinfo)->uid.size); \ 768 else \ 769 (curinfo)->info.data = NULL; \ 770 if ((curinfo)->dir.size > 0) \ 771 (curinfo)->dir.data = R_ADDR(infop, rep->curinfo_off + \ 772 sizeof(__rep_fileinfo_args) + (curinfo)->uid.size + \ 773 (curinfo)->info.size); \ 774 else \ 775 (curinfo)->dir.data = NULL; \ 776 } while (0) 777 778 #if defined(HAVE_REPMGR_SSL) 779 780 typedef struct __repmgr_ssl_config { 781 char *repmgr_ca_cert_file; 782 char *repmgr_ca_dir; 783 char *repmgr_cert_file; 784 char *repmgr_key_file; 785 char *repmgr_key_file_passwd; 786 u_int repmgr_ssl_verify_depth; 787 } REPMGR_SSL_CONFIG; 788 789 #endif 790 791 /* 792 * Per-process replication structure. 793 * 794 * There are 2 mutexes used in the Base replication API. (See LOCK_MUTEX in 795 * repmgr.h for a discussion of repmgr.) 796 * 1. mtx_region - This protects the fields of the rep region above. 797 * 2. mtx_clientdb - This protects the per-process flags, and bookkeeping 798 * database and all of the components that maintain it. Those 799 * components include the following fields in the log region (see log.h): 800 * a. ready_lsn 801 * b. waiting_lsn 802 * c. verify_lsn 803 * d. wait_recs 804 * e. rcvd_recs 805 * f. max_wait_lsn 806 * These fields in the log region are NOT protected by the log region lock at 807 * all. 808 * 809 * Note that the per-process flags should truly be protected by a special 810 * per-process thread mutex, but it is currently set in so isolated a manner 811 * that it didn't make sense to do so and in most case we're already holding 812 * the mtx_clientdb anyway. 813 * 814 * The lock ordering protocol is that mtx_clientdb must be acquired first and 815 * then either REP->mtx_region, or the LOG->mtx_region mutex may be acquired if 816 * necessary. 817 * 818 * Note that the appropriate mutex is needed any time one or more related 819 * values are read or written that could possibly use more than one atomic 820 * machine instruction. A single 32-bit integer value is safe without a 821 * mutex, but most other types of value should use a mutex. 822 * 823 * Use of a db_mutex_t mutex must be inside a matched pair of ENV_ENTER() and 824 * ENV_LEAVE() macros. This ensures that if a thread dies while holding 825 * a lock (i.e. a mutex), recovery can clean it up so that it does not 826 * indefinitely block other threads. 827 */ 828 struct __db_rep { 829 /* 830 * Shared configuration information -- copied to and maintained in the 831 * shared region as soon as the shared region is created. 832 */ 833 int eid; /* Environment ID. */ 834 835 u_int32_t gbytes; /* Limit on data sent in single... */ 836 u_int32_t bytes; /* __rep_process_message call. */ 837 838 db_timespec request_gap; /* Minimum time to wait before we 839 * request a missing log record. */ 840 db_timespec max_gap; /* Maximum time to wait before 841 * requesting a missing log record. */ 842 843 u_int32_t clock_skew; /* Clock skew factor. */ 844 u_int32_t clock_base; /* Clock skew base. */ 845 u_int32_t config; /* Configuration flags. */ 846 u_int32_t config_nsites; 847 848 db_timeout_t elect_timeout; /* Normal/full election timeouts. */ 849 db_timeout_t full_elect_timeout; 850 851 db_timeout_t chkpt_delay; /* Master checkpoint delay. */ 852 853 u_int32_t my_priority; 854 db_timeout_t lease_timeout; /* Master leases. */ 855 /* 856 * End of shared configuration information. 857 */ 858 int (*partial) /* View/partial replication function. */ 859 __P((DB_ENV *, const char *, int *, u_int32_t)); 860 861 int (*send) /* Send function. */ 862 __P((DB_ENV *, const DBT *, const DBT *, 863 const DB_LSN *, int, u_int32_t)); 864 865 DB *rep_db; /* Bookkeeping database. */ 866 DB *lsn_db; /* (Replicated) LSN history database. */ 867 db_mutex_t mtx_lsnhist; /* LSN history database mutex. */ 868 869 REP *region; /* In memory structure. */ 870 u_int8_t *bulk; /* Shared memory bulk area. */ 871 872 #define DBREP_DIAG_FILES 2 873 DB_FH *diagfile[DBREP_DIAG_FILES]; /* Diag files fhp. */ 874 off_t diag_off; /* Current diag file offset. */ 875 876 /* These are protected by mtx_clientdb. */ 877 DB_MPOOLFILE *file_mpf; /* Mpoolfile for current database. */ 878 DB *file_dbp; /* This file's page info. */ 879 DBC *queue_dbc; /* Dbc for a queue file. */ 880 DB *blob_dbp; /* Blob file database. */ 881 882 /* 883 * Please change __rep_print_all (rep_stat.c) to track any changes made 884 * to these flags. 885 */ 886 #define DBREP_APP_BASEAPI 0x0001 /* Base API application. */ 887 #define DBREP_APP_REPMGR 0x0002 /* repmgr application. */ 888 #define DBREP_OPENFILES 0x0004 /* This handle has opened files. */ 889 u_int32_t flags; /* per-process flags. */ 890 891 #ifdef HAVE_REPLICATION_THREADS 892 /* 893 * Replication Framework (repmgr) per-process information. 894 */ 895 int config_nthreads;/* Configured msg processing threads. */ 896 u_int nthreads; /* Msg processing threads. */ 897 u_int athreads; /* Space allocated for msg threads. */ 898 u_int non_rep_th; /* Threads in GMDB or channel msgs. */ 899 u_int aelect_threads; /* Space allocated for elect threads. */ 900 u_int32_t init_policy; 901 int perm_policy; 902 DB_LSN perm_lsn; /* Last perm LSN we've announced. */ 903 db_timeout_t ack_timeout; 904 db_timeout_t election_retry_wait; 905 db_timeout_t connection_retry_wait; 906 db_timeout_t heartbeat_frequency; /* Max period between msgs. */ 907 db_timeout_t heartbeat_monitor_timeout; 908 db_timeout_t write_forward_timeout; 909 u_int32_t inqueue_max_gbytes; 910 u_int32_t inqueue_max_bytes; 911 912 /* Thread synchronization. */ 913 REPMGR_RUNNABLE *selector, **messengers, **elect_threads; 914 REPMGR_RUNNABLE *preferred_elect_thr; 915 REPMGR_RUNNABLE *takeover_thread; 916 db_timespec repstart_time; 917 mgr_mutex_t *mutex; 918 cond_var_t check_election, gmdb_idle, msg_avail; 919 waiter_t ack_waiters; /* For threads awaiting PERM acks. */ 920 #ifdef DB_WIN32 921 HANDLE signaler; 922 #else 923 int read_pipe, write_pipe; 924 #endif 925 926 /* Operational stuff. */ 927 REPMGR_SITE *sites; /* Array of known sites. */ 928 u_int site_cnt; /* Array slots in use. */ 929 u_int site_max; /* Total array slots allocated. */ 930 int self_eid; /* Where to find the local site. */ 931 u_int siteinfo_seq; /* Last known update to this list. */ 932 933 /* 934 * The connections list contains only those connections not actively 935 * associated with a known site (see repmgr.h). 936 */ 937 CONNECTION_LIST connections; 938 RETRY_Q_HEADER retries; /* Sites needing connection retry. */ 939 struct { 940 u_int32_t gbytes; 941 u_int32_t bytes; 942 STAILQ_HEAD(__repmgr_q_header, __repmgr_message) header; 943 } input_queue; 944 945 socket_t listen_fd; 946 db_timespec last_bcast; /* Time of last broadcast msg. */ 947 db_timespec last_hbeat; /* Time of last heartbeat (prefmas). */ 948 db_timespec l_listener_chk; /* Time to check local listener. */ 949 db_timeout_t l_listener_wait;/* Timeout to check local listener. */ 950 db_timespec m_listener_chk; /* Time to check master listener. */ 951 db_timeout_t m_listener_wait;/* Timeout to check master listener. */ 952 953 #if defined(HAVE_REPMGR_SSL) 954 REPMGR_SSL_CONFIG repmgr_ssl_conf; 955 SSL_CTX *repmgr_ssl_ctx; 956 #endif 957 958 /* 959 * Status of repmgr. It is ready when repmgr is not yet started. It 960 * is running after repmgr is (re)started. It is stopped if the env 961 * of the running repmgr is closed, or the site is removed. 962 */ 963 enum { ready, running, stopped } repmgr_status; 964 int new_connection; /* Since last master seek attempt. */ 965 int demotion_pending; /* We're being demoted to a view. */ 966 int takeover_pending; /* We've been elected master. */ 967 int rejoin_pending; /* Join group retry after rejection. */ 968 int gmdb_busy; 969 int client_intent; /* Will relinquish master role. */ 970 int gmdb_dirty; 971 int have_gmdb; 972 int seen_repmsg; 973 int view_mismatch; /* View callback and gmdb don't match. */ 974 int abbrev_init; /* For mixed version gmdb upgrade. */ 975 976 /* 977 * Flag to show what kind of transaction is currently in progress. 978 * Primary means we're doing the first (critical) phase of a membership 979 * DB update, where we care about perm failures. In the secondary phase 980 * we don't care. Usually the value is "none", when normal user 981 * transactions are happening. We need to use this global flag because 982 * we don't have a more proper direct channel to communicate information 983 * between the originator of a transaction and the replication send() 984 * function that has to wait for acks and decide what to do about them. 985 */ 986 enum { none, gmdb_primary, gmdb_secondary } active_gmdb_update; 987 int limbo_resolution_needed; 988 989 /* 990 * GMDB update sequence count. On creation we write version 1; so, once 991 * repmgr has started and tried to read, a 0 here can be taken to mean 992 * that the DB doesn't exist yet. 993 */ 994 u_int32_t membership_version; 995 u_int32_t member_version_gen; 996 997 /* LSN of GMDB txn that got a perm failure. */ 998 DB_LSN limbo_failure; 999 /* EID whose membership status is therefore unresolved */ 1000 int limbo_victim; 1001 /* LSN of a later txn that achieves perm success. */ 1002 DB_LSN durable_lsn; 1003 DB *gmdb; /* Membership database handle. */ 1004 /* 1005 * Membership list restored from init file after crash 1006 * during internal init. 1007 */ 1008 u_int8_t *restored_list; 1009 size_t restored_list_length; 1010 1011 /* 1012 * Preferred master mode indicator for a pending action. A 1013 * master_switch is initiated when the preferred master site is 1014 * ready to take over as master. A start_temp_master is initiated 1015 * when the client site needs to start as the temporary master. 1016 */ 1017 enum { no_action, master_switch, start_temp_master } prefmas_pending; 1018 /* The LSN at the very beginning of preferred master site startup. */ 1019 DB_LSN prefmas_init_lsn; 1020 1021 /* Application's message dispatch call-back function. */ 1022 void (*msg_dispatch) __P((DB_ENV *, DB_CHANNEL *, 1023 DBT *, u_int32_t, u_int32_t)); 1024 1025 /* Socket approval callback function. */ 1026 int (*approval) __P((DB_ENV *, DB_REPMGR_SOCKET, int *, u_int32_t)); 1027 #endif /* HAVE_REPLICATION_THREADS */ 1028 }; 1029 1030 /* 1031 * Determine whether application is repmgr or base replication API. If 1032 * repmgr was configured, base the test on internal replication flags for 1033 * APP_REPMGR and APP_BASEAPI. These flags get set by the appropriate parts 1034 * of the various replication APIs. 1035 */ 1036 #ifdef HAVE_REPLICATION_THREADS 1037 /* 1038 * Application type is set to be repmgr when: 1039 * 1. A local site is defined. 1040 * 2. A remote site is defined. 1041 * 3. An acknowledgement policy is configured. 1042 * 4. A repmgr flag is configured. 1043 * 5. A timeout value is configured for one of the repmgr timeouts. 1044 */ 1045 #define APP_IS_REPMGR(env) \ 1046 (REP_ON(env) ? \ 1047 F_ISSET((env)->rep_handle->region, REP_F_APP_REPMGR) : \ 1048 F_ISSET((env)->rep_handle, DBREP_APP_REPMGR)) 1049 1050 /* 1051 * Application type is set to be base replication API when: 1052 * 1. Transport send function is defined and is not the repmgr send 1053 * function. 1054 */ 1055 #define APP_IS_BASEAPI(env) \ 1056 (REP_ON(env) ? \ 1057 F_ISSET((env)->rep_handle->region, REP_F_APP_BASEAPI) : \ 1058 F_ISSET((env)->rep_handle, DBREP_APP_BASEAPI)) 1059 1060 /* 1061 * Set application type. These macros do extra checking to guarantee that 1062 * only one application type is ever set. 1063 */ 1064 #define APP_SET_REPMGR(env) do { \ 1065 if (REP_ON(env)) { \ 1066 ENV_ENTER(env, ip); \ 1067 REP_SYSTEM_LOCK(env); \ 1068 if (!F_ISSET((env)->rep_handle->region, \ 1069 REP_F_APP_BASEAPI)) \ 1070 F_SET((env)->rep_handle->region, \ 1071 REP_F_APP_REPMGR); \ 1072 REP_SYSTEM_UNLOCK(env); \ 1073 ENV_LEAVE(env, ip); \ 1074 } else if (!F_ISSET((env)->rep_handle, DBREP_APP_BASEAPI)) \ 1075 F_SET((env)->rep_handle, DBREP_APP_REPMGR); \ 1076 } while (0) 1077 #define APP_SET_BASEAPI(env) do { \ 1078 if (REP_ON(env)) { \ 1079 ENV_ENTER(env, ip); \ 1080 REP_SYSTEM_LOCK(env); \ 1081 if (!F_ISSET((env)->rep_handle->region, \ 1082 REP_F_APP_REPMGR)) \ 1083 F_SET((env)->rep_handle->region, \ 1084 REP_F_APP_BASEAPI); \ 1085 REP_SYSTEM_UNLOCK(env); \ 1086 ENV_LEAVE(env, ip); \ 1087 } else if (!F_ISSET((env)->rep_handle, DBREP_APP_REPMGR)) \ 1088 F_SET((env)->rep_handle, DBREP_APP_BASEAPI); \ 1089 } while (0) 1090 #define ADJUST_AUTOTAKEOVER_WAITS(db_rep, timeout) do { \ 1091 (db_rep)->l_listener_wait = timeout; \ 1092 (db_rep)->m_listener_wait = 3 * timeout; \ 1093 } while (0) 1094 1095 #else 1096 /* 1097 * We did not configure repmgr, application must be base replication API. 1098 * The APP_SET_* macros are noops in this case, but they must be defined 1099 * with a null body to avoid compiler warnings on some platforms. 1100 */ 1101 #define APP_IS_REPMGR(env) 0 1102 #define APP_SET_REPMGR(env) do { \ 1103 ; \ 1104 } while (0) 1105 #define APP_IS_BASEAPI(env) 1 1106 #define APP_SET_BASEAPI(env) do { \ 1107 ; \ 1108 } while (0) 1109 #define ADJUST_AUTOTAKEOVER_WAITS(db_rep, timeout) do { \ 1110 ; \ 1111 } while (0) 1112 #endif /* HAVE_REPLICATION_THREADS */ 1113 1114 /* 1115 * Control structure flags for replication communication infrastructure. 1116 */ 1117 /* 1118 * Add new REPCTL flags to the end of this list to preserve compatibility 1119 * with old versions. 1120 */ 1121 #define REPCTL_ELECTABLE 0x001 /* Upgraded client is electable. */ 1122 #define REPCTL_FLUSH 0x002 /* Record should be flushed. */ 1123 #define REPCTL_GROUP_ESTD 0x004 /* Message from site in a group. */ 1124 #define REPCTL_INIT 0x008 /* Internal init message. */ 1125 #define REPCTL_LEASE 0x010 /* Lease related message. */ 1126 #define REPCTL_PERM 0x020 /* Permanent message (e.g. commit). */ 1127 #define REPCTL_RESEND 0x040 /* Rerequested message. */ 1128 #define REPCTL_LOG_END 0x080 /* Approximate end of group-wide log. */ 1129 #define REPCTL_INMEM_ONLY 0x100 /* In-memory databases only. */ 1130 #define REPCTL_ENCRYPTED 0x200 /* Master is using encryption. */ 1131 1132 /* 1133 * File info flags for internal init. The per-database (i.e., file) flag 1134 * represents the on-disk format of the file, and is conveyed from the master to 1135 * the initializing client in the UPDATE message, so that the client can know 1136 * how to create the file. The per-page flag is conveyed along with each PAGE 1137 * message, describing the format of the page image being transmitted; it is of 1138 * course set by the site serving the PAGE_REQ. The serving site gets the page 1139 * image from its own mpool, and thus the page is in the native format of the 1140 * serving site. This format may be different (i.e., opposite) from the on-disk 1141 * format, and in fact can vary per-page, since with client-to-client sync it is 1142 * possible for various different sites to serve the various PAGE_REQ requests. 1143 */ 1144 #define REPINFO_DB_LITTLEENDIAN 0x0001 /* File is little-endian lorder. */ 1145 #define REPINFO_PG_LITTLEENDIAN 0x0002 /* Page is little-endian lorder. */ 1146 1147 #define LEASE_REFRESH_MIN 30 /* Minimum number of refresh retries. */ 1148 #define LEASE_REFRESH_USEC 50000 /* Microseconds between refresh tries. */ 1149 1150 /* Master granted lease information. */ 1151 typedef struct __rep_lease_entry { 1152 int eid; /* EID of client grantor. */ 1153 db_timespec start_time; /* Start time clients echo back. */ 1154 db_timespec end_time; /* Master lease expiration time. */ 1155 DB_LSN lease_lsn; /* Durable LSN lease applies to. */ 1156 } REP_LEASE_ENTRY; 1157 1158 typedef struct { 1159 u_int32_t egen; /* Voter's election generation. */ 1160 int eid; /* Voter's ID. */ 1161 } REP_VTALLY; 1162 1163 /* 1164 * The REP_THROTTLE_ONLY flag is used to do throttle processing only. 1165 * If set, it will only allow sending the REP_*_MORE message, but not 1166 * the normal, non-throttled message. It is used to support throttling 1167 * with bulk transfer. 1168 */ 1169 /* Flags for __rep_send_throttle. */ 1170 #define REP_THROTTLE_ONLY 0x0001 /* Send _MORE message only. */ 1171 1172 /* Throttled message processing information. */ 1173 typedef struct { 1174 DB_LSN lsn; /* LSN of this record. */ 1175 DBT *data_dbt; /* DBT of this record. */ 1176 u_int32_t gbytes; /* This call's max gbytes sent. */ 1177 u_int32_t bytes; /* This call's max bytes sent. */ 1178 u_int32_t type; /* Record type. */ 1179 } REP_THROTTLE; 1180 1181 /* Bulk processing information. */ 1182 /* 1183 * !!! 1184 * We use a roff_t for the offset. We'd really like to use a ptrdiff_t 1185 * since that really is what it is. But ptrdiff_t is not portable and 1186 * doesn't exist everywhere. 1187 */ 1188 typedef struct { 1189 u_int8_t *addr; /* Address of bulk buffer. */ 1190 roff_t *offp; /* Ptr to current offset into buffer. */ 1191 u_int32_t len; /* Bulk buffer length. */ 1192 u_int32_t type; /* Item type in buffer (log, page). */ 1193 DB_LSN lsn; /* First LSN in buffer. */ 1194 int eid; /* ID of potential recipients. */ 1195 #define BULK_XMIT 0x001 /* Buffer in transit. */ 1196 u_int32_t *flagsp; /* Buffer flags. */ 1197 } REP_BULK; 1198 1199 /* 1200 * This structure takes care of representing a transaction. 1201 * It holds all the records, sorted by page number so that 1202 * we can obtain locks and apply updates in a deadlock free 1203 * order. 1204 */ 1205 typedef struct { 1206 u_int nlsns; 1207 u_int nalloc; 1208 DB_LSN *array; 1209 } LSN_COLLECTION; 1210 1211 /* 1212 * This is used by the page-prep routines to do the lock_vec call to 1213 * apply the updates for a single transaction or a collection of 1214 * transactions. 1215 */ 1216 typedef struct { 1217 int n; 1218 DB_LOCKREQ *reqs; 1219 DBT *objs; 1220 } linfo_t; 1221 1222 /* 1223 * Used to store information on the child transaction that opens a blob meta 1224 * database. In partial replication processing the child transaction of the 1225 * blob meta database must be delayed until after processing the child 1226 * transaction that opens the database that owns the BMD. 1227 */ 1228 typedef struct { 1229 db_seq_t blob_file_id; 1230 DB_LSN lsn; 1231 u_int32_t child; 1232 void *next; 1233 void *prev; 1234 } DELAYED_BLOB_LIST; 1235 1236 #if defined(__cplusplus) 1237 } 1238 #endif 1239 1240 #include "dbinc_auto/rep_ext.h" 1241 #endif /* !_DB_REP_H_ */ 1242