1 2 /** 3 * Copyright (C) 2018-present MongoDB, Inc. 4 * 5 * This program is free software: you can redistribute it and/or modify 6 * it under the terms of the Server Side Public License, version 1, 7 * as published by MongoDB, Inc. 8 * 9 * This program is distributed in the hope that it will be useful, 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 * Server Side Public License for more details. 13 * 14 * You should have received a copy of the Server Side Public License 15 * along with this program. If not, see 16 * <http://www.mongodb.com/licensing/server-side-public-license>. 17 * 18 * As a special exception, the copyright holders give permission to link the 19 * code of portions of this program with the OpenSSL library under certain 20 * conditions as described in each individual source file and distribute 21 * linked combinations including the program with the OpenSSL library. You 22 * must comply with the Server Side Public License in all respects for 23 * all of the code used other than as permitted herein. If you modify file(s) 24 * with this exception, you may extend this exception to your version of the 25 * file(s), but you are not obligated to do so. If you do not wish to do so, 26 * delete this exception statement from your version. If you delete this 27 * exception statement from all source files in the program, then also delete 28 * it in the license file. 
29 */ 30 31 #pragma once 32 33 #include "mongo/db/repl/replication_coordinator_fwd.h" 34 35 #include <vector> 36 37 #include "mongo/base/disallow_copying.h" 38 #include "mongo/base/status.h" 39 #include "mongo/base/status_with.h" 40 #include "mongo/bson/timestamp.h" 41 #include "mongo/db/repl/member_state.h" 42 #include "mongo/db/repl/repl_settings.h" 43 #include "mongo/db/repl/split_horizon.h" 44 #include "mongo/db/repl/sync_source_selector.h" 45 #include "mongo/util/net/hostandport.h" 46 #include "mongo/util/time_support.h" 47 48 namespace mongo { 49 50 class BSONObj; 51 class BSONObjBuilder; 52 class IndexDescriptor; 53 class NamespaceString; 54 class OperationContext; 55 class ServiceContext; 56 class Timestamp; 57 struct WriteConcernOptions; 58 59 namespace executor { 60 struct ConnectionPoolStats; 61 } // namespace executor 62 63 namespace rpc { 64 65 class OplogQueryMetadata; 66 class ReplSetMetadata; 67 68 } // namespace rpc 69 70 namespace repl { 71 72 class BackgroundSync; 73 class HandshakeArgs; 74 class IsMasterResponse; 75 class OplogReader; 76 class OpTime; 77 class ReadConcernArgs; 78 class ReplSetConfig; 79 class ReplSetHeartbeatArgs; 80 class ReplSetHeartbeatArgsV1; 81 class ReplSetHeartbeatResponse; 82 class ReplSetHtmlSummary; 83 class ReplSetRequestVotesArgs; 84 class ReplSetRequestVotesResponse; 85 class UpdatePositionArgs; 86 87 /** 88 * Global variable that contains a std::string telling why master/slave halted 89 * 90 * "dead" means something really bad happened like replication falling completely out of sync. 91 * when non-null, we are dead and the string is informational 92 * 93 * TODO(dannenberg) remove when master slave goes 94 */ 95 extern const char* replAllDead; 96 97 /** 98 * The ReplicationCoordinator is responsible for coordinating the interaction of replication 99 * with the rest of the system. The public methods on ReplicationCoordinator are the public 100 * API that the replication subsystem presents to the rest of the codebase. 
101 */ 102 class ReplicationCoordinator : public SyncSourceSelector { 103 MONGO_DISALLOW_COPYING(ReplicationCoordinator); 104 105 public: 106 static ReplicationCoordinator* get(ServiceContext* service); 107 static ReplicationCoordinator* get(ServiceContext& service); 108 static ReplicationCoordinator* get(OperationContext* ctx); 109 110 static void set(ServiceContext* service, 111 std::unique_ptr<ReplicationCoordinator> replCoordinator); 112 113 struct StatusAndDuration { 114 public: 115 Status status; 116 Milliseconds duration; 117 118 StatusAndDuration(const Status& stat, Milliseconds ms) : status(stat), duration(ms) {} 119 }; 120 121 virtual ~ReplicationCoordinator(); 122 123 /** 124 * Does any initial bookkeeping needed to start replication, and instructs the other 125 * components of the replication system to start up whatever threads and do whatever 126 * initialization they need. 127 */ 128 virtual void startup(OperationContext* opCtx) = 0; 129 130 /** 131 * Start terminal shutdown. This causes the topology coordinator to refuse to vote in any 132 * further elections. This should only be called from global shutdown after we've passed the 133 * point of no return. 134 * 135 * This should be called once we are sure to call shutdown(). 136 */ 137 virtual void enterTerminalShutdown() = 0; 138 139 /** 140 * Does whatever cleanup is required to stop replication, including instructing the other 141 * components of the replication system to shut down and stop any threads they are using, 142 * blocking until all replication-related shutdown tasks are complete. 143 */ 144 virtual void shutdown(OperationContext* opCtx) = 0; 145 146 /** 147 * Returns a reference to the parsed command line arguments that are related to replication. 
148 */ 149 virtual const ReplSettings& getSettings() const = 0; 150 151 enum Mode { modeNone = 0, modeReplSet, modeMasterSlave }; 152 153 /** 154 * Returns a value indicating whether this node was configured at start-up to run 155 * standalone, as part of a master-slave pair, or as a member of a replica set. 156 */ 157 virtual Mode getReplicationMode() const = 0; 158 159 /** 160 * Returns true if this node is configured to be a member of a replica set or master/slave 161 * setup. 162 */ 163 virtual bool isReplEnabled() const = 0; 164 165 /** 166 * Returns the current replica set state of this node (PRIMARY, SECONDARY, STARTUP, etc). 167 * It is invalid to call this unless getReplicationMode() == modeReplSet. 168 */ 169 virtual MemberState getMemberState() const = 0; 170 171 /** 172 * Waits for 'timeout' ms for member state to become 'state'. 173 * Returns OK if member state is 'state'. 174 * Returns ErrorCodes::ExceededTimeLimit if we timed out waiting for the state change. 175 * Returns ErrorCodes::BadValue if timeout is negative. 176 */ 177 virtual Status waitForMemberState(MemberState expectedState, Milliseconds timeout) = 0; 178 179 /** 180 * Returns true if this node is in state PRIMARY or SECONDARY. 181 * 182 * It is invalid to call this unless getReplicationMode() == modeReplSet. 183 * 184 * This method may be optimized to reduce synchronization overhead compared to 185 * reading the current member state with getMemberState(). 186 */ 187 virtual bool isInPrimaryOrSecondaryState() const = 0; 188 189 190 /** 191 * Returns how slave delayed this node is configured to be, or 0 seconds if this node is not a 192 * member of the current replica set configuration. 193 */ 194 virtual Seconds getSlaveDelaySecs() const = 0; 195 196 /** 197 * Blocks the calling thread for up to writeConcern.wTimeout millis, or until "opTime" has 198 * been replicated to at least a set of nodes that satisfies the writeConcern, whichever 199 * comes first. 
A writeConcern.wTimeout of 0 indicates no timeout (block forever) and a 200 * writeConcern.wTimeout of -1 indicates return immediately after checking. Return codes: 201 * ErrorCodes::WriteConcernFailed if the writeConcern.wTimeout is reached before 202 * the data has been sufficiently replicated 203 * ErrorCodes::ExceededTimeLimit if the opCtx->getMaxTimeMicrosRemaining is reached before 204 * the data has been sufficiently replicated 205 * ErrorCodes::NotMaster if the node is not Primary/Master 206 * ErrorCodes::UnknownReplWriteConcern if the writeConcern.wMode contains a write concern 207 * mode that is not known 208 * ErrorCodes::ShutdownInProgress if we are mid-shutdown 209 * ErrorCodes::Interrupted if the operation was killed with killop() 210 */ 211 virtual StatusAndDuration awaitReplication(OperationContext* opCtx, 212 const OpTime& opTime, 213 const WriteConcernOptions& writeConcern) = 0; 214 215 /** 216 * Like awaitReplication(), above, but waits for the replication of the last operation 217 * performed on the client associated with "opCtx". 218 */ 219 virtual StatusAndDuration awaitReplicationOfLastOpForClient( 220 OperationContext* opCtx, const WriteConcernOptions& writeConcern) = 0; 221 222 /** 223 * Causes this node to relinquish being primary for at least 'stepdownTime'. If 'force' is 224 * false, before doing so it will wait for 'waitTime' for one other node to be within 10 225 * seconds of this node's optime before stepping down. Returns a Status with the code 226 * ErrorCodes::ExceededTimeLimit if no secondary catches up within waitTime, 227 * ErrorCodes::NotMaster if you are no longer primary when trying to step down, 228 * ErrorCodes::SecondaryAheadOfPrimary if we are primary but there is another node that 229 * seems to be ahead of us in replication, and Status::OK otherwise. 
230 */ 231 virtual Status stepDown(OperationContext* opCtx, 232 bool force, 233 const Milliseconds& waitTime, 234 const Milliseconds& stepdownTime) = 0; 235 236 /** 237 * Returns true if the node can be considered master for the purpose of introspective 238 * commands such as isMaster() and rs.status(). 239 */ 240 virtual bool isMasterForReportingPurposes() = 0; 241 242 /** 243 * Returns true if it is valid for this node to accept writes on the given database. 244 * Currently this is true only if this node is Primary, master in master/slave, 245 * a standalone, or is writing to the local database. 246 * 247 * If a node was started with the replSet argument, but has not yet received a config, it 248 * will not be able to receive writes to a database other than local (it will not be 249 * treated as standalone node). 250 * 251 * NOTE: This function can only be meaningfully called while the caller holds the global 252 * lock in some mode other than MODE_NONE. 253 */ 254 virtual bool canAcceptWritesForDatabase(OperationContext* opCtx, StringData dbName) = 0; 255 256 /** 257 * Version which does not check for the global lock. Do not use in new code. 258 * Without the global lock held, the return value may be inaccurate by the time 259 * the function returns. 260 */ 261 virtual bool canAcceptWritesForDatabase_UNSAFE(OperationContext* opCtx, StringData dbName) = 0; 262 263 /** 264 * Returns true if it is valid for this node to accept writes on the given namespace. 265 * 266 * The result of this function should be consistent with canAcceptWritesForDatabase() 267 * for the database the namespace refers to, with additional checks on the collection. 268 */ 269 virtual bool canAcceptWritesFor(OperationContext* opCtx, const NamespaceString& ns) = 0; 270 271 /** 272 * Version which does not check for the global lock. Do not use in new code. 273 * Without the global lock held, the return value may be inaccurate by the time 274 * the function returns. 
275 */ 276 virtual bool canAcceptWritesFor_UNSAFE(OperationContext* opCtx, const NamespaceString& ns) = 0; 277 278 /** 279 * Checks if the current replica set configuration can satisfy the given write concern. 280 * 281 * Things that are taken into consideration include: 282 * 1. If the set has enough data-bearing members. 283 * 2. If the write concern mode exists. 284 * 3. If there are enough members for the write concern mode specified. 285 */ 286 virtual Status checkIfWriteConcernCanBeSatisfied( 287 const WriteConcernOptions& writeConcern) const = 0; 288 289 /** 290 * Returns Status::OK() if it is valid for this node to serve reads on the given collection 291 * and an errorcode indicating why the node cannot if it cannot. 292 */ 293 virtual Status checkCanServeReadsFor(OperationContext* opCtx, 294 const NamespaceString& ns, 295 bool slaveOk) = 0; 296 297 /** 298 * Version which does not check for the global lock. Do not use in new code. 299 * Without the global lock held, the return value may be inaccurate by the time 300 * the function returns. 301 */ 302 virtual Status checkCanServeReadsFor_UNSAFE(OperationContext* opCtx, 303 const NamespaceString& ns, 304 bool slaveOk) = 0; 305 306 /** 307 * Returns true if this node should ignore index constraints for idempotency reasons. 308 * 309 * The namespace "ns" is passed in because the "local" database is usually writable 310 * and we need to enforce the constraints for it. 311 */ 312 virtual bool shouldRelaxIndexConstraints(OperationContext* opCtx, 313 const NamespaceString& ns) = 0; 314 315 /** 316 * Updates our internal tracking of the last OpTime applied for the given slave 317 * identified by "rid". Only valid to call in master/slave mode 318 */ 319 virtual Status setLastOptimeForSlave(const OID& rid, const Timestamp& ts) = 0; 320 321 /** 322 * Updates our internal tracking of the last OpTime applied to this node. 
323 * 324 * The new value of "opTime" must be no less than any prior value passed to this method, and 325 * it is the caller's job to properly synchronize this behavior. The exception to this rule 326 * is that after calls to resetLastOpTimesFromOplog(), the minimum acceptable value for 327 * "opTime" is reset based on the contents of the oplog, and may go backwards due to 328 * rollback. Additionally, the optime given MUST represent a consistent database state. 329 */ 330 virtual void setMyLastAppliedOpTime(const OpTime& opTime) = 0; 331 332 /** 333 * Updates our internal tracking of the last OpTime durable to this node. 334 * 335 * The new value of "opTime" must be no less than any prior value passed to this method, and 336 * it is the caller's job to properly synchronize this behavior. The exception to this rule 337 * is that after calls to resetLastOpTimesFromOplog(), the minimum acceptable value for 338 * "opTime" is reset based on the contents of the oplog, and may go backwards due to 339 * rollback. 340 */ 341 virtual void setMyLastDurableOpTime(const OpTime& opTime) = 0; 342 343 /** 344 * This type is used to represent the "consistency" of a current database state. In 345 * replication, there may be times when our database data is not represented by a single optime, 346 * because we have fetched remote data from different points in time. For example, when we are 347 * in RECOVERING following a refetch based rollback. We never allow external clients to read 348 * from the database if it is not consistent. 349 */ 350 enum class DataConsistency { Consistent, Inconsistent }; 351 352 /** 353 * Updates our internal tracking of the last OpTime applied to this node, but only 354 * if the supplied optime is later than the current last OpTime known to the replication 355 * coordinator. The 'consistency' argument must tell whether or not the optime argument 356 * represents a consistent database state. 
357 * 358 * This function is used by logOp() on a primary, since the ops in the oplog do not 359 * necessarily commit in sequential order. It is also used when we finish oplog batch 360 * application on secondaries, to avoid any potential race conditions around setting the 361 * applied optime from more than one thread. 362 */ 363 virtual void setMyLastAppliedOpTimeForward(const OpTime& opTime, 364 DataConsistency consistency) = 0; 365 366 /** 367 * Updates our internal tracking of the last OpTime durable to this node, but only 368 * if the supplied optime is later than the current last OpTime known to the replication 369 * coordinator. 370 * 371 * This function is used by logOp() on a primary, since the ops in the oplog do not 372 * necessarily commit in sequential order. 373 */ 374 virtual void setMyLastDurableOpTimeForward(const OpTime& opTime) = 0; 375 376 /** 377 * Same as above, but used during places we need to zero our last optime. 378 */ 379 virtual void resetMyLastOpTimes() = 0; 380 381 /** 382 * Updates the message we include in heartbeat responses. 383 */ 384 virtual void setMyHeartbeatMessage(const std::string& msg) = 0; 385 386 /** 387 * Returns the last optime recorded by setMyLastAppliedOpTime. 388 */ 389 virtual OpTime getMyLastAppliedOpTime() const = 0; 390 391 /** 392 * Returns the last optime recorded by setMyLastDurableOpTime. 393 */ 394 virtual OpTime getMyLastDurableOpTime() const = 0; 395 396 /** 397 * Waits until the optime of the current node is at least the opTime specified in 'settings'. 398 * 399 * Returns whether the wait was successful. 400 */ 401 virtual Status waitUntilOpTimeForRead(OperationContext* opCtx, 402 const ReadConcernArgs& settings) = 0; 403 404 /** 405 * Waits until the deadline or until the optime of the current node is at least the opTime 406 * specified in 'settings'. 407 * 408 * Returns whether the wait was successful. 
409 */ 410 virtual Status waitUntilOpTimeForReadUntil(OperationContext* opCtx, 411 const ReadConcernArgs& settings, 412 boost::optional<Date_t> deadline) = 0; 413 414 /** 415 * Retrieves and returns the current election id, which is a unique id that is local to 416 * this node and changes every time we become primary. 417 * TODO(spencer): Use term instead. 418 */ 419 virtual OID getElectionId() = 0; 420 421 /** 422 * Returns the RID for this node. The RID is used to identify this node to our sync source 423 * when sending updates about our replication progress. 424 */ 425 virtual OID getMyRID() const = 0; 426 427 /** 428 * Returns the id for this node as specified in the current replica set configuration. 429 */ 430 virtual int getMyId() const = 0; 431 432 /** 433 * Sets this node into a specific follower mode. 434 * 435 * Returns OK if the follower mode was successfully set. Returns NotSecondary if the 436 * node is a leader when setFollowerMode is called and ElectionInProgress if the node is in the 437 * process of trying to elect itself primary. 438 * 439 * Follower modes are RS_STARTUP2 (initial sync), RS_SECONDARY, RS_ROLLBACK and 440 * RS_RECOVERING. They are the valid states of a node whose topology coordinator has the 441 * follower role. 442 * 443 * This is essentially an interface that allows the applier to prevent the node from 444 * becoming a candidate or accepting reads, depending on circumstances in the oplog 445 * application process. 446 */ 447 virtual Status setFollowerMode(const MemberState& newState) = 0; 448 449 /** 450 * Step-up 451 * ======= 452 * On stepup, repl coord enters catch-up mode. It's the same as the secondary mode from 453 * the perspective of producer and applier, so there's nothing to do with them. 454 * When a node enters drain mode, producer state = Stopped, applier state = Draining. 455 * 456 * If the applier state is Draining, it will signal repl coord when there's nothing to apply. 
457 * The applier goes into Stopped state at the same time. 458 * 459 * The states go like the following: 460 * - secondary and during catchup mode 461 * (producer: Running, applier: Running) 462 * | 463 * | finish catch-up, enter drain mode 464 * V 465 * - drain mode 466 * (producer: Stopped, applier: Draining) 467 * | 468 * | applier signals drain is complete 469 * V 470 * - primary is in master mode 471 * (producer: Stopped, applier: Stopped) 472 * 473 * 474 * Step-down 475 * ========= 476 * The state transitions become: 477 * - primary is in master mode 478 * (producer: Stopped, applier: Stopped) 479 * | 480 * | step down 481 * V 482 * - secondary mode, starting bgsync 483 * (producer: Starting, applier: Running) 484 * | 485 * | bgsync runs start() 486 * V 487 * - secondary mode, normal 488 * (producer: Running, applier: Running) 489 * 490 * When a node steps down during draining mode, it's OK to change from (producer: Stopped, 491 * applier: Draining) to (producer: Starting, applier: Running). 492 * 493 * When a node steps down during catchup mode, the states remain the same (producer: Running, 494 * applier: Running). 495 */ 496 enum class ApplierState { Running, Draining, Stopped }; 497 498 /** 499 * In normal cases: Running -> Draining -> Stopped -> Running. 500 * Draining -> Running is also possible if a node steps down during drain mode. 501 * 502 * Only the applier can make the transition from Draining to Stopped by calling 503 * signalDrainComplete(). 504 */ 505 virtual ApplierState getApplierState() = 0; 506 507 /** 508 * Signals that a previously requested pause and drain of the applier buffer 509 * has completed. 510 * 511 * This is an interface that allows the applier to reenable writes after 512 * a successful election triggers the draining of the applier buffer. 513 * 514 * The applier signals drain complete when the buffer is empty and it's in Draining 515 * state. We need to make sure the applier checks both conditions in the same term. 
516 * Otherwise, it's possible that the applier confirms the empty buffer, but the node 517 * steps down and steps up so quickly that the applier signals drain complete in the wrong 518 * term. 519 */ 520 virtual void signalDrainComplete(OperationContext* opCtx, long long termWhenBufferIsEmpty) = 0; 521 522 /** 523 * Waits duration of 'timeout' for applier to finish draining its buffer of operations. 524 * Returns OK if we are not in drain mode. 525 * Returns ErrorCodes::ExceededTimeLimit if we timed out waiting for the applier to drain its 526 * buffer. 527 * Returns ErrorCodes::BadValue if timeout is negative. 528 */ 529 virtual Status waitForDrainFinish(Milliseconds timeout) = 0; 530 531 /** 532 * Signals the sync source feedback thread to wake up and send a handshake and 533 * replSetUpdatePosition command to our sync source. 534 */ 535 virtual void signalUpstreamUpdater() = 0; 536 537 /** 538 * Prepares a BSONObj describing an invocation of the replSetUpdatePosition command that can 539 * be sent to this node's sync source to update it about our progress in replication. 540 */ 541 virtual StatusWith<BSONObj> prepareReplSetUpdatePositionCommand() const = 0; 542 543 enum class ReplSetGetStatusResponseStyle { kBasic, kInitialSync }; 544 545 /** 546 * Handles an incoming replSetGetStatus command. Adds BSON to 'result'. If kInitialSync is 547 * requested but initial sync is not running, kBasic will be used. 548 */ 549 virtual Status processReplSetGetStatus(BSONObjBuilder* result, 550 ReplSetGetStatusResponseStyle responseStyle) = 0; 551 552 /** 553 * Does an initial sync of data, after dropping existing data. 554 */ 555 virtual Status resyncData(OperationContext* opCtx, bool waitUntilCompleted) = 0; 556 557 /** 558 * Handles an incoming isMaster command for a replica set node. Should not be 559 * called on a master-slave or standalone node. 
560 */ 561 virtual void fillIsMasterForReplSet(IsMasterResponse* result, 562 const SplitHorizon::Parameters& horizonParams) = 0; 563 564 /** 565 * Adds to "result" a description of the slaveInfo data structure used to map RIDs to their 566 * last known optimes. 567 */ 568 virtual void appendSlaveInfoData(BSONObjBuilder* result) = 0; 569 570 /** 571 * Returns a copy of the current ReplSetConfig. 572 */ 573 virtual ReplSetConfig getConfig() const = 0; 574 575 /** 576 * Handles an incoming replSetGetConfig command. Adds BSON to 'result'. 577 */ 578 virtual void processReplSetGetConfig(BSONObjBuilder* result) = 0; 579 580 /** 581 * Processes the ReplSetMetadata returned from a command run against another 582 * replica set member and so long as the config version in the metadata matches the replica set 583 * config version this node currently has, updates the current term. 584 * 585 * This does NOT update this node's notion of the commit point. 586 */ 587 virtual void processReplSetMetadata(const rpc::ReplSetMetadata& replMetadata) = 0; 588 589 /** 590 * This updates the node's notion of the commit point. 591 */ 592 virtual void advanceCommitPoint(const OpTime& committedOptime) = 0; 593 594 /** 595 * Elections under protocol version 1 are triggered by a timer. 596 * When a node is informed of the primary's liveness (either through heartbeats or 597 * while reading a sync source's oplog), it calls this function to postpone the 598 * election timer by a duration of at least 'electionTimeoutMillis' (see getConfig()). 599 * If the current node is not electable (secondary with priority > 0), this function 600 * cancels the existing timer but will not schedule a new one. 
601 */ 602 virtual void cancelAndRescheduleElectionTimeout() = 0; 603 604 /** 605 * Toggles maintenanceMode to the value expressed by 'activate' 606 * return Status::OK if the change worked, NotSecondary if it failed because we are 607 * PRIMARY, and OperationFailed if we are not currently in maintenance mode 608 */ 609 virtual Status setMaintenanceMode(bool activate) = 0; 610 611 /** 612 * Retrieves the current count of maintenanceMode and returns 'true' if greater than 0. 613 */ 614 virtual bool getMaintenanceMode() = 0; 615 616 /** 617 * Handles an incoming replSetSyncFrom command. Adds BSON to 'result' 618 * returns Status::OK if the sync target could be set and an ErrorCode indicating why it 619 * couldn't otherwise. 620 */ 621 virtual Status processReplSetSyncFrom(OperationContext* opCtx, 622 const HostAndPort& target, 623 BSONObjBuilder* resultObj) = 0; 624 625 /** 626 * Handles an incoming replSetFreeze command. Adds BSON to 'resultObj' 627 * returns Status::OK() if the node is a member of a replica set with a config and an 628 * error Status otherwise 629 */ 630 virtual Status processReplSetFreeze(int secs, BSONObjBuilder* resultObj) = 0; 631 632 /** 633 * Handles an incoming heartbeat command with arguments 'args'. Populates 'response'; 634 * returns a Status with either OK or an error message. 635 */ 636 virtual Status processHeartbeat(const ReplSetHeartbeatArgs& args, 637 ReplSetHeartbeatResponse* response) = 0; 638 virtual Status processHeartbeatV1(const ReplSetHeartbeatArgsV1& args, 639 ReplSetHeartbeatResponse* response) = 0; 640 641 642 /** 643 * Arguments for the replSetReconfig command. 644 */ 645 struct ReplSetReconfigArgs { 646 BSONObj newConfigObj; 647 bool force; 648 }; 649 650 /** 651 * Handles an incoming replSetReconfig command. Adds BSON to 'resultObj'; 652 * returns a Status with either OK or an error message. 
653 */ 654 virtual Status processReplSetReconfig(OperationContext* opCtx, 655 const ReplSetReconfigArgs& args, 656 BSONObjBuilder* resultObj) = 0; 657 658 /* 659 * Handles an incoming replSetInitiate command. If "configObj" is empty, generates a default 660 * configuration to use. 661 * Adds BSON to 'resultObj'; returns a Status with either OK or an error message. 662 */ 663 virtual Status processReplSetInitiate(OperationContext* opCtx, 664 const BSONObj& configObj, 665 BSONObjBuilder* resultObj) = 0; 666 667 /** 668 * Arguments to the replSetFresh command. 669 */ 670 struct ReplSetFreshArgs { 671 std::string setName; // Name of the replset 672 HostAndPort who; // host and port of the member that sent the replSetFresh command 673 unsigned id; // replSet id of the member that sent the replSetFresh command 674 int cfgver; // replSet config version that the member who sent the command thinks it has 675 Timestamp opTime; // last optime seen by the member who sent the replSetFresh command 676 }; 677 678 /* 679 * Handles an incoming replSetFresh command. 680 * Adds BSON to 'resultObj'; returns a Status with either OK or an error message. 681 */ 682 virtual Status processReplSetFresh(const ReplSetFreshArgs& args, BSONObjBuilder* resultObj) = 0; 683 684 /** 685 * Arguments to the replSetElect command. 686 */ 687 struct ReplSetElectArgs { 688 std::string set; // Name of the replset 689 int whoid; // replSet id of the member that sent the replSetFresh command 690 int cfgver; // replSet config version that the member who sent the command thinks it has 691 OID round; // unique ID for this election 692 }; 693 694 /* 695 * Handles an incoming replSetElect command. 696 * Adds BSON to 'resultObj'; returns a Status with either OK or an error message. 697 */ 698 virtual Status processReplSetElect(const ReplSetElectArgs& args, BSONObjBuilder* resultObj) = 0; 699 700 /** 701 * Handles an incoming replSetUpdatePosition command, updating each node's oplog progress. 
702 * Returns Status::OK() if all updates are processed correctly, NodeNotFound 703 * if any updating node cannot be found in the config, InvalidReplicaSetConfig if the 704 * "configVersion" sent in any of the updates doesn't match our config version, or 705 * NotMasterOrSecondary if we are in state REMOVED or otherwise don't have a valid 706 * replica set config. 707 * If a non-OK status is returned, it is unspecified whether none or some of the updates 708 * were applied. 709 * "configVersion" will be populated with our config version if and only if we return 710 * InvalidReplicaSetConfig. 711 */ 712 virtual Status processReplSetUpdatePosition(const UpdatePositionArgs& updates, 713 long long* configVersion) = 0; 714 715 /** 716 * Handles an incoming Handshake command. Associates the node's 'remoteID' with its 717 * 'handshake' object. This association is used to update internal representation of 718 * replication progress and to forward the node's replication progress upstream when this 719 * node is being chained through in master/slave replication. 720 * 721 * Returns ErrorCodes::IllegalOperation if we're not running with master/slave replication. 722 */ 723 virtual Status processHandshake(OperationContext* opCtx, const HandshakeArgs& handshake) = 0; 724 725 /** 726 * Returns a bool indicating whether or not this node builds indexes. 727 */ 728 virtual bool buildsIndexes() = 0; 729 730 /** 731 * Returns a vector of members that have applied the operation with OpTime 'op'. 732 * "durablyWritten" indicates whether the operation has to be durably applied. 733 */ 734 virtual std::vector<HostAndPort> getHostsWrittenTo(const OpTime& op, bool durablyWritten) = 0; 735 736 /** 737 * Returns a vector of the members other than ourself in the replica set, as specified in 738 * the replica set config. Invalid to call if we are not in replica set mode. Returns 739 * an empty vector if we do not have a valid config. 
740 */ 741 virtual std::vector<HostAndPort> getOtherNodesInReplSet() const = 0; 742 743 /** 744 * Returns a BSONObj containing a representation of the current default write concern. 745 */ 746 virtual WriteConcernOptions getGetLastErrorDefault() = 0; 747 748 /** 749 * Checks that the --replSet flag was passed when starting up the node and that the node 750 * has a valid replica set config. 751 * 752 * Returns a Status indicating whether those conditions are met with errorcode 753 * NoReplicationEnabled if --replSet was not present during start up or with errorcode 754 * NotYetInitialized in the absence of a valid config. Also adds error info to "result". 755 */ 756 virtual Status checkReplEnabledForCommand(BSONObjBuilder* result) = 0; 757 758 /** 759 * Loads the optime from the last op in the oplog into the coordinator's lastAppliedOpTime and 760 * lastDurableOpTime values. The 'consistency' argument must tell whether or not the optime of 761 * the op in the oplog represents a consistent database state. 762 */ 763 virtual void resetLastOpTimesFromOplog(OperationContext* opCtx, 764 DataConsistency consistency) = 0; 765 766 /** 767 * Returns the OpTime of the latest replica set-committed op known to this server. 768 * Committed means a majority of the voting nodes of the config are known to have the 769 * operation in their oplogs. This implies such ops will never be rolled back. 770 */ 771 virtual OpTime getLastCommittedOpTime() const = 0; 772 773 /* 774 * Handles an incoming replSetRequestVotes command. 775 * Adds BSON to 'resultObj'; returns a Status with either OK or an error message. 776 */ 777 virtual Status processReplSetRequestVotes(OperationContext* opCtx, 778 const ReplSetRequestVotesArgs& args, 779 ReplSetRequestVotesResponse* response) = 0; 780 781 /** 782 * Prepares a metadata object with the ReplSetMetadata and the OplogQueryMetadata depending 783 * on what has been requested. 
784 */ 785 virtual void prepareReplMetadata(const BSONObj& metadataRequestObj, 786 const OpTime& lastOpTimeFromClient, 787 BSONObjBuilder* builder) const = 0; 788 789 /** 790 * Returns true if the V1 election protocol is being used and false otherwise. 791 */ 792 virtual bool isV1ElectionProtocol() const = 0; 793 794 /** 795 * Returns whether or not majority write concerns should implicitly journal, if j has not been 796 * explicitly set. 797 */ 798 virtual bool getWriteConcernMajorityShouldJournal() = 0; 799 800 /** 801 * Writes into 'output' all the information needed to generate a summary of the current 802 * replication state for use by the web interface. 803 */ 804 virtual void summarizeAsHtml(ReplSetHtmlSummary* output) = 0; 805 806 /** 807 * Returns the current term. 808 */ 809 virtual long long getTerm() = 0; 810 811 /** 812 * Attempts to update the current term for the V1 election protocol. If the term changes and 813 * this node is primary, relinquishes primary. 814 * Returns a Status OK if the term was *not* updated (meaning, it is safe to proceed with 815 * the rest of the work, because the term is still the same). 816 * Returns StaleTerm if the supplied term was higher than the current term. 817 */ 818 virtual Status updateTerm(OperationContext* opCtx, long long term) = 0; 819 820 /** 821 * Reserves a unique SnapshotName. 822 * 823 * This name is guaranteed to compare > all names reserved before and < all names reserved 824 * after. 825 * 826 * This method will not take any locks or attempt to access storage using the passed-in 827 * OperationContext. It will only be used to track reserved SnapshotNames by each operation so 828 * that awaitReplicationOfLastOpForClient() can correctly wait for the reserved snapshot to be 829 * visible. 830 * 831 * A null OperationContext can be used in cases where the snapshot to wait for should not be 832 * adjusted. 
833 */ 834 virtual Timestamp reserveSnapshotName(OperationContext* opCtx) = 0; 835 836 /** 837 * Blocks until either the current committed snapshot is at least as high as 'untilSnapshot', 838 * or we are interrupted for any reason, including shutdown or maxTimeMs expiration. 839 * 'opCtx' is used to checkForInterrupt and enforce maxTimeMS. 840 */ 841 virtual void waitUntilSnapshotCommitted(OperationContext* opCtx, 842 const Timestamp& untilSnapshot) = 0; 843 844 /** 845 * Resets all information related to snapshotting. 846 */ 847 virtual void dropAllSnapshots() = 0; 848 849 /** 850 * Gets the latest OpTime of the currentCommittedSnapshot. 851 */ 852 virtual OpTime getCurrentCommittedSnapshotOpTime() const = 0; 853 854 /** 855 * Appends diagnostics about the replication subsystem. 856 */ 857 virtual void appendDiagnosticBSON(BSONObjBuilder* bob) = 0; 858 859 /** 860 * Appends connection information to the provided BSONObjBuilder. 861 */ 862 virtual void appendConnectionStats(executor::ConnectionPoolStats* stats) const = 0; 863 864 /** 865 * Gets the number of uncommitted snapshots currently held. 866 * Warning: This value can change at any time and may not even be accurate at the time of 867 * return. It should not be used when an exact amount is needed. 868 */ 869 virtual size_t getNumUncommittedSnapshots() = 0; 870 871 /** 872 * Returns a new WriteConcernOptions based on "wc" but with UNSET syncMode reset to JOURNAL or 873 * NONE based on our rsConfig. 874 */ 875 virtual WriteConcernOptions populateUnsetWriteConcernOptionsSyncMode( 876 WriteConcernOptions wc) = 0; 877 virtual ReplSettings::IndexPrefetchConfig getIndexPrefetchConfig() const = 0; 878 virtual void setIndexPrefetchConfig(const ReplSettings::IndexPrefetchConfig cfg) = 0; 879 880 virtual Status stepUpIfEligible(bool skipDryRun) = 0; 881 882 virtual ServiceContext* getServiceContext() = 0; 883 884 /** 885 * Abort catchup if the node is in catchup mode. 
886 */ 887 virtual Status abortCatchupIfNeeded() = 0; 888 889 /** 890 * Returns true if logOp() should not append an entry to the oplog for the namespace for this 891 * operation. 892 */ 893 bool isOplogDisabledFor(OperationContext* opCtx, const NamespaceString& nss); 894 895 protected: 896 ReplicationCoordinator(); 897 }; 898 899 } // namespace repl 900 } // namespace mongo 901