1 2 /** 3 * Copyright (C) 2018-present MongoDB, Inc. 4 * 5 * This program is free software: you can redistribute it and/or modify 6 * it under the terms of the Server Side Public License, version 1, 7 * as published by MongoDB, Inc. 8 * 9 * This program is distributed in the hope that it will be useful, 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 * Server Side Public License for more details. 13 * 14 * You should have received a copy of the Server Side Public License 15 * along with this program. If not, see 16 * <http://www.mongodb.com/licensing/server-side-public-license>. 17 * 18 * As a special exception, the copyright holders give permission to link the 19 * code of portions of this program with the OpenSSL library under certain 20 * conditions as described in each individual source file and distribute 21 * linked combinations including the program with the OpenSSL library. You 22 * must comply with the Server Side Public License in all respects for 23 * all of the code used other than as permitted herein. If you modify file(s) 24 * with this exception, you may extend this exception to your version of the 25 * file(s), but you are not obligated to do so. If you do not wish to do so, 26 * delete this exception statement from your version. If you delete this 27 * exception statement from all source files in the program, then also delete 28 * it in the license file. 
29 */ 30 31 #pragma once 32 33 #include <iosfwd> 34 #include <string> 35 36 #include "mongo/base/disallow_copying.h" 37 #include "mongo/db/repl/last_vote.h" 38 #include "mongo/db/repl/repl_set_heartbeat_response.h" 39 #include "mongo/db/repl/replication_coordinator.h" 40 #include "mongo/db/repl/split_horizon.h" 41 #include "mongo/db/server_options.h" 42 #include "mongo/stdx/functional.h" 43 #include "mongo/util/net/hostandport.h" 44 #include "mongo/util/time_support.h" 45 46 namespace mongo { 47 class Timestamp; 48 49 namespace repl { 50 class HeartbeatResponseAction; 51 class MemberData; 52 class OpTime; 53 class ReplSetHeartbeatArgs; 54 class ReplSetConfig; 55 class TagSubgroup; 56 struct MemberState; 57 58 // Maximum number of retries for a failed heartbeat. 59 const int kMaxHeartbeatRetries = 2; 60 61 /** 62 * Replication Topology Coordinator 63 * 64 * This object is responsible for managing the topology of the cluster. 65 * Tasks include consensus and leader election, chaining, and configuration management. 66 * Methods of this class should be non-blocking. 67 */ 68 class TopologyCoordinator { 69 MONGO_DISALLOW_COPYING(TopologyCoordinator); 70 71 public: 72 /** 73 * Type that denotes the role of a node in the replication protocol. 74 * 75 * The role is distinct from MemberState, in that it only deals with the 76 * roles a node plays in the basic protocol -- leader, follower and candidate. 77 * The mapping between MemberState and Role is complex -- several MemberStates 78 * map to the follower role, and MemberState::RS_SECONDARY maps to either 79 * follower or candidate roles, e.g. 80 */ 81 enum class Role { kLeader = 0, kFollower = 1, kCandidate = 2 }; 82 83 struct Options { 84 // A sync source is re-evaluated after it lags behind further than this amount. 85 Seconds maxSyncSourceLagSecs{0}; 86 87 // Whether or not this node is running as a config server. 
88 ClusterRole clusterRole{ClusterRole::None}; 89 }; 90 91 /** 92 * Constructs a Topology Coordinator object. 93 **/ 94 TopologyCoordinator(Options options); 95 96 97 ~TopologyCoordinator(); 98 99 //////////////////////////////////////////////////////////// 100 // 101 // State inspection methods. 102 // 103 //////////////////////////////////////////////////////////// 104 105 /** 106 * Gets the role of this member in the replication protocol. 107 */ 108 Role getRole() const; 109 110 /** 111 * Gets the MemberState of this member in the replica set. 112 */ 113 MemberState getMemberState() const; 114 115 /** 116 * Returns whether this node should be allowed to accept writes. 117 */ 118 bool canAcceptWrites() const; 119 120 /** 121 * Returns true if this node is in the process of stepping down. Note that this can be 122 * due to an unconditional stepdown that must succeed (for instance from learning about a new 123 * term) or due to a stepdown attempt that could fail (for instance from a stepdown cmd that 124 * could fail if not enough nodes are caught up). 125 */ 126 bool isSteppingDown() const; 127 128 /** 129 * Returns the address of the current sync source, or an empty HostAndPort if there is no 130 * current sync source. 131 */ 132 HostAndPort getSyncSourceAddress() const; 133 134 /** 135 * Retrieves a vector of HostAndPorts containing all nodes that are neither DOWN nor 136 * ourself. 137 */ 138 std::vector<HostAndPort> getMaybeUpHostAndPorts() const; 139 140 /** 141 * Gets the earliest time the current node will stand for election. 142 */ 143 Date_t getStepDownTime() const; 144 145 /** 146 * Gets the current value of the maintenance mode counter. 147 */ 148 int getMaintenanceCount() const; 149 150 /** 151 * Gets the latest term this member is aware of. If this member is the primary, 152 * it's the current term of the replica set. 
153 */ 154 long long getTerm() const; 155 156 enum class UpdateTermResult { kAlreadyUpToDate, kTriggerStepDown, kUpdatedTerm }; 157 158 //////////////////////////////////////////////////////////// 159 // 160 // Basic state manipulation methods. 161 // 162 //////////////////////////////////////////////////////////// 163 164 /** 165 * Sets the latest term this member is aware of to the higher of its current value and 166 * the value passed in as "term". 167 * Returns the result of setting the term value, or if a stepdown should be triggered. 168 */ 169 UpdateTermResult updateTerm(long long term, Date_t now); 170 171 /** 172 * Sets the index into the config used when we next choose a sync source 173 */ 174 void setForceSyncSourceIndex(int index); 175 176 enum class ChainingPreference { kAllowChaining, kUseConfiguration }; 177 178 /** 179 * Chooses and sets a new sync source, based on our current knowledge of the world. 180 */ 181 HostAndPort chooseNewSyncSource(Date_t now, 182 const OpTime& lastOpTimeFetched, 183 ChainingPreference chainingPreference); 184 185 /** 186 * Suppresses selecting "host" as sync source until "until". 187 */ 188 void blacklistSyncSource(const HostAndPort& host, Date_t until); 189 190 /** 191 * Removes a single entry "host" from the list of potential sync sources which we 192 * have blacklisted, if it is supposed to be unblacklisted by "now". 193 */ 194 void unblacklistSyncSource(const HostAndPort& host, Date_t now); 195 196 /** 197 * Clears the list of potential sync sources we have blacklisted. 198 */ 199 void clearSyncSourceBlacklist(); 200 201 /** 202 * Determines if a new sync source should be chosen, if a better candidate sync source is 203 * available. If the current sync source's last optime ("syncSourceLastOpTime" under 204 * protocolVersion 1, but pulled from the MemberData in protocolVersion 0) is more than 205 * _maxSyncSourceLagSecs behind any syncable source, this function returns true. 
If we are 206 * running in ProtocolVersion 1, our current sync source is not primary, has no sync source 207 * ("syncSourceHasSyncSource" is false), and only has data up to "myLastOpTime", returns true. 208 * 209 * "now" is used to skip over currently blacklisted sync sources. 210 * 211 * TODO (SERVER-27668): Make OplogQueryMetadata non-optional in mongodb 3.8. 212 */ 213 bool shouldChangeSyncSource(const HostAndPort& currentSource, 214 const rpc::ReplSetMetadata& replMetadata, 215 boost::optional<rpc::OplogQueryMetadata> oqMetadata, 216 Date_t now) const; 217 218 /** 219 * Checks whether we are a single node set and we are not in a stepdown period. If so, 220 * puts us into candidate mode, otherwise does nothing. This is used to ensure that 221 * nodes in a single node replset become primary again when their stepdown period ends. 222 */ 223 bool becomeCandidateIfStepdownPeriodOverAndSingleNodeSet(Date_t now); 224 225 /** 226 * Sets the earliest time the current node will stand for election to "newTime". 227 * 228 * Until this time, while the node may report itself as electable, it will not stand 229 * for election. 230 */ 231 void setElectionSleepUntil(Date_t newTime); 232 233 /** 234 * Sets the reported mode of this node to one of RS_SECONDARY, RS_STARTUP2, RS_ROLLBACK or 235 * RS_RECOVERING, when getRole() == Role::follower. This is the interface by which the 236 * applier changes the reported member state of the current node, and enables or suppresses 237 * electability of the current node. All modes but RS_SECONDARY indicate an unelectable 238 * follower state (one that cannot transition to candidate). 239 */ 240 void setFollowerMode(MemberState::MS newMode); 241 242 /** 243 * Scan the memberData and determine the highest last applied or last 244 * durable optime present on a majority of servers; set _lastCommittedOpTime to this 245 * new entry. 
246 * Whether the last applied or last durable op time is used depends on whether 247 * the config getWriteConcernMajorityShouldJournal is set. 248 * Returns true if the _lastCommittedOpTime was changed. 249 */ 250 bool updateLastCommittedOpTime(); 251 252 /** 253 * Updates _lastCommittedOpTime to min(committedOpTime, lastApplied) if it is more recent than 254 * the current last committed OpTime. Returns true if _lastCommittedOpTime is changed. 255 */ 256 bool advanceLastCommittedOpTime(OpTime committedOpTime); 257 258 /** 259 * Resets _lastCommittedOpTime to OpTime(), the default value at startup. 260 * Used on PV downgrade to forget the OpTimes in PV1. 261 */ 262 void resetLastCommittedOpTime(); 263 264 /** 265 * Returns the OpTime of the latest majority-committed op known to this server. 266 */ 267 OpTime getLastCommittedOpTime() const; 268 269 /** 270 * Returns true if it's safe to transition to LeaderMode::kMaster. 271 */ 272 bool canCompleteTransitionToPrimary(long long termWhenDrainCompleted) const; 273 274 /** 275 * Called by the ReplicationCoordinator to signal that we have finished catchup and drain modes 276 * and are ready to fully become primary and start accepting writes. 277 * "firstOpTimeOfTerm" is a floor on the OpTimes this node will be allowed to consider committed 278 * for this tenure as primary. This prevents entries from before our election from counting as 279 * committed in our view, until our election (the "firstOpTimeOfTerm" op) has been committed. 280 * Returns PrimarySteppedDown if this node is no longer eligible to begin accepting writes. 281 */ 282 Status completeTransitionToPrimary(const OpTime& firstOpTimeOfTerm); 283 284 /** 285 * Adjusts the maintenance mode count by "inc". 286 * 287 * It is an error to call this method if getRole() does not return Role::follower. 288 * It is an error to allow the maintenance count to go negative. 
289 */ 290 void adjustMaintenanceCountBy(int inc); 291 292 //////////////////////////////////////////////////////////// 293 // 294 // Methods that prepare responses to command requests. 295 // 296 //////////////////////////////////////////////////////////// 297 298 // produces a reply to a replSetSyncFrom command 299 void prepareSyncFromResponse(const HostAndPort& target, 300 BSONObjBuilder* response, 301 Status* result); 302 303 // produce a reply to a replSetFresh command 304 void prepareFreshResponse(const ReplicationCoordinator::ReplSetFreshArgs& args, 305 Date_t now, 306 BSONObjBuilder* response, 307 Status* result); 308 309 // produce a reply to a received electCmd 310 void prepareElectResponse(const ReplicationCoordinator::ReplSetElectArgs& args, 311 Date_t now, 312 BSONObjBuilder* response, 313 Status* result); 314 315 // produce a reply to a heartbeat 316 Status prepareHeartbeatResponse(Date_t now, 317 const ReplSetHeartbeatArgs& args, 318 const std::string& ourSetName, 319 ReplSetHeartbeatResponse* response); 320 321 // produce a reply to a V1 heartbeat 322 Status prepareHeartbeatResponseV1(Date_t now, 323 const ReplSetHeartbeatArgsV1& args, 324 const std::string& ourSetName, 325 ReplSetHeartbeatResponse* response); 326 327 struct ReplSetStatusArgs { 328 Date_t now; 329 unsigned selfUptime; 330 const OpTime& readConcernMajorityOpTime; 331 const BSONObj& initialSyncStatus; 332 }; 333 334 // produce a reply to a status request 335 void prepareStatusResponse(const ReplSetStatusArgs& rsStatusArgs, 336 BSONObjBuilder* response, 337 Status* result); 338 339 // Produce a replSetUpdatePosition command to be sent to the node's sync source. 340 StatusWith<BSONObj> prepareReplSetUpdatePositionCommand( 341 OpTime currentCommittedSnapshotOpTime) const; 342 343 // Produce a reply to an ismaster request. It is only valid to call this if we are a 344 // replset. 
Drivers interpret the isMaster fields according to the Server Discovery and 345 // Monitoring Spec, see the "Parsing an isMaster response" section. 346 void fillIsMasterForReplSet(IsMasterResponse* response, 347 const SplitHorizon::Parameters& horizonParams); 348 349 // Produce member data for the serverStatus command and diagnostic logging. 350 void fillMemberData(BSONObjBuilder* result); 351 352 enum class PrepareFreezeResponseResult { kNoAction, kSingleNodeSelfElect }; 353 354 /** 355 * Produce a reply to a freeze request. Returns a PrepareFreezeResponseResult on success that 356 * may trigger state changes in the caller. 357 */ 358 StatusWith<PrepareFreezeResponseResult> prepareFreezeResponse(Date_t now, 359 int secs, 360 BSONObjBuilder* response); 361 362 //////////////////////////////////////////////////////////// 363 // 364 // Methods for sending and receiving heartbeats, 365 // reconfiguring and handling the results of standing for 366 // election. 367 // 368 //////////////////////////////////////////////////////////// 369 370 /** 371 * Updates the topology coordinator's notion of the replica set configuration. 372 * 373 * "newConfig" is the new configuration, and "selfIndex" is the index of this 374 * node's configuration information in "newConfig", or "selfIndex" is -1 to 375 * indicate that this node is not a member of "newConfig". 376 * 377 * newConfig.isInitialized() should be true, though implementations may accept 378 * configurations where this is not true, for testing purposes. 379 */ 380 void updateConfig(const ReplSetConfig& newConfig, int selfIndex, Date_t now); 381 382 /** 383 * Prepares a heartbeat request appropriate for sending to "target", assuming the 384 * current time is "now". "ourSetName" is used as the name for our replica set if 385 * the topology coordinator does not have a valid configuration installed. 
386 * 387 * The returned pair contains proper arguments for a replSetHeartbeat command, and 388 * an amount of time to wait for the response. 389 * 390 * This call should be paired (with intervening network communication) with a call to 391 * processHeartbeatResponse for the same "target". 392 */ 393 std::pair<ReplSetHeartbeatArgs, Milliseconds> prepareHeartbeatRequest( 394 Date_t now, const std::string& ourSetName, const HostAndPort& target); 395 std::pair<ReplSetHeartbeatArgsV1, Milliseconds> prepareHeartbeatRequestV1( 396 Date_t now, const std::string& ourSetName, const HostAndPort& target); 397 398 /** 399 * Processes a heartbeat response from "target" that arrived around "now", having 400 * spent "networkRoundTripTime" millis on the network. 401 * 402 * Updates internal topology coordinator state, and returns instructions about what action 403 * to take next. 404 * 405 * If the next action indicates StartElection, the topology coordinator has transitioned to 406 * the "candidate" role, and will remain there until processWinElection or 407 * processLoseElection are called. 408 * 409 * If the next action indicates "StepDownSelf", the topology coordinator has transitioned 410 * to the "follower" role from "leader", and the caller should take any necessary actions 411 * to become a follower. 412 * 413 * If the next action indicates "StepDownRemotePrimary", the caller should take steps to 414 * cause the specified remote host to step down from primary to secondary. 415 * 416 * If the next action indicates "Reconfig", the caller should verify the configuration in 417 * hbResponse is acceptable, perform any other reconfiguration actions it must, and call 418 * updateConfig with the new configuration and the appropriate value for "selfIndex". It 419 * must also wrap up any outstanding elections (by calling processLoseElection or 420 * processWinElection) before calling updateConfig. 
421 * 422 * This call should be paired (with intervening network communication) with a call to 423 * prepareHeartbeatRequest for the same "target". 424 */ 425 HeartbeatResponseAction processHeartbeatResponse( 426 Date_t now, 427 Milliseconds networkRoundTripTime, 428 const HostAndPort& target, 429 const StatusWith<ReplSetHeartbeatResponse>& hbResponse); 430 431 /** 432 * Returns whether or not at least 'numNodes' have reached the given opTime. 433 * "durablyWritten" indicates whether the operation has to be durably applied. 434 */ 435 bool haveNumNodesReachedOpTime(const OpTime& opTime, int numNodes, bool durablyWritten); 436 437 /** 438 * Returns whether or not at least one node matching the tagPattern has reached 439 * the given opTime. 440 * "durablyWritten" indicates whether the operation has to be durably applied. 441 */ 442 bool haveTaggedNodesReachedOpTime(const OpTime& opTime, 443 const ReplSetTagPattern& tagPattern, 444 bool durablyWritten); 445 446 /** 447 * Returns a vector of members that have applied the operation with OpTime 'op'. 448 * "durablyWritten" indicates whether the operation has to be durably applied. 449 * "skipSelf" means to exclude this node whether or not the op has been applied. 450 */ 451 std::vector<HostAndPort> getHostsWrittenTo(const OpTime& op, 452 bool durablyWritten, 453 bool skipSelf); 454 455 /** 456 * Marks a member as down from our perspective and returns a bool which indicates if we can no 457 * longer see a majority of the nodes and thus should step down. 458 */ 459 bool setMemberAsDown(Date_t now, const int memberIndex); 460 461 /** 462 * Goes through the memberData and determines which member that is currently live 463 * has the stalest (earliest) last update time. Returns (-1, Date_t::max()) if there are 464 * no other members. 
465 */ 466 std::pair<int, Date_t> getStalestLiveMember() const; 467 468 /** 469 * Go through the memberData, and mark nodes which haven't been updated 470 * recently (within an election timeout) as "down". Returns a HeartbeatResponseAction, which 471 * will be StepDownSelf if we can no longer see a majority of the nodes, otherwise NoAction. 472 */ 473 HeartbeatResponseAction checkMemberTimeouts(Date_t now); 474 475 /** 476 * Set all nodes in memberData to not stale with a lastUpdate of "now". 477 */ 478 void resetAllMemberTimeouts(Date_t now); 479 480 /** 481 * Set all nodes in memberData that are present in member_set 482 * to not stale with a lastUpdate of "now". 483 */ 484 void resetMemberTimeouts(Date_t now, const stdx::unordered_set<HostAndPort>& member_set); 485 486 /* 487 * Returns the last optime that this node has applied, whether or not it has been journaled. 488 */ 489 OpTime getMyLastAppliedOpTime() const; 490 491 /* 492 * Returns the last optime that this node has applied and journaled. 493 */ 494 OpTime getMyLastDurableOpTime() const; 495 496 /* 497 * Returns information we have on the state of this node. 498 */ 499 MemberData* getMyMemberData(); 500 501 /* 502 * Returns information we have on the state of the node identified by memberId. Returns 503 * nullptr if memberId is not found in the configuration. 504 */ 505 MemberData* findMemberDataByMemberId(const int memberId); 506 507 /* 508 * Returns information we have on the state of the node identified by rid. Returns 509 * nullptr if rid is not found in the heartbeat data. This method is used only for 510 * master/slave replication. 511 */ 512 MemberData* findMemberDataByRid(const OID rid); 513 514 /* 515 * Adds and returns a memberData entry for the given RID. 516 * Used only in master/slave mode. 517 */ 518 MemberData* addSlaveMemberData(const OID rid); 519 520 /** 521 * If getRole() == Role::candidate and this node has not voted too recently, updates the 522 * lastVote tracker and returns true. 
Otherwise, returns false. 523 */ 524 bool voteForMyself(Date_t now); 525 526 /** 527 * Sets lastVote to be for ourself in this term. 528 */ 529 void voteForMyselfV1(); 530 531 /** 532 * Sets election id and election optime. 533 */ 534 void setElectionInfo(OID electionId, Timestamp electionOpTime); 535 536 /** 537 * Performs state updates associated with winning an election. 538 * 539 * It is an error to call this if the topology coordinator is not in candidate mode. 540 * 541 * Exactly one of either processWinElection or processLoseElection must be called if 542 * processHeartbeatResponse returns StartElection, to exit candidate mode. 543 */ 544 void processWinElection(OID electionId, Timestamp electionOpTime); 545 546 /** 547 * Performs state updates associated with losing an election. 548 * 549 * It is an error to call this if the topology coordinator is not in candidate mode. 550 * 551 * Exactly one of either processWinElection or processLoseElection must be called if 552 * processHeartbeatResponse returns StartElection, to exit candidate mode. 553 */ 554 void processLoseElection(); 555 556 557 using StepDownAttemptAbortFn = stdx::function<void()>; 558 /** 559 * Readies the TopologyCoordinator for an attempt to stepdown that may fail. This is used 560 * when we receive a stepdown command (which can fail if not enough secondaries are caught up) 561 * to ensure that we never process more than one stepdown request at a time. 562 * Returns OK if it is safe to continue with the stepdown attempt, or returns: 563 * - NotMaster if this node is not a leader. 564 * - ConflictingOperationInProgress if this node is already processing a stepdown request of any 565 * kind. 566 * On an OK return status also returns a function object that can be called to abort the 567 * pending stepdown attempt and return this node to normal primary/master state. 
568 */ 569 StatusWith<StepDownAttemptAbortFn> prepareForStepDownAttempt(); 570 571 /** 572 * Tries to transition the coordinator from the leader role to the follower role. 573 * 574 * A step down succeeds based on the following conditions: 575 * 576 * C1. 'force' is true and now > waitUntil 577 * 578 * C2. A majority set of nodes, M, in the replica set have optimes greater than or 579 * equal to the last applied optime of the primary. 580 * 581 * C3. There exists at least one electable secondary node in the majority set M. 582 * 583 * 584 * If C1 is true, or if both C2 and C3 are true, then the stepdown occurs and this method 585 * returns true. If the conditions for successful stepdown aren't met yet, but waiting for more 586 * time to pass could make it succeed, returns false. If the whole stepdown attempt should be 587 * abandoned (for example because the time limit expired or because we've already stepped down), 588 * throws an exception. 589 * TODO(spencer): Unify with the finishUnconditionalStepDown() method. 590 */ 591 bool attemptStepDown( 592 long long termAtStart, Date_t now, Date_t waitUntil, Date_t stepDownUntil, bool force); 593 594 /** 595 * Returns whether it is safe for a stepdown attempt to complete, ignoring the 'force' argument. 596 * This is essentially checking conditions C2 and C3 as described in the comment to 597 * attemptStepDown(). 598 */ 599 bool isSafeToStepDown(); 600 601 /** 602 * Readies the TopologyCoordinator for stepdown. Returns false if we're already in the process 603 * of an unconditional step down. If we are in the middle of a stepdown command attempt when 604 * this is called then this unconditional stepdown will supersede the stepdown attempt, which 605 * will cause the stepdown to fail. When this returns true it must be followed by a call to 606 * finishUnconditionalStepDown() that is called when holding the global X lock. 
607 */ 608 bool prepareForUnconditionalStepDown(); 609 610 /** 611 * Sometimes a request to step down comes in (like via a heartbeat), but we don't have the 612 * global exclusive lock so we can't actually stepdown at that moment. When that happens 613 * we record that a stepdown request is pending (by calling prepareForUnconditionalStepDown()) 614 * and schedule work to stepdown in the global X lock. This method is called after holding the 615 * global lock to perform the actual stepdown. 616 * TODO(spencer): Unify with the attemptStepDown() method. 617 */ 618 void finishUnconditionalStepDown(); 619 620 /** 621 * Returns the index of the most suitable candidate for an election handoff. The node must be 622 * caught up and electable. Ties are resolved first by highest priority, then by lowest member 623 * id. 624 */ 625 int chooseElectionHandoffCandidate(); 626 627 /** 628 * Considers whether or not this node should stand for election and reports the verdict as a 629 * Status. This method is const; the role transition is done by becomeCandidateIfElectable(). 630 */ 631 Status checkShouldStandForElection(Date_t now) const; 632 633 /** 634 * Set the outgoing heartbeat message from self 635 */ 636 void setMyHeartbeatMessage(const Date_t now, const std::string& s); 637 638 /** 639 * Prepares a ReplSetMetadata object describing the current term, primary, and lastOp 640 * information. 641 */ 642 rpc::ReplSetMetadata prepareReplSetMetadata(const OpTime& lastVisibleOpTime) const; 643 644 /** 645 * Prepares an OplogQueryMetadata object describing the current sync source, rbid, primary, 646 * lastOpApplied, and lastOpCommitted. 647 */ 648 rpc::OplogQueryMetadata prepareOplogQueryMetadata(int rbid) const; 649 650 /** 651 * Writes into 'output' all the information needed to generate a summary of the current 652 * replication state for use by the web interface. 653 */ 654 void summarizeAsHtml(ReplSetHtmlSummary* output); 655 656 /** 657 * Prepares a ReplSetRequestVotesResponse. 
658 */ 659 void processReplSetRequestVotes(const ReplSetRequestVotesArgs& args, 660 ReplSetRequestVotesResponse* response); 661 662 /** 663 * Loads an initial LastVote document, which was read from local storage. 664 * 665 * Called only during replication startup. All other updates are done internally. 666 */ 667 void loadLastVote(const LastVote& lastVote); 668 669 /** 670 * Updates the current primary index. 671 */ 672 void setPrimaryIndex(long long primaryIndex); 673 674 /** 675 * Returns the current primary index. 676 */ 677 int getCurrentPrimaryIndex() const; 678 679 enum StartElectionReason { 680 kElectionTimeout, 681 kPriorityTakeover, 682 kStepUpRequest, 683 kStepUpRequestSkipDryRun, 684 kCatchupTakeover, 685 kSingleNodePromptElection 686 }; 687 688 /** 689 * Transitions to the candidate role if the node is electable. 690 */ 691 Status becomeCandidateIfElectable(const Date_t now, StartElectionReason reason); 692 693 /** 694 * Updates the storage engine read committed support in the TopologyCoordinator options after 695 * creation. 696 */ 697 void setStorageEngineSupportsReadCommitted(bool supported); 698 699 /** 700 * Reset the booleans to record the last heartbeat restart. 701 */ 702 void restartHeartbeats(); 703 704 /** 705 * Scans through all members that are 'up' and return the latest known optime, if we have 706 * received (successful or failed) heartbeats from all nodes since heartbeat restart. 707 * 708 * Returns boost::none if any node hasn't responded to a heartbeat since we last restarted 709 * heartbeats. 710 * Returns OpTime(Timestamp(0, 0), 0), the smallest OpTime in PV1, if other nodes are all down. 711 */ 712 boost::optional<OpTime> latestKnownOpTimeSinceHeartbeatRestart() const; 713 714 //////////////////////////////////////////////////////////// 715 // 716 // Test support methods 717 // 718 //////////////////////////////////////////////////////////// 719 720 // Changes _memberState to newMemberState. Only for testing. 
721 void changeMemberState_forTest(const MemberState& newMemberState, 722 const Timestamp& electionTime = Timestamp(0, 0)); 723 724 // Sets "_electionTime" to "newElectionTime". Only for testing. 725 void _setElectionTime(const Timestamp& newElectionTime); 726 727 // Sets _currentPrimaryIndex to the given index. Should only be used in unit tests! 728 // TODO(spencer): Remove this once we can easily call for an election in unit tests to 729 // set the current primary. 730 void _setCurrentPrimaryForTest(int primaryIndex); 731 732 // Returns _electionTime. Only used in unittests. 733 Timestamp getElectionTime() const; 734 735 // Returns _electionId. Only used in unittests. 736 OID getElectionId() const; 737 738 // Returns the name for a role. Only used in unittests. 739 static std::string roleToString(TopologyCoordinator::Role role); 740 741 private: 742 typedef int UnelectableReasonMask; 743 class PingStats; 744 745 /** 746 * Different modes a node can be in while still reporting itself as in state PRIMARY. 747 * 748 * Valid transitions: 749 * 750 * kNotLeader <---------------------------------- 751 * | | 752 * | | 753 * | | 754 * v | 755 * kLeaderElect----------------- | 756 * | ^ | | | 757 * | | | | | 758 * v | | | | 759 * kMaster -------------------------- | 760 * | ^ | | | | | 761 * | | | | | | | 762 * | | | | | | | 763 * v | | v v v | 764 * kAttemptingStepDown----------->kSteppingDown | 765 * | | | 766 * | | | 767 * | | | 768 * --------------------------------------------- 769 * 770 */ 771 enum class LeaderMode { 772 kNotLeader, // This node is not currently a leader. 773 kLeaderElect, // This node has been elected leader, but can't yet accept writes. 774 kMaster, // This node reports ismaster:true and can accept writes. 775 kSteppingDown, // This node is in the middle of a (hb) stepdown that must complete. 776 kAttemptingStepDown, // This node is in the middle of a stepdown (cmd) that might fail. 
777 }; 778 779 enum UnelectableReason { 780 None = 0, 781 CannotSeeMajority = 1 << 0, 782 NotCloseEnoughToLatestOptime = 1 << 1, 783 ArbiterIAm = 1 << 2, 784 NotSecondary = 1 << 3, 785 NoPriority = 1 << 4, 786 StepDownPeriodActive = 1 << 5, 787 NoData = 1 << 6, 788 NotInitialized = 1 << 7, 789 VotedTooRecently = 1 << 8, 790 RefusesToStand = 1 << 9, 791 NotCloseEnoughToLatestForPriorityTakeover = 1 << 10, 792 NotFreshEnoughForCatchupTakeover = 1 << 11, 793 }; 794 795 // Set what type of PRIMARY this node currently is. 796 void _setLeaderMode(LeaderMode mode); 797 798 // Returns the number of heartbeat pings which have occurred. 799 int _getTotalPings(); 800 801 // Returns the current "ping" value for the given member by their address 802 Milliseconds _getPing(const HostAndPort& host); 803 804 // Determines if we will veto the member specified by "args.id". 805 // If we veto, the errmsg will be filled in with a reason 806 bool _shouldVetoMember(const ReplicationCoordinator::ReplSetFreshArgs& args, 807 const Date_t& now, 808 std::string* errmsg) const; 809 810 // Returns the index of the member with the matching id, or -1 if none match. 811 int _getMemberIndex(int id) const; 812 813 // Sees if a majority number of votes are held by members who are currently "up" 814 bool _aMajoritySeemsToBeUp() const; 815 816 // Checks if the node can see a healthy primary of equal or greater priority to the 817 // candidate. If so, returns the index of that node. Otherwise returns -1. 818 int _findHealthyPrimaryOfEqualOrGreaterPriority(const int candidateIndex) const; 819 820 // Is otherOpTime close enough (within 10 seconds) to the latest known optime to qualify 821 // for an election 822 bool _isOpTimeCloseEnoughToLatestToElect(const OpTime& otherOpTime) const; 823 824 // Is our optime close enough to the latest known optime to call for a priority takeover. 
825 bool _amIFreshEnoughForPriorityTakeover() const; 826 827 // Is the primary node still in catchup mode and is our optime the latest 828 // known optime of all the up nodes. 829 bool _amIFreshEnoughForCatchupTakeover() const; 830 831 // Returns reason why "self" member is unelectable 832 UnelectableReasonMask _getMyUnelectableReason(const Date_t now, 833 StartElectionReason reason) const; 834 835 // Returns reason why memberIndex is unelectable 836 UnelectableReasonMask _getUnelectableReason(int memberIndex) const; 837 838 // Returns the nice text of why the node is unelectable 839 std::string _getUnelectableReasonString(UnelectableReasonMask ur) const; 840 841 // Return true if we are currently primary 842 bool _iAmPrimary() const; 843 844 // Scans through all members that are 'up' and return the latest known optime. 845 OpTime _latestKnownOpTime() const; 846 847 // Scans the electable set and returns the highest priority member index 848 int _getHighestPriorityElectableIndex(Date_t now) const; 849 850 // Returns true if "one" member is higher priority than "two" member 851 bool _isMemberHigherPriority(int memberOneIndex, int memberTwoIndex) const; 852 853 // Helper shortcut to self config 854 const MemberConfig& _selfConfig() const; 855 856 // Helper shortcut to self member data 857 const MemberData& _selfMemberData() const; 858 859 // Index of self member in member data. 860 const int _selfMemberDataIndex() const; 861 862 // Returns NULL if there is no primary, or the MemberConfig* for the current primary 863 const MemberConfig* _currentPrimaryMember() const; 864 865 /** 866 * Performs updating "_currentPrimaryIndex" for processHeartbeatResponse(), and determines if an 867 * election or stepdown should commence. 868 * _updatePrimaryFromHBDataV1() is a simplified version of _updatePrimaryFromHBData() to be used 869 * when in ProtocolVersion1. 
870 */ 871 HeartbeatResponseAction _updatePrimaryFromHBData(int updatedConfigIndex, 872 const MemberState& originalState, 873 Date_t now); 874 HeartbeatResponseAction _updatePrimaryFromHBDataV1(int updatedConfigIndex, 875 const MemberState& originalState, 876 Date_t now); 877 878 /** 879 * Updates _memberData based on the newConfig, ensuring that every member in the newConfig 880 * has an entry in _memberData. If any nodes in the newConfig are also present in 881 * _currentConfig, copies their heartbeat info into the corresponding entry in the updated 882 * _memberData vector. 883 */ 884 void _updateHeartbeatDataForReconfig(const ReplSetConfig& newConfig, int selfIndex, Date_t now); 885 886 /** 887 * Returns whether a stepdown attempt should be allowed to proceed. See the comment for 888 * attemptStepDown() for more details on the rules of when stepdown attempts succeed or fail. 889 */ 890 bool _canCompleteStepDownAttempt(Date_t now, Date_t waitUntil, bool force); 891 892 /** 893 * Returns true if a node is both caught up to our last applied opTime and electable. 894 */ 895 bool _isCaughtUpAndElectable(int memberIndex, OpTime lastApplied); 896 897 void _stepDownSelfAndReplaceWith(int newPrimary); 898 899 /** 900 * Looks up the provided member in the blacklist and returns true if the member's blacklist 901 * expire time is after 'now'. If the member is found but the expire time is before 'now', 902 * the function returns false. If the member is not found in the blacklist, the function 903 * returns false. 904 **/ 905 bool _memberIsBlacklisted(const MemberConfig& memberConfig, Date_t now) const; 906 907 /** 908 * Returns true if we are a one-node replica set, we're the one member, 909 * we're electable, we're not in maintenance mode, and we are currently in followerMode 910 * SECONDARY. 911 * 912 * This is used to decide if we should transition to Role::candidate in a one-node replica set. 
913 */ 914 bool _isElectableNodeInSingleNodeReplicaSet() const; 915 916 // This node's role in the replication protocol. 917 Role _role; 918 919 // This is a unique id that is generated and set each time we transition to PRIMARY, as the 920 // result of an election. 921 OID _electionId; 922 // The time at which the current PRIMARY was elected. 923 Timestamp _electionTime; 924 925 // This node's election term. The term is used as part of the consensus algorithm to elect 926 // and maintain one primary (leader) node in the cluster. 927 long long _term; 928 929 // the index of the member we currently believe is primary, if one exists, otherwise -1 930 int _currentPrimaryIndex; 931 932 // the hostandport we are currently syncing from 933 // empty if no sync source (we are primary, or we cannot connect to anyone yet) 934 HostAndPort _syncSource; 935 // These members are not chosen as sync sources for a period of time, due to connection 936 // issues with them 937 std::map<HostAndPort, Date_t> _syncSourceBlacklist; 938 // The next sync source to be chosen, requested via a replSetSyncFrom command 939 int _forceSyncSourceIndex; 940 941 // Options for this TopologyCoordinator 942 Options _options; 943 944 // "heartbeat message" 945 // sent in requestHeartbeat respond in field "hbm" 946 std::string _hbmsg; 947 Date_t _hbmsgTime; // when it was logged 948 949 // heartbeat msg to send to others; descriptive diagnostic info 950 std::string _getHbmsg(Date_t now) const; 951 952 int _selfIndex; // this node's index in _members and _currentConfig 953 954 ReplSetConfig _rsConfig; // The current config, including a vector of MemberConfigs 955 956 // Heartbeat, current applied/durable optime, and other state data for each member. It is 957 // guaranteed that this vector will be maintained in the same order as the MemberConfigs in 958 // _currentConfig, therefore the member config index can be used to index into this vector as 959 // well. 
960 std::vector<MemberData> _memberData; 961 962 // Time when stepDown command expires 963 Date_t _stepDownUntil; 964 965 // A time before which this node will not stand for election. 966 // In protocol version 1, this is used to prevent running for election after seeing 967 // a new term. 968 Date_t _electionSleepUntil; 969 970 // OpTime of the latest committed operation. 971 OpTime _lastCommittedOpTime; 972 973 // OpTime representing our transition to PRIMARY and the start of our term. 974 // _lastCommittedOpTime cannot be set to an earlier OpTime. 975 OpTime _firstOpTimeOfMyTerm; 976 977 // The number of calls we have had to enter maintenance mode 978 int _maintenanceModeCalls; 979 980 // The sub-mode of follower that we are in. Legal values are RS_SECONDARY, RS_RECOVERING, 981 // RS_STARTUP2 (initial sync) and RS_ROLLBACK. Only meaningful if _role == Role::follower. 982 // Configured via setFollowerMode(). If the sub-mode is RS_SECONDARY, then the effective 983 // sub-mode is either RS_SECONDARY or RS_RECOVERING, depending on _maintenanceModeCalls. 984 // Rather than accesing this variable direclty, one should use the getMemberState() method, 985 // which computes the replica set node state on the fly. 986 MemberState::MS _followerMode; 987 988 // What type of PRIMARY this node currently is. Don't set this directly, call _setLeaderMode 989 // instead. 
990 LeaderMode _leaderMode = LeaderMode::kNotLeader; 991 992 typedef std::map<HostAndPort, PingStats> PingMap; 993 // Ping stats for each member by HostAndPort; 994 PingMap _pings; 995 996 // Last vote info from the election 997 struct VoteLease { 998 static const Seconds leaseTime; 999 1000 Date_t when; 1001 int whoId = -1; 1002 HostAndPort whoHostAndPort; 1003 } _voteLease; 1004 1005 // V1 last vote info for elections 1006 LastVote _lastVote{OpTime::kInitialTerm, -1}; 1007 1008 enum class ReadCommittedSupport { 1009 kUnknown, 1010 kNo, 1011 kYes, 1012 }; 1013 1014 // Whether or not the storage engine supports read committed. 1015 ReadCommittedSupport _storageEngineSupportsReadCommitted{ReadCommittedSupport::kUnknown}; 1016 }; 1017 1018 /** 1019 * A PingStats object stores data about heartbeat attempts to a particular target node. Over the 1020 * course of its lifetime, it may be used for multiple rounds of heartbeats. This allows for the 1021 * collection of statistics like average heartbeat latency to a target. The heartbeat latency 1022 * measurement it stores for each replica set member is an average weighted 80% to the old value, 1023 * and 20% to the new value. 1024 * 1025 */ 1026 class TopologyCoordinator::PingStats { 1027 public: 1028 /** 1029 * Starts a new round of heartbeat attempts by transitioning to 'TRYING' and resetting the 1030 * failure count. Also Records that a new heartbeat request started at "now". 1031 */ 1032 void start(Date_t now); 1033 1034 /** 1035 * Records that a heartbeat request completed successfully, and that "millis" milliseconds 1036 * were spent for a single network roundtrip plus remote processing time. 1037 */ 1038 void hit(Milliseconds millis); 1039 1040 /** 1041 * Records that a heartbeat request failed. 1042 */ 1043 void miss(); 1044 1045 /** 1046 * Gets the number of hit() calls. 
1047 */ getCount()1048 unsigned int getCount() const { 1049 return hitCount; 1050 } 1051 1052 /** 1053 * Gets the weighted average round trip time for heartbeat messages to the target. 1054 * Returns 0 if there have been no pings recorded yet. 1055 */ getMillis()1056 Milliseconds getMillis() const { 1057 return averagePingTimeMs == UninitializedPingTime ? Milliseconds(0) : averagePingTimeMs; 1058 } 1059 1060 /** 1061 * Gets the date at which start() was last called, which is used to determine if 1062 * a heartbeat should be retried or if the time limit has expired. 1063 */ getLastHeartbeatStartDate()1064 Date_t getLastHeartbeatStartDate() const { 1065 return _lastHeartbeatStartDate; 1066 } 1067 1068 /** 1069 * Returns true if the number of failed heartbeats for the most recent round of attempts has 1070 * exceeded the max number of heartbeat retries. 1071 */ failed()1072 bool failed() const { 1073 return _state == FAILED; 1074 } 1075 1076 /** 1077 * Returns true if a good heartbeat has been received for the most recent round of heartbeat 1078 * attempts before the maximum number of retries has been exceeded. Returns false otherwise. 1079 */ succeeded()1080 bool succeeded() const { 1081 return _state == SUCCEEDED; 1082 } 1083 1084 /** 1085 * Returns true if a heartbeat attempt is currently in progress and there are still retries 1086 * left. 1087 */ trying()1088 bool trying() const { 1089 return _state == TRYING; 1090 } 1091 1092 /** 1093 * Returns true if 'start' has never been called on this instance of PingStats. Otherwise 1094 * returns false. 1095 */ uninitialized()1096 bool uninitialized() const { 1097 return _state == UNINITIALIZED; 1098 } 1099 1100 /** 1101 * Gets the number of retries left for this heartbeat attempt. Invalid to call if the current 1102 * state is 'UNINITIALIZED'. 
1103 */ retriesLeft()1104 int retriesLeft() const { 1105 return kMaxHeartbeatRetries - _numFailuresSinceLastStart; 1106 } 1107 1108 private: 1109 /** 1110 * Represents the current state of this PingStats object. 1111 * 1112 * At creation time, a PingStats object is in the 'UNINITIALIZED' state, and will remain so 1113 * until the first heartbeat attempt is initiated. Heartbeat attempts are initiated by calls to 1114 * 'start', which puts the object into 'TRYING' state. If all heartbeat retries are used up 1115 * before receiving a good response, it will enter the 'FAILED' state. If a good heartbeat 1116 * response is received before exceeding the maximum number of retries, the object enters the 1117 * 'SUCCEEDED' state. From either the 'SUCCEEDED' or 'FAILED' state, the object can go back into 1118 * 'TRYING' state, to begin a new heartbeat attempt. The following is a simple state transition 1119 * table illustrating this behavior: 1120 * 1121 * UNINITIALIZED: [TRYING] 1122 * TRYING: [SUCCEEDED, FAILED] 1123 * SUCCEEDED: [TRYING] 1124 * FAILED: [TRYING] 1125 * 1126 */ 1127 enum HeartbeatState { UNINITIALIZED, TRYING, SUCCEEDED, FAILED }; 1128 1129 // The current state of this PingStats object. 1130 HeartbeatState _state = UNINITIALIZED; 1131 1132 // Represents the uninitialized value of a counter that should only ever be >=0 after 1133 // initialization. 1134 static constexpr int UninitializedCount{-1}; 1135 1136 // The value of 'averagePingTimeMs' before any good heartbeats have been received. 1137 static constexpr Milliseconds UninitializedPingTime{UninitializedCount}; 1138 1139 // The number of successful heartbeats that have ever been received i.e. the total number of 1140 // calls to 'PingStats::hit'. 1141 unsigned int hitCount = 0; 1142 1143 // The running, weighted average round trip time for heartbeat messages to the target node. 1144 // Weighted 80% to the old round trip ping time, and 20% to the new round trip ping time. 
1145 Milliseconds averagePingTimeMs = UninitializedPingTime; 1146 1147 // The time of the most recent call to 'PingStats::start'. 1148 Date_t _lastHeartbeatStartDate; 1149 1150 // The number of failed heartbeat attempts since the most recent call to 'PingStats::start'. 1151 int _numFailuresSinceLastStart = UninitializedCount; 1152 }; 1153 1154 // 1155 // Convenience method for unittest code. Please use accessors otherwise. 1156 // 1157 1158 std::ostream& operator<<(std::ostream& os, TopologyCoordinator::Role role); 1159 std::ostream& operator<<(std::ostream& os, TopologyCoordinator::PrepareFreezeResponseResult result); 1160 1161 } // namespace repl 1162 } // namespace mongo 1163