1 
2 /**
3  *    Copyright (C) 2018-present MongoDB, Inc.
4  *
5  *    This program is free software: you can redistribute it and/or modify
6  *    it under the terms of the Server Side Public License, version 1,
7  *    as published by MongoDB, Inc.
8  *
9  *    This program is distributed in the hope that it will be useful,
10  *    but WITHOUT ANY WARRANTY; without even the implied warranty of
11  *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  *    Server Side Public License for more details.
13  *
14  *    You should have received a copy of the Server Side Public License
15  *    along with this program. If not, see
16  *    <http://www.mongodb.com/licensing/server-side-public-license>.
17  *
18  *    As a special exception, the copyright holders give permission to link the
19  *    code of portions of this program with the OpenSSL library under certain
20  *    conditions as described in each individual source file and distribute
21  *    linked combinations including the program with the OpenSSL library. You
22  *    must comply with the Server Side Public License in all respects for
23  *    all of the code used other than as permitted herein. If you modify file(s)
24  *    with this exception, you may extend this exception to your version of the
25  *    file(s), but you are not obligated to do so. If you do not wish to do so,
26  *    delete this exception statement from your version. If you delete this
27  *    exception statement from all source files in the program, then also delete
28  *    it in the license file.
29  */
30 
31 #pragma once
32 
33 #include <iosfwd>
34 #include <string>
35 
36 #include "mongo/base/disallow_copying.h"
37 #include "mongo/db/repl/last_vote.h"
38 #include "mongo/db/repl/repl_set_heartbeat_response.h"
39 #include "mongo/db/repl/replication_coordinator.h"
40 #include "mongo/db/repl/split_horizon.h"
41 #include "mongo/db/server_options.h"
42 #include "mongo/stdx/functional.h"
43 #include "mongo/util/net/hostandport.h"
44 #include "mongo/util/time_support.h"
45 
46 namespace mongo {
47 class Timestamp;
48 
49 namespace repl {
50 class HeartbeatResponseAction;
51 class MemberData;
52 class OpTime;
53 class ReplSetHeartbeatArgs;
54 class ReplSetConfig;
55 class TagSubgroup;
56 struct MemberState;
57 
58 // Upper bound on how many times a failed heartbeat is retried.
59 constexpr int kMaxHeartbeatRetries = 2;
60 
61 /**
62  * Replication Topology Coordinator
63  *
64  * This object is responsible for managing the topology of the cluster.
65  * Tasks include consensus and leader election, chaining, and configuration management.
66  * Methods of this class should be non-blocking.
67  */
68 class TopologyCoordinator {
69     MONGO_DISALLOW_COPYING(TopologyCoordinator);
70 
71 public:
72     /**
73      * Type that denotes the role of a node in the replication protocol.
74      *
75      * The role is distinct from MemberState, in that it only deals with the
76      * roles a node plays in the basic protocol -- leader, follower and candidate.
77      * The mapping between MemberState and Role is not one-to-one: several MemberStates
78      * map to the follower role, while MemberState::RS_SECONDARY, for example, maps to
79      * either the follower or the candidate role.
80      */
81     enum class Role { kLeader = 0, kFollower = 1, kCandidate = 2 };
82 
83     struct Options {
84         // Lag threshold: once a sync source falls further behind than this, it is re-evaluated.
85         Seconds maxSyncSourceLagSecs = Seconds(0);
86 
87         // Cluster role this node runs with (e.g. config server); ClusterRole::None by default.
88         ClusterRole clusterRole = ClusterRole::None;
89     };
90 
91     /**
92      * Constructs a Topology Coordinator object from the given "options".
93      */
94     TopologyCoordinator(Options options);
95 
96 
97     ~TopologyCoordinator();
98 
99     ////////////////////////////////////////////////////////////
100     //
101     // State inspection methods.
102     //
103     ////////////////////////////////////////////////////////////
104 
105     /**
106      * Gets the role of this member in the replication protocol.
107      */
108     Role getRole() const;
109 
110     /**
111      * Gets the MemberState of this member in the replica set.
112      */
113     MemberState getMemberState() const;
114 
115     /**
116      * Returns whether this node should be allowed to accept writes.
117      */
118     bool canAcceptWrites() const;
119 
120     /**
121      * Returns true if this node is in the process of stepping down.  Note that this can be
122      * due to an unconditional stepdown that must succeed (for instance from learning about a new
123      * term) or due to a stepdown attempt that could fail (for instance from a stepdown cmd that
124      * could fail if not enough nodes are caught up).
125      */
126     bool isSteppingDown() const;
127 
128     /**
129      * Returns the address of the current sync source, or an empty HostAndPort if there is no
130      * current sync source.
131      */
132     HostAndPort getSyncSourceAddress() const;
133 
134     /**
135      * Retrieves a vector of HostAndPorts containing all nodes that are neither DOWN nor
136      * ourself.
137      */
138     std::vector<HostAndPort> getMaybeUpHostAndPorts() const;
139 
140     /**
141      * Gets the earliest time the current node will stand for election.
142      */
143     Date_t getStepDownTime() const;
144 
145     /**
146      * Gets the current value of the maintenance mode counter.
147      */
148     int getMaintenanceCount() const;
149 
150     /**
151      * Gets the latest term this member is aware of. If this member is the primary,
152      * it's the current term of the replica set.
153      */
154     long long getTerm() const;
155     // Possible outcomes of updateTerm().
156     enum class UpdateTermResult { kAlreadyUpToDate, kTriggerStepDown, kUpdatedTerm };
157 
158     ////////////////////////////////////////////////////////////
159     //
160     // Basic state manipulation methods.
161     //
162     ////////////////////////////////////////////////////////////
163 
164     /**
165      * Sets the latest term this member is aware of to the higher of its current value and
166      * the value passed in as "term".
167      * Returns the result of setting the term value, or if a stepdown should be triggered.
168      */
169     UpdateTermResult updateTerm(long long term, Date_t now);
170 
171     /**
172      * Sets the index into the config used when we next choose a sync source
173      */
174     void setForceSyncSourceIndex(int index);
175     // Chaining policy passed to chooseNewSyncSource().
176     enum class ChainingPreference { kAllowChaining, kUseConfiguration };
177 
178     /**
179      * Chooses and sets a new sync source, based on our current knowledge of the world.
180      */
181     HostAndPort chooseNewSyncSource(Date_t now,
182                                     const OpTime& lastOpTimeFetched,
183                                     ChainingPreference chainingPreference);
184 
185     /**
186      * Suppresses selecting "host" as sync source until "until".
187      */
188     void blacklistSyncSource(const HostAndPort& host, Date_t until);
189 
190     /**
191      * Removes a single entry "host" from the list of potential sync sources which we
192      * have blacklisted, if it is supposed to be unblacklisted by "now".
193      */
194     void unblacklistSyncSource(const HostAndPort& host, Date_t now);
195 
196     /**
197      * Clears the list of potential sync sources we have blacklisted.
198      */
199     void clearSyncSourceBlacklist();
200 
201     /**
202      * Determines if a new sync source should be chosen, if a better candidate sync source is
203      * available.  If the current sync source's last optime ("syncSourceLastOpTime" under
204      * protocolVersion 1, but pulled from the MemberData in protocolVersion 0) is more than
205      * _maxSyncSourceLagSecs behind any syncable source, this function returns true. If we are
206      * running in ProtocolVersion 1, our current sync source is not primary, has no sync source
207      * ("syncSourceHasSyncSource" is false), and only has data up to "myLastOpTime", returns true.
208      * NOTE(review): the quoted names above are not parameters of this function; presumably they
209      * are derived from "replMetadata"/"oqMetadata" -- confirm and update this description.
210      * "now" is used to skip over currently blacklisted sync sources.
211      * TODO (SERVER-27668): Make OplogQueryMetadata non-optional in mongodb 3.8.
212      */
213     bool shouldChangeSyncSource(const HostAndPort& currentSource,
214                                 const rpc::ReplSetMetadata& replMetadata,
215                                 boost::optional<rpc::OplogQueryMetadata> oqMetadata,
216                                 Date_t now) const;
217 
218     /**
219      * Checks whether we are a single node set and we are not in a stepdown period.  If so,
220      * puts us into candidate mode, otherwise does nothing.  This is used to ensure that
221      * nodes in a single node replset become primary again when their stepdown period ends.
222      */
223     bool becomeCandidateIfStepdownPeriodOverAndSingleNodeSet(Date_t now);
224 
225     /**
226      * Sets the earliest time the current node will stand for election to "newTime".
227      *
228      * Until this time, while the node may report itself as electable, it will not stand
229      * for election.
230      */
231     void setElectionSleepUntil(Date_t newTime);
232 
233     /**
234      * Sets the reported mode of this node to one of RS_SECONDARY, RS_STARTUP2, RS_ROLLBACK or
235      * RS_RECOVERING, when getRole() == Role::kFollower.  This is the interface by which the
236      * applier changes the reported member state of the current node, and enables or suppresses
237      * electability of the current node.  All modes but RS_SECONDARY indicate an unelectable
238      * follower state (one that cannot transition to candidate).
239      */
240     void setFollowerMode(MemberState::MS newMode);
241 
242     /**
243      * Scan the memberData and determine the highest last applied or last
244      * durable optime present on a majority of servers; set _lastCommittedOpTime to this
245      * new entry.
246      * Whether the last applied or last durable op time is used depends on whether
247      * the config getWriteConcernMajorityShouldJournal is set.
248      * Returns true if the _lastCommittedOpTime was changed.
249      */
250     bool updateLastCommittedOpTime();
251 
252     /**
253      * Updates _lastCommittedOpTime to min(committedOpTime, lastApplied) if it is more recent than
254      * the current last committed OpTime.  Returns true if _lastCommittedOpTime is changed.
255      */
256     bool advanceLastCommittedOpTime(OpTime committedOpTime);
257 
258     /**
259      * Resets _lastCommittedOpTime to OpTime(), the default value at startup.
260      * Used on PV downgrade to forget the OpTimes in PV1.
261      */
262     void resetLastCommittedOpTime();
263 
264     /**
265      * Returns the OpTime of the latest majority-committed op known to this server.
266      */
267     OpTime getLastCommittedOpTime() const;
268 
269     /**
270      * Returns true if it's safe to transition to LeaderMode::kMaster.
271      */
272     bool canCompleteTransitionToPrimary(long long termWhenDrainCompleted) const;
273 
274     /**
275      * Called by the ReplicationCoordinator to signal that we have finished catchup and drain modes
276      * and are ready to fully become primary and start accepting writes.
277      * "firstOpTimeOfTerm" is a floor on the OpTimes this node will be allowed to consider committed
278      * for this tenure as primary. This prevents entries from before our election from counting as
279      * committed in our view, until our election (the "firstOpTimeOfTerm" op) has been committed.
280      * Returns PrimarySteppedDown if this node is no longer eligible to begin accepting writes.
281      */
282     Status completeTransitionToPrimary(const OpTime& firstOpTimeOfTerm);
283 
284     /**
285      * Adjusts the maintenance mode count by "inc".
286      *
287      * It is an error to call this method if getRole() does not return Role::kFollower.
288      * It is an error to allow the maintenance count to go negative.
289      */
290     void adjustMaintenanceCountBy(int inc);
291 
292     ////////////////////////////////////////////////////////////
293     //
294     // Methods that prepare responses to command requests.
295     //
296     ////////////////////////////////////////////////////////////
297 
298     // Produces a reply to a replSetSyncFrom command.
299     void prepareSyncFromResponse(const HostAndPort& target,
300                                  BSONObjBuilder* response,
301                                  Status* result);
302 
303     // Produces a reply to a replSetFresh command.
304     void prepareFreshResponse(const ReplicationCoordinator::ReplSetFreshArgs& args,
305                               Date_t now,
306                               BSONObjBuilder* response,
307                               Status* result);
308 
309     // Produces a reply to a received electCmd.
310     void prepareElectResponse(const ReplicationCoordinator::ReplSetElectArgs& args,
311                               Date_t now,
312                               BSONObjBuilder* response,
313                               Status* result);
314 
315     // Produces a reply to a heartbeat.
316     Status prepareHeartbeatResponse(Date_t now,
317                                     const ReplSetHeartbeatArgs& args,
318                                     const std::string& ourSetName,
319                                     ReplSetHeartbeatResponse* response);
320 
321     // Produces a reply to a V1 heartbeat.
322     Status prepareHeartbeatResponseV1(Date_t now,
323                                       const ReplSetHeartbeatArgsV1& args,
324                                       const std::string& ourSetName,
325                                       ReplSetHeartbeatResponse* response);
326     // Inputs to prepareStatusResponse(); the reference members must outlive this object's use.
327     struct ReplSetStatusArgs {
328         Date_t now;
329         unsigned selfUptime;
330         const OpTime& readConcernMajorityOpTime;
331         const BSONObj& initialSyncStatus;
332     };
333 
334     // Produces a reply to a status request.
335     void prepareStatusResponse(const ReplSetStatusArgs& rsStatusArgs,
336                                BSONObjBuilder* response,
337                                Status* result);
338 
339     // Produce a replSetUpdatePosition command to be sent to the node's sync source.
340     StatusWith<BSONObj> prepareReplSetUpdatePositionCommand(
341         OpTime currentCommittedSnapshotOpTime) const;
342 
343     // Produce a reply to an ismaster request.  It is only valid to call this if we are a
344     // replset.  Drivers interpret the isMaster fields according to the Server Discovery and
345     // Monitoring Spec, see the "Parsing an isMaster response" section.
346     void fillIsMasterForReplSet(IsMasterResponse* response,
347                                 const SplitHorizon::Parameters& horizonParams);
348 
349     // Produce member data for the serverStatus command and diagnostic logging.
350     void fillMemberData(BSONObjBuilder* result);
351     // Action reported by a successful prepareFreezeResponse() call.
352     enum class PrepareFreezeResponseResult { kNoAction, kSingleNodeSelfElect };
353 
354     /**
355      * Produce a reply to a freeze request. Returns a PrepareFreezeResponseResult on success that
356      * may trigger state changes in the caller.
357      */
358     StatusWith<PrepareFreezeResponseResult> prepareFreezeResponse(Date_t now,
359                                                                   int secs,
360                                                                   BSONObjBuilder* response);
361 
362     ////////////////////////////////////////////////////////////
363     //
364     // Methods for sending and receiving heartbeats,
365     // reconfiguring and handling the results of standing for
366     // election.
367     //
368     ////////////////////////////////////////////////////////////
369 
370     /**
371      * Updates the topology coordinator's notion of the replica set configuration.
372      *
373      * "newConfig" is the new configuration, and "selfIndex" is the index of this
374      * node's configuration information in "newConfig", or "selfIndex" is -1 to
375      * indicate that this node is not a member of "newConfig".
376      *
377      * newConfig.isInitialized() should be true, though implementations may accept
378      * configurations where this is not true, for testing purposes.
379      */
380     void updateConfig(const ReplSetConfig& newConfig, int selfIndex, Date_t now);
381 
382     /**
383      * Prepares a heartbeat request appropriate for sending to "target", assuming the
384      * current time is "now".  "ourSetName" is used as the name for our replica set if
385      * the topology coordinator does not have a valid configuration installed.
386      *
387      * The returned pair contains proper arguments for a replSetHeartbeat command, and
388      * an amount of time to wait for the response.
389      *
390      * This call should be paired (with intervening network communication) with a call to
391      * processHeartbeatResponse for the same "target".
392      */
393     std::pair<ReplSetHeartbeatArgs, Milliseconds> prepareHeartbeatRequest(
394         Date_t now, const std::string& ourSetName, const HostAndPort& target);
395     std::pair<ReplSetHeartbeatArgsV1, Milliseconds> prepareHeartbeatRequestV1(
396         Date_t now, const std::string& ourSetName, const HostAndPort& target);
397 
398     /**
399      * Processes a heartbeat response from "target" that arrived around "now", having
400      * spent "networkRoundTripTime" millis on the network.
401      *
402      * Updates internal topology coordinator state, and returns instructions about what action
403      * to take next.
404      *
405      * If the next action indicates StartElection, the topology coordinator has transitioned to
406      * the "candidate" role, and will remain there until processWinElection or
407      * processLoseElection are called.
408      *
409      * If the next action indicates "StepDownSelf", the topology coordinator has transitioned
410      * to the "follower" role from "leader", and the caller should take any necessary actions
411      * to become a follower.
412      *
413      * If the next action indicates "StepDownRemotePrimary", the caller should take steps to
414      * cause the specified remote host to step down from primary to secondary.
415      *
416      * If the next action indicates "Reconfig", the caller should verify the configuration in
417      * hbResponse is acceptable, perform any other reconfiguration actions it must, and call
418      * updateConfig with the new configuration and the appropriate value for "selfIndex".  It
419      * must also wrap up any outstanding elections (by calling processLoseElection or
420      * processWinElection) before calling updateConfig.
421      *
422      * This call should be paired (with intervening network communication) with a call to
423      * prepareHeartbeatRequest for the same "target".
424      */
425     HeartbeatResponseAction processHeartbeatResponse(
426         Date_t now,
427         Milliseconds networkRoundTripTime,
428         const HostAndPort& target,
429         const StatusWith<ReplSetHeartbeatResponse>& hbResponse);
430 
431     /**
432      * Returns whether or not at least 'numNodes' have reached the given opTime.
433      * "durablyWritten" indicates whether the operation has to be durably applied.
434      */
435     bool haveNumNodesReachedOpTime(const OpTime& opTime, int numNodes, bool durablyWritten);
436 
437     /**
438      * Returns whether or not at least one node matching the tagPattern has reached
439      * the given opTime.
440      * "durablyWritten" indicates whether the operation has to be durably applied.
441      */
442     bool haveTaggedNodesReachedOpTime(const OpTime& opTime,
443                                       const ReplSetTagPattern& tagPattern,
444                                       bool durablyWritten);
445 
446     /**
447      * Returns a vector of members that have applied the operation with OpTime 'op'.
448      * "durablyWritten" indicates whether the operation has to be durably applied.
449      * "skipSelf" means to exclude this node whether or not the op has been applied.
450      */
451     std::vector<HostAndPort> getHostsWrittenTo(const OpTime& op,
452                                                bool durablyWritten,
453                                                bool skipSelf);
454 
455     /**
456      * Marks a member as down from our perspective and returns a bool which indicates if we can no
457      * longer see a majority of the nodes and thus should step down.
458      */
459     bool setMemberAsDown(Date_t now, const int memberIndex);
460 
461     /**
462      * Goes through the memberData and determines which member that is currently live
463      * has the stalest (earliest) last update time.  Returns (-1, Date_t::max()) if there are
464      * no other members.
465      */
466     std::pair<int, Date_t> getStalestLiveMember() const;
467 
468     /**
469      * Go through the memberData, and mark nodes which haven't been updated
470      * recently (within an election timeout) as "down".  Returns a HeartbeatResponseAction, which
471      * will be StepDownSelf if we can no longer see a majority of the nodes, otherwise NoAction.
472      */
473     HeartbeatResponseAction checkMemberTimeouts(Date_t now);
474 
475     /**
476      * Set all nodes in memberData to not stale with a lastUpdate of "now".
477      */
478     void resetAllMemberTimeouts(Date_t now);
479 
480     /**
481      * Set all nodes in memberData that are present in member_set
482      * to not stale with a lastUpdate of "now".
483      */
484     void resetMemberTimeouts(Date_t now, const stdx::unordered_set<HostAndPort>& member_set);
485 
486     /**
487      * Returns the last optime that this node has applied, whether or not it has been journaled.
488      */
489     OpTime getMyLastAppliedOpTime() const;
490 
491     /**
492      * Returns the last optime that this node has applied and journaled.
493      */
494     OpTime getMyLastDurableOpTime() const;
495 
496     /**
497      * Returns information we have on the state of this node.
498      */
499     MemberData* getMyMemberData();
500 
501     /**
502      * Returns information we have on the state of the node identified by memberId.  Returns
503      * nullptr if memberId is not found in the configuration.
504      */
505     MemberData* findMemberDataByMemberId(const int memberId);
506 
507     /**
508      * Returns information we have on the state of the node identified by rid.  Returns
509      * nullptr if rid is not found in the heartbeat data.  This method is used only for
510      * master/slave replication.
511      */
512     MemberData* findMemberDataByRid(const OID rid);
513 
514     /**
515      * Adds and returns a memberData entry for the given RID.
516      * Used only in master/slave mode.
517      */
518     MemberData* addSlaveMemberData(const OID rid);
519 
520     /**
521      * If getRole() == Role::kCandidate and this node has not voted too recently, updates the
522      * lastVote tracker and returns true.  Otherwise, returns false.
523      */
524     bool voteForMyself(Date_t now);
525 
526     /**
527      * Sets lastVote to be for ourself in this term.
528      */
529     void voteForMyselfV1();
530 
531     /**
532      * Sets election id and election optime.
533      */
534     void setElectionInfo(OID electionId, Timestamp electionOpTime);
535 
536     /**
537      * Performs state updates associated with winning an election.
538      *
539      * It is an error to call this if the topology coordinator is not in candidate mode.
540      *
541      * Exactly one of either processWinElection or processLoseElection must be called if
542      * processHeartbeatResponse returns StartElection, to exit candidate mode.
543      */
544     void processWinElection(OID electionId, Timestamp electionOpTime);
545 
546     /**
547      * Performs state updates associated with losing an election.
548      *
549      * It is an error to call this if the topology coordinator is not in candidate mode.
550      *
551      * Exactly one of either processWinElection or processLoseElection must be called if
552      * processHeartbeatResponse returns StartElection, to exit candidate mode.
553      */
554     void processLoseElection();
555 
556     // Callback returned by prepareForStepDownAttempt(); calling it aborts the pending attempt.
557     using StepDownAttemptAbortFn = stdx::function<void()>;
558     /**
559      * Readies the TopologyCoordinator for an attempt to stepdown that may fail.  This is used
560      * when we receive a stepdown command (which can fail if not enough secondaries are caught up)
561      * to ensure that we never process more than one stepdown request at a time.
562      * Returns OK if it is safe to continue with the stepdown attempt, or returns:
563      * - NotMaster if this node is not a leader.
564      * - ConflictingOperationInProgress if this node is already processing a stepdown request of
565      *   any kind.
566      * On an OK return status also returns a function object that can be called to abort the
567      * pending stepdown attempt and return this node to normal primary/master state.
568      */
569     StatusWith<StepDownAttemptAbortFn> prepareForStepDownAttempt();
570 
571     /**
572      * Tries to transition the coordinator from the leader role to the follower role.
573      *
574      * A step down succeeds based on the following conditions:
575      *
576      *      C1. 'force' is true and now > waitUntil
577      *
578      *      C2. A majority set of nodes, M, in the replica set have optimes greater than or
579      *      equal to the last applied optime of the primary.
580      *
581      *      C3. There exists at least one electable secondary node in the majority set M.
582      *
583      *
584      * If C1 is true, or if both C2 and C3 are true, then the stepdown occurs and this method
585      * returns true. If the conditions for successful stepdown aren't met yet, but waiting for more
586      * time to pass could make it succeed, returns false.  If the whole stepdown attempt should be
587      * abandoned (for example because the time limit expired or because we've already stepped down),
588      * throws an exception.
589      * TODO(spencer): Unify with the finishUnconditionalStepDown() method.
590      */
591     bool attemptStepDown(
592         long long termAtStart, Date_t now, Date_t waitUntil, Date_t stepDownUntil, bool force);
593 
594     /**
595      * Returns whether it is safe for a stepdown attempt to complete, ignoring the 'force' argument.
596      * This is essentially checking conditions C2 and C3 as described in the comment to
597      * attemptStepDown().
598      */
599     bool isSafeToStepDown();
600 
601     /**
602      * Readies the TopologyCoordinator for stepdown.  Returns false if we're already in the process
603      * of an unconditional step down.  If we are in the middle of a stepdown command attempt when
604      * this is called then this unconditional stepdown will supersede the stepdown attempt, which
605      * will cause the stepdown to fail.  When this returns true it must be followed by a call to
606      * finishUnconditionalStepDown() that is called when holding the global X lock.
607      */
608     bool prepareForUnconditionalStepDown();
609 
610     /**
611      * Sometimes a request to step down comes in (like via a heartbeat), but we don't have the
612      * global exclusive lock so we can't actually stepdown at that moment. When that happens
613      * we record that a stepdown request is pending (by calling prepareForUnconditionalStepDown())
614      * and schedule work to stepdown in the global X lock.  This method is called after holding the
615      * global lock to perform the actual stepdown.
616      * TODO(spencer): Unify with the finishAttemptedStepDown() method.
617      */
618     void finishUnconditionalStepDown();
619 
620     /**
621      * Returns the index of the most suitable candidate for an election handoff. The node must be
622      * caught up and electable. Ties are resolved first by highest priority, then by lowest member
623      * id.
624      */
625     int chooseElectionHandoffCandidate();
626 
627     /**
628      * Considers whether or not this node should stand for election at time "now" and reports
629      * the outcome as a Status.  Declared const, so it is not expected to change the role itself.
630      */
631     Status checkShouldStandForElection(Date_t now) const;
632 
633     /**
634      * Set the outgoing heartbeat message from self
635      */
636     void setMyHeartbeatMessage(const Date_t now, const std::string& s);
637 
638     /**
639      * Prepares a ReplSetMetadata object describing the current term, primary, and lastOp
640      * information.
641      */
642     rpc::ReplSetMetadata prepareReplSetMetadata(const OpTime& lastVisibleOpTime) const;
643 
644     /**
645      * Prepares an OplogQueryMetadata object describing the current sync source, rbid, primary,
646      * lastOpApplied, and lastOpCommitted.
647      */
648     rpc::OplogQueryMetadata prepareOplogQueryMetadata(int rbid) const;
649 
650     /**
651      * Writes into 'output' all the information needed to generate a summary of the current
652      * replication state for use by the web interface.
653      */
654     void summarizeAsHtml(ReplSetHtmlSummary* output);
655 
656     /**
657      * Prepares a ReplSetRequestVotesResponse.
658      */
659     void processReplSetRequestVotes(const ReplSetRequestVotesArgs& args,
660                                     ReplSetRequestVotesResponse* response);
661 
662     /**
663      * Loads an initial LastVote document, which was read from local storage.
664      *
665      * Called only during replication startup. All other updates are done internally.
666      */
667     void loadLastVote(const LastVote& lastVote);
668 
669     /**
670      * Updates the current primary index.
671      */
672     void setPrimaryIndex(long long primaryIndex);
673 
674     /**
675      * Returns the current primary index.
676      */
677     int getCurrentPrimaryIndex() const;
678     // Reasons for starting an election, as passed to becomeCandidateIfElectable().
679     enum StartElectionReason {
680         kElectionTimeout,
681         kPriorityTakeover,
682         kStepUpRequest,
683         kStepUpRequestSkipDryRun,
684         kCatchupTakeover,
685         kSingleNodePromptElection
686     };
687 
688     /**
689      * Transitions to the candidate role if the node is electable.
690      */
691     Status becomeCandidateIfElectable(const Date_t now, StartElectionReason reason);
692 
693     /**
694      * Updates the storage engine read committed support in the TopologyCoordinator options after
695      * creation.
696      */
697     void setStorageEngineSupportsReadCommitted(bool supported);
698 
699     /**
700      * Reset the booleans to record the last heartbeat restart.
701      */
702     void restartHeartbeats();
703 
704     /**
705      * Scans through all members that are 'up' and return the latest known optime, if we have
706      * received (successful or failed) heartbeats from all nodes since heartbeat restart.
707      *
708      * Returns boost::none if any node hasn't responded to a heartbeat since we last restarted
709      * heartbeats.
710      * Returns OpTime(Timestamp(0, 0), 0), the smallest OpTime in PV1, if other nodes are all down.
711      */
712     boost::optional<OpTime> latestKnownOpTimeSinceHeartbeatRestart() const;
713 
714     ////////////////////////////////////////////////////////////
715     //
716     // Test support methods
717     //
718     ////////////////////////////////////////////////////////////
719 
720     // Changes _memberState to newMemberState.  Only for testing.
721     void changeMemberState_forTest(const MemberState& newMemberState,
722                                    const Timestamp& electionTime = Timestamp(0, 0));
723 
724     // Sets "_electionTime" to "newElectionTime".  Only for testing.
725     void _setElectionTime(const Timestamp& newElectionTime);
726 
727     // Sets _currentPrimaryIndex to the given index.  Should only be used in unit tests!
728     // TODO(spencer): Remove this once we can easily call for an election in unit tests to
729     // set the current primary.
730     void _setCurrentPrimaryForTest(int primaryIndex);
731 
732     // Returns _electionTime.  Only used in unittests.
733     Timestamp getElectionTime() const;
734 
735     // Returns _electionId.  Only used in unittests.
736     OID getElectionId() const;
737 
738     // Returns the name for a role.  Only used in unittests.
739     static std::string roleToString(TopologyCoordinator::Role role);
740 
741 private:
742     typedef int UnelectableReasonMask;
743     class PingStats;
744 
745     /**
746      * Different modes a node can be in while still reporting itself as in state PRIMARY.
747      *
748      * Valid transitions:
749      *
750      *       kNotLeader <----------------------------------
751      *          |                                         |
752      *          |                                         |
753      *          |                                         |
754      *          v                                         |
755      *       kLeaderElect-----------------                |
756      *          |    ^  |                |                |
757      *          |    |  |                |                |
758      *          v    |  |                |                |
759      *       kMaster --------------------------           |
760      *        |  ^   |  |                |    |           |
761      *        |  |   |  |                |    |           |
762      *        |  |   |  |                |    |           |
763      *        v  |   |  v                v    v           |
764      *  kAttemptingStepDown----------->kSteppingDown      |
765      *        |                              |            |
766      *        |                              |            |
767      *        |                              |            |
768      *        ---------------------------------------------
769      *
770      */
771     enum class LeaderMode {
772         kNotLeader,           // This node is not currently a leader.
773         kLeaderElect,         // This node has been elected leader, but can't yet accept writes.
774         kMaster,              // This node reports ismaster:true and can accept writes.
775         kSteppingDown,        // This node is in the middle of a (hb) stepdown that must complete.
776         kAttemptingStepDown,  // This node is in the middle of a stepdown (cmd) that might fail.
777     };
778 
779     enum UnelectableReason {
780         None = 0,
781         CannotSeeMajority = 1 << 0,
782         NotCloseEnoughToLatestOptime = 1 << 1,
783         ArbiterIAm = 1 << 2,
784         NotSecondary = 1 << 3,
785         NoPriority = 1 << 4,
786         StepDownPeriodActive = 1 << 5,
787         NoData = 1 << 6,
788         NotInitialized = 1 << 7,
789         VotedTooRecently = 1 << 8,
790         RefusesToStand = 1 << 9,
791         NotCloseEnoughToLatestForPriorityTakeover = 1 << 10,
792         NotFreshEnoughForCatchupTakeover = 1 << 11,
793     };
794 
795     // Set what type of PRIMARY this node currently is.
796     void _setLeaderMode(LeaderMode mode);
797 
798     // Returns the number of heartbeat pings which have occurred.
799     int _getTotalPings();
800 
801     // Returns the current "ping" value for the given member by their address
802     Milliseconds _getPing(const HostAndPort& host);
803 
804     // Determines if we will veto the member specified by "args.id".
805     // If we veto, the errmsg will be filled in with a reason
806     bool _shouldVetoMember(const ReplicationCoordinator::ReplSetFreshArgs& args,
807                            const Date_t& now,
808                            std::string* errmsg) const;
809 
810     // Returns the index of the member with the matching id, or -1 if none match.
811     int _getMemberIndex(int id) const;
812 
813     // Sees if a majority number of votes are held by members who are currently "up"
814     bool _aMajoritySeemsToBeUp() const;
815 
816     // Checks if the node can see a healthy primary of equal or greater priority to the
817     // candidate. If so, returns the index of that node. Otherwise returns -1.
818     int _findHealthyPrimaryOfEqualOrGreaterPriority(const int candidateIndex) const;
819 
820     // Is otherOpTime close enough (within 10 seconds) to the latest known optime to qualify
821     // for an election
822     bool _isOpTimeCloseEnoughToLatestToElect(const OpTime& otherOpTime) const;
823 
824     // Is our optime close enough to the latest known optime to call for a priority takeover.
825     bool _amIFreshEnoughForPriorityTakeover() const;
826 
827     // Is the primary node still in catchup mode and is our optime the latest
828     // known optime of all the up nodes.
829     bool _amIFreshEnoughForCatchupTakeover() const;
830 
831     // Returns reason why "self" member is unelectable
832     UnelectableReasonMask _getMyUnelectableReason(const Date_t now,
833                                                   StartElectionReason reason) const;
834 
835     // Returns reason why memberIndex is unelectable
836     UnelectableReasonMask _getUnelectableReason(int memberIndex) const;
837 
838     // Returns the nice text of why the node is unelectable
839     std::string _getUnelectableReasonString(UnelectableReasonMask ur) const;
840 
841     // Return true if we are currently primary
842     bool _iAmPrimary() const;
843 
844     // Scans through all members that are 'up' and return the latest known optime.
845     OpTime _latestKnownOpTime() const;
846 
847     // Scans the electable set and returns the highest priority member index
848     int _getHighestPriorityElectableIndex(Date_t now) const;
849 
850     // Returns true if "one" member is higher priority than "two" member
851     bool _isMemberHigherPriority(int memberOneIndex, int memberTwoIndex) const;
852 
853     // Helper shortcut to self config
854     const MemberConfig& _selfConfig() const;
855 
856     // Helper shortcut to self member data
857     const MemberData& _selfMemberData() const;
858 
859     // Index of self member in member data.
860     const int _selfMemberDataIndex() const;
861 
862     // Returns NULL if there is no primary, or the MemberConfig* for the current primary
863     const MemberConfig* _currentPrimaryMember() const;
864 
865     /**
866      * Performs updating "_currentPrimaryIndex" for processHeartbeatResponse(), and determines if an
867      * election or stepdown should commence.
868      * _updatePrimaryFromHBDataV1() is a simplified version of _updatePrimaryFromHBData() to be used
869      * when in ProtocolVersion1.
870      */
871     HeartbeatResponseAction _updatePrimaryFromHBData(int updatedConfigIndex,
872                                                      const MemberState& originalState,
873                                                      Date_t now);
874     HeartbeatResponseAction _updatePrimaryFromHBDataV1(int updatedConfigIndex,
875                                                        const MemberState& originalState,
876                                                        Date_t now);
877 
878     /**
879      * Updates _memberData based on the newConfig, ensuring that every member in the newConfig
880      * has an entry in _memberData.  If any nodes in the newConfig are also present in
881      * _currentConfig, copies their heartbeat info into the corresponding entry in the updated
882      * _memberData vector.
883      */
884     void _updateHeartbeatDataForReconfig(const ReplSetConfig& newConfig, int selfIndex, Date_t now);
885 
886     /**
887      * Returns whether a stepdown attempt should be allowed to proceed.  See the comment for
888      * attemptStepDown() for more details on the rules of when stepdown attempts succeed or fail.
889      */
890     bool _canCompleteStepDownAttempt(Date_t now, Date_t waitUntil, bool force);
891 
892     /**
893      * Returns true if a node is both caught up to our last applied opTime and electable.
894      */
895     bool _isCaughtUpAndElectable(int memberIndex, OpTime lastApplied);
896 
897     void _stepDownSelfAndReplaceWith(int newPrimary);
898 
899     /**
900      * Looks up the provided member in the blacklist and returns true if the member's blacklist
901      * expire time is after 'now'.  If the member is found but the expire time is before 'now',
902      * the function returns false.  If the member is not found in the blacklist, the function
903      * returns false.
904      **/
905     bool _memberIsBlacklisted(const MemberConfig& memberConfig, Date_t now) const;
906 
907     /**
908      * Returns true if we are a one-node replica set, we're the one member,
909      * we're electable, we're not in maintenance mode, and we are currently in followerMode
910      * SECONDARY.
911      *
912      * This is used to decide if we should transition to Role::candidate in a one-node replica set.
913      */
914     bool _isElectableNodeInSingleNodeReplicaSet() const;
915 
916     // This node's role in the replication protocol.
917     Role _role;
918 
919     // This is a unique id that is generated and set each time we transition to PRIMARY, as the
920     // result of an election.
921     OID _electionId;
922     // The time at which the current PRIMARY was elected.
923     Timestamp _electionTime;
924 
925     // This node's election term.  The term is used as part of the consensus algorithm to elect
926     // and maintain one primary (leader) node in the cluster.
927     long long _term;
928 
929     // the index of the member we currently believe is primary, if one exists, otherwise -1
930     int _currentPrimaryIndex;
931 
932     // the hostandport we are currently syncing from
933     // empty if no sync source (we are primary, or we cannot connect to anyone yet)
934     HostAndPort _syncSource;
935     // These members are not chosen as sync sources for a period of time, due to connection
936     // issues with them
937     std::map<HostAndPort, Date_t> _syncSourceBlacklist;
938     // The next sync source to be chosen, requested via a replSetSyncFrom command
939     int _forceSyncSourceIndex;
940 
941     // Options for this TopologyCoordinator
942     Options _options;
943 
944     // "heartbeat message"
945     // sent in requestHeartbeat respond in field "hbm"
946     std::string _hbmsg;
947     Date_t _hbmsgTime;  // when it was logged
948 
949     // heartbeat msg to send to others; descriptive diagnostic info
950     std::string _getHbmsg(Date_t now) const;
951 
952     int _selfIndex;  // this node's index in _members and _currentConfig
953 
954     ReplSetConfig _rsConfig;  // The current config, including a vector of MemberConfigs
955 
956     // Heartbeat, current applied/durable optime, and other state data for each member.  It is
957     // guaranteed that this vector will be maintained in the same order as the MemberConfigs in
958     // _currentConfig, therefore the member config index can be used to index into this vector as
959     // well.
960     std::vector<MemberData> _memberData;
961 
962     // Time when stepDown command expires
963     Date_t _stepDownUntil;
964 
965     // A time before which this node will not stand for election.
966     // In protocol version 1, this is used to prevent running for election after seeing
967     // a new term.
968     Date_t _electionSleepUntil;
969 
970     // OpTime of the latest committed operation.
971     OpTime _lastCommittedOpTime;
972 
973     // OpTime representing our transition to PRIMARY and the start of our term.
974     // _lastCommittedOpTime cannot be set to an earlier OpTime.
975     OpTime _firstOpTimeOfMyTerm;
976 
977     // The number of calls we have had to enter maintenance mode
978     int _maintenanceModeCalls;
979 
980     // The sub-mode of follower that we are in.  Legal values are RS_SECONDARY, RS_RECOVERING,
981     // RS_STARTUP2 (initial sync) and RS_ROLLBACK.  Only meaningful if _role == Role::follower.
982     // Configured via setFollowerMode().  If the sub-mode is RS_SECONDARY, then the effective
983     // sub-mode is either RS_SECONDARY or RS_RECOVERING, depending on _maintenanceModeCalls.
984     // Rather than accesing this variable direclty, one should use the getMemberState() method,
985     // which computes the replica set node state on the fly.
986     MemberState::MS _followerMode;
987 
988     // What type of PRIMARY this node currently is.  Don't set this directly, call _setLeaderMode
989     // instead.
990     LeaderMode _leaderMode = LeaderMode::kNotLeader;
991 
992     typedef std::map<HostAndPort, PingStats> PingMap;
993     // Ping stats for each member by HostAndPort;
994     PingMap _pings;
995 
996     // Last vote info from the election
997     struct VoteLease {
998         static const Seconds leaseTime;
999 
1000         Date_t when;
1001         int whoId = -1;
1002         HostAndPort whoHostAndPort;
1003     } _voteLease;
1004 
1005     // V1 last vote info for elections
1006     LastVote _lastVote{OpTime::kInitialTerm, -1};
1007 
1008     enum class ReadCommittedSupport {
1009         kUnknown,
1010         kNo,
1011         kYes,
1012     };
1013 
1014     // Whether or not the storage engine supports read committed.
1015     ReadCommittedSupport _storageEngineSupportsReadCommitted{ReadCommittedSupport::kUnknown};
1016 };
1017 
1018 /**
1019  * A PingStats object stores data about heartbeat attempts to a particular target node. Over the
1020  * course of its lifetime, it may be used for multiple rounds of heartbeats. This allows for the
1021  * collection of statistics like average heartbeat latency to a target. The heartbeat latency
1022  * measurement it stores for each replica set member is an average weighted 80% to the old value,
1023  * and 20% to the new value.
1024  *
1025  */
1026 class TopologyCoordinator::PingStats {
1027 public:
1028     /**
1029      * Starts a new round of heartbeat attempts by transitioning to 'TRYING' and resetting the
1030      * failure count. Also Records that a new heartbeat request started at "now".
1031      */
1032     void start(Date_t now);
1033 
1034     /**
1035      * Records that a heartbeat request completed successfully, and that "millis" milliseconds
1036      * were spent for a single network roundtrip plus remote processing time.
1037      */
1038     void hit(Milliseconds millis);
1039 
1040     /**
1041      * Records that a heartbeat request failed.
1042      */
1043     void miss();
1044 
1045     /**
1046      * Gets the number of hit() calls.
1047      */
getCount()1048     unsigned int getCount() const {
1049         return hitCount;
1050     }
1051 
1052     /**
1053      * Gets the weighted average round trip time for heartbeat messages to the target.
1054      * Returns 0 if there have been no pings recorded yet.
1055      */
getMillis()1056     Milliseconds getMillis() const {
1057         return averagePingTimeMs == UninitializedPingTime ? Milliseconds(0) : averagePingTimeMs;
1058     }
1059 
1060     /**
1061      * Gets the date at which start() was last called, which is used to determine if
1062      * a heartbeat should be retried or if the time limit has expired.
1063      */
getLastHeartbeatStartDate()1064     Date_t getLastHeartbeatStartDate() const {
1065         return _lastHeartbeatStartDate;
1066     }
1067 
1068     /**
1069      * Returns true if the number of failed heartbeats for the most recent round of attempts has
1070      * exceeded the max number of heartbeat retries.
1071      */
failed()1072     bool failed() const {
1073         return _state == FAILED;
1074     }
1075 
1076     /**
1077      * Returns true if a good heartbeat has been received for the most recent round of heartbeat
1078      * attempts before the maximum number of retries has been exceeded. Returns false otherwise.
1079      */
succeeded()1080     bool succeeded() const {
1081         return _state == SUCCEEDED;
1082     }
1083 
1084     /**
1085      * Returns true if a heartbeat attempt is currently in progress and there are still retries
1086      * left.
1087      */
trying()1088     bool trying() const {
1089         return _state == TRYING;
1090     }
1091 
1092     /**
1093      * Returns true if 'start' has never been called on this instance of PingStats. Otherwise
1094      * returns false.
1095      */
uninitialized()1096     bool uninitialized() const {
1097         return _state == UNINITIALIZED;
1098     }
1099 
1100     /**
1101      * Gets the number of retries left for this heartbeat attempt. Invalid to call if the current
1102      * state is 'UNINITIALIZED'.
1103     */
retriesLeft()1104     int retriesLeft() const {
1105         return kMaxHeartbeatRetries - _numFailuresSinceLastStart;
1106     }
1107 
1108 private:
1109     /**
1110      * Represents the current state of this PingStats object.
1111      *
1112      * At creation time, a PingStats object is in the 'UNINITIALIZED' state, and will remain so
1113      * until the first heartbeat attempt is initiated. Heartbeat attempts are initiated by calls to
1114      * 'start', which puts the object into 'TRYING' state. If all heartbeat retries are used up
1115      * before receiving a good response, it will enter the 'FAILED' state. If a good heartbeat
1116      * response is received before exceeding the maximum number of retries, the object enters the
1117      * 'SUCCEEDED' state. From either the 'SUCCEEDED' or 'FAILED' state, the object can go back into
1118      * 'TRYING' state, to begin a new heartbeat attempt. The following is a simple state transition
1119      * table illustrating this behavior:
1120      *
1121      * UNINITIALIZED:   [TRYING]
1122      * TRYING:          [SUCCEEDED, FAILED]
1123      * SUCCEEDED:       [TRYING]
1124      * FAILED:          [TRYING]
1125      *
1126      */
1127     enum HeartbeatState { UNINITIALIZED, TRYING, SUCCEEDED, FAILED };
1128 
1129     // The current state of this PingStats object.
1130     HeartbeatState _state = UNINITIALIZED;
1131 
1132     // Represents the uninitialized value of a counter that should only ever be >=0 after
1133     // initialization.
1134     static constexpr int UninitializedCount{-1};
1135 
1136     // The value of 'averagePingTimeMs' before any good heartbeats have been received.
1137     static constexpr Milliseconds UninitializedPingTime{UninitializedCount};
1138 
1139     // The number of successful heartbeats that have ever been received i.e. the total number of
1140     // calls to 'PingStats::hit'.
1141     unsigned int hitCount = 0;
1142 
1143     // The running, weighted average round trip time for heartbeat messages to the target node.
1144     // Weighted 80% to the old round trip ping time, and 20% to the new round trip ping time.
1145     Milliseconds averagePingTimeMs = UninitializedPingTime;
1146 
1147     // The time of the most recent call to 'PingStats::start'.
1148     Date_t _lastHeartbeatStartDate;
1149 
1150     // The number of failed heartbeat attempts since the most recent call to 'PingStats::start'.
1151     int _numFailuresSinceLastStart = UninitializedCount;
1152 };
1153 
1154 //
1155 // Convenience method for unittest code. Please use accessors otherwise.
1156 //
1157 
1158 std::ostream& operator<<(std::ostream& os, TopologyCoordinator::Role role);
1159 std::ostream& operator<<(std::ostream& os, TopologyCoordinator::PrepareFreezeResponseResult result);
1160 
1161 }  // namespace repl
1162 }  // namespace mongo
1163