1 
2 /**
3  *    Copyright (C) 2018-present MongoDB, Inc.
4  *
5  *    This program is free software: you can redistribute it and/or modify
6  *    it under the terms of the Server Side Public License, version 1,
7  *    as published by MongoDB, Inc.
8  *
9  *    This program is distributed in the hope that it will be useful,
10  *    but WITHOUT ANY WARRANTY; without even the implied warranty of
11  *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  *    Server Side Public License for more details.
13  *
14  *    You should have received a copy of the Server Side Public License
15  *    along with this program. If not, see
16  *    <http://www.mongodb.com/licensing/server-side-public-license>.
17  *
18  *    As a special exception, the copyright holders give permission to link the
19  *    code of portions of this program with the OpenSSL library under certain
20  *    conditions as described in each individual source file and distribute
21  *    linked combinations including the program with the OpenSSL library. You
22  *    must comply with the Server Side Public License in all respects for
23  *    all of the code used other than as permitted herein. If you modify file(s)
24  *    with this exception, you may extend this exception to your version of the
25  *    file(s), but you are not obligated to do so. If you do not wish to do so,
26  *    delete this exception statement from your version. If you delete this
27  *    exception statement from all source files in the program, then also delete
28  *    it in the license file.
29  */
30 
31 #pragma once
32 
33 #include "mongo/db/repl/replication_coordinator_fwd.h"
34 
35 #include <vector>
36 
37 #include "mongo/base/disallow_copying.h"
38 #include "mongo/base/status.h"
39 #include "mongo/base/status_with.h"
40 #include "mongo/bson/timestamp.h"
41 #include "mongo/db/repl/member_state.h"
42 #include "mongo/db/repl/repl_settings.h"
43 #include "mongo/db/repl/split_horizon.h"
44 #include "mongo/db/repl/sync_source_selector.h"
45 #include "mongo/util/net/hostandport.h"
46 #include "mongo/util/time_support.h"
47 
48 namespace mongo {
49 
50 class BSONObj;
51 class BSONObjBuilder;
52 class IndexDescriptor;
53 class NamespaceString;
54 class OperationContext;
55 class ServiceContext;
56 class Timestamp;
57 struct WriteConcernOptions;
58 
59 namespace executor {
60 struct ConnectionPoolStats;
61 }  // namespace executor
62 
63 namespace rpc {
64 
65 class OplogQueryMetadata;
66 class ReplSetMetadata;
67 
68 }  // namespace rpc
69 
70 namespace repl {
71 
72 class BackgroundSync;
73 class HandshakeArgs;
74 class IsMasterResponse;
75 class OplogReader;
76 class OpTime;
77 class ReadConcernArgs;
78 class ReplSetConfig;
79 class ReplSetHeartbeatArgs;
80 class ReplSetHeartbeatArgsV1;
81 class ReplSetHeartbeatResponse;
82 class ReplSetHtmlSummary;
83 class ReplSetRequestVotesArgs;
84 class ReplSetRequestVotesResponse;
85 class UpdatePositionArgs;
86 
87 /**
 * Global variable that points to a C string telling why master/slave halted
89  *
90  * "dead" means something really bad happened like replication falling completely out of sync.
91  * when non-null, we are dead and the string is informational
92  *
93  * TODO(dannenberg) remove when master slave goes
94  */
95 extern const char* replAllDead;
96 
97 /**
98  * The ReplicationCoordinator is responsible for coordinating the interaction of replication
99  * with the rest of the system.  The public methods on ReplicationCoordinator are the public
100  * API that the replication subsystem presents to the rest of the codebase.
101  */
102 class ReplicationCoordinator : public SyncSourceSelector {
103     MONGO_DISALLOW_COPYING(ReplicationCoordinator);
104 
105 public:
106     static ReplicationCoordinator* get(ServiceContext* service);
107     static ReplicationCoordinator* get(ServiceContext& service);
108     static ReplicationCoordinator* get(OperationContext* ctx);
109 
110     static void set(ServiceContext* service,
111                     std::unique_ptr<ReplicationCoordinator> replCoordinator);
112 
113     struct StatusAndDuration {
114     public:
115         Status status;
116         Milliseconds duration;
117 
StatusAndDurationStatusAndDuration118         StatusAndDuration(const Status& stat, Milliseconds ms) : status(stat), duration(ms) {}
119     };
120 
121     virtual ~ReplicationCoordinator();
122 
123     /**
124      * Does any initial bookkeeping needed to start replication, and instructs the other
125      * components of the replication system to start up whatever threads and do whatever
126      * initialization they need.
127      */
128     virtual void startup(OperationContext* opCtx) = 0;
129 
130     /**
131      * Start terminal shutdown.  This causes the topology coordinator to refuse to vote in any
132      * further elections.  This should only be called from global shutdown after we've passed the
133      * point of no return.
134      *
135      * This should be called once we are sure to call shutdown().
136      */
137     virtual void enterTerminalShutdown() = 0;
138 
139     /**
140      * Does whatever cleanup is required to stop replication, including instructing the other
141      * components of the replication system to shut down and stop any threads they are using,
142      * blocking until all replication-related shutdown tasks are complete.
143      */
144     virtual void shutdown(OperationContext* opCtx) = 0;
145 
146     /**
147      * Returns a reference to the parsed command line arguments that are related to replication.
148      */
149     virtual const ReplSettings& getSettings() const = 0;
150 
151     enum Mode { modeNone = 0, modeReplSet, modeMasterSlave };
152 
153     /**
154      * Returns a value indicating whether this node was configured at start-up to run
155      * standalone, as part of a master-slave pair, or as a member of a replica set.
156      */
157     virtual Mode getReplicationMode() const = 0;
158 
159     /**
160      * Returns true if this node is configured to be a member of a replica set or master/slave
161      * setup.
162      */
163     virtual bool isReplEnabled() const = 0;
164 
165     /**
166      * Returns the current replica set state of this node (PRIMARY, SECONDARY, STARTUP, etc).
167      * It is invalid to call this unless getReplicationMode() == modeReplSet.
168      */
169     virtual MemberState getMemberState() const = 0;
170 
171     /**
172      * Waits for 'timeout' ms for member state to become 'state'.
173      * Returns OK if member state is 'state'.
174      * Returns ErrorCodes::ExceededTimeLimit if we timed out waiting for the state change.
175      * Returns ErrorCodes::BadValue if timeout is negative.
176      */
177     virtual Status waitForMemberState(MemberState expectedState, Milliseconds timeout) = 0;
178 
179     /**
180      * Returns true if this node is in state PRIMARY or SECONDARY.
181      *
182      * It is invalid to call this unless getReplicationMode() == modeReplSet.
183      *
184      * This method may be optimized to reduce synchronization overhead compared to
185      * reading the current member state with getMemberState().
186      */
187     virtual bool isInPrimaryOrSecondaryState() const = 0;
188 
189 
190     /**
191      * Returns how slave delayed this node is configured to be, or 0 seconds if this node is not a
192      * member of the current replica set configuration.
193      */
194     virtual Seconds getSlaveDelaySecs() const = 0;
195 
196     /**
197      * Blocks the calling thread for up to writeConcern.wTimeout millis, or until "opTime" has
198      * been replicated to at least a set of nodes that satisfies the writeConcern, whichever
199      * comes first. A writeConcern.wTimeout of 0 indicates no timeout (block forever) and a
200      * writeConcern.wTimeout of -1 indicates return immediately after checking. Return codes:
201      * ErrorCodes::WriteConcernFailed if the writeConcern.wTimeout is reached before
202      *     the data has been sufficiently replicated
203      * ErrorCodes::ExceededTimeLimit if the opCtx->getMaxTimeMicrosRemaining is reached before
204      *     the data has been sufficiently replicated
205      * ErrorCodes::NotMaster if the node is not Primary/Master
206      * ErrorCodes::UnknownReplWriteConcern if the writeConcern.wMode contains a write concern
207      *     mode that is not known
208      * ErrorCodes::ShutdownInProgress if we are mid-shutdown
209      * ErrorCodes::Interrupted if the operation was killed with killop()
210      */
211     virtual StatusAndDuration awaitReplication(OperationContext* opCtx,
212                                                const OpTime& opTime,
213                                                const WriteConcernOptions& writeConcern) = 0;
214 
215     /**
216      * Like awaitReplication(), above, but waits for the replication of the last operation
217      * performed on the client associated with "opCtx".
218      */
219     virtual StatusAndDuration awaitReplicationOfLastOpForClient(
220         OperationContext* opCtx, const WriteConcernOptions& writeConcern) = 0;
221 
222     /**
223      * Causes this node to relinquish being primary for at least 'stepdownTime'.  If 'force' is
224      * false, before doing so it will wait for 'waitTime' for one other node to be within 10
225      * seconds of this node's optime before stepping down. Returns a Status with the code
226      * ErrorCodes::ExceededTimeLimit if no secondary catches up within waitTime,
227      * ErrorCodes::NotMaster if you are no longer primary when trying to step down,
228      * ErrorCodes::SecondaryAheadOfPrimary if we are primary but there is another node that
229      * seems to be ahead of us in replication, and Status::OK otherwise.
230      */
231     virtual Status stepDown(OperationContext* opCtx,
232                             bool force,
233                             const Milliseconds& waitTime,
234                             const Milliseconds& stepdownTime) = 0;
235 
236     /**
237      * Returns true if the node can be considered master for the purpose of introspective
238      * commands such as isMaster() and rs.status().
239      */
240     virtual bool isMasterForReportingPurposes() = 0;
241 
242     /**
243      * Returns true if it is valid for this node to accept writes on the given database.
244      * Currently this is true only if this node is Primary, master in master/slave,
245      * a standalone, or is writing to the local database.
246      *
247      * If a node was started with the replSet argument, but has not yet received a config, it
248      * will not be able to receive writes to a database other than local (it will not be
249      * treated as standalone node).
250      *
251      * NOTE: This function can only be meaningfully called while the caller holds the global
252      * lock in some mode other than MODE_NONE.
253      */
254     virtual bool canAcceptWritesForDatabase(OperationContext* opCtx, StringData dbName) = 0;
255 
256     /**
257      * Version which does not check for the global lock.  Do not use in new code.
258      * Without the global lock held, the return value may be inaccurate by the time
259      * the function returns.
260      */
261     virtual bool canAcceptWritesForDatabase_UNSAFE(OperationContext* opCtx, StringData dbName) = 0;
262 
263     /**
264      * Returns true if it is valid for this node to accept writes on the given namespace.
265      *
266      * The result of this function should be consistent with canAcceptWritesForDatabase()
267      * for the database the namespace refers to, with additional checks on the collection.
268      */
269     virtual bool canAcceptWritesFor(OperationContext* opCtx, const NamespaceString& ns) = 0;
270 
271     /**
272      * Version which does not check for the global lock.  Do not use in new code.
273      * Without the global lock held, the return value may be inaccurate by the time
274      * the function returns.
275      */
276     virtual bool canAcceptWritesFor_UNSAFE(OperationContext* opCtx, const NamespaceString& ns) = 0;
277 
278     /**
279      * Checks if the current replica set configuration can satisfy the given write concern.
280      *
281      * Things that are taken into consideration include:
282      * 1. If the set has enough data-bearing members.
283      * 2. If the write concern mode exists.
284      * 3. If there are enough members for the write concern mode specified.
285      */
286     virtual Status checkIfWriteConcernCanBeSatisfied(
287         const WriteConcernOptions& writeConcern) const = 0;
288 
289     /**
290      * Returns Status::OK() if it is valid for this node to serve reads on the given collection
291      * and an errorcode indicating why the node cannot if it cannot.
292      */
293     virtual Status checkCanServeReadsFor(OperationContext* opCtx,
294                                          const NamespaceString& ns,
295                                          bool slaveOk) = 0;
296 
297     /**
298      * Version which does not check for the global lock.  Do not use in new code.
299      * Without the global lock held, the return value may be inaccurate by the time
300      * the function returns.
301      */
302     virtual Status checkCanServeReadsFor_UNSAFE(OperationContext* opCtx,
303                                                 const NamespaceString& ns,
304                                                 bool slaveOk) = 0;
305 
306     /**
307      * Returns true if this node should ignore index constraints for idempotency reasons.
308      *
309      * The namespace "ns" is passed in because the "local" database is usually writable
310      * and we need to enforce the constraints for it.
311      */
312     virtual bool shouldRelaxIndexConstraints(OperationContext* opCtx,
313                                              const NamespaceString& ns) = 0;
314 
315     /**
316      * Updates our internal tracking of the last OpTime applied for the given slave
317      * identified by "rid".  Only valid to call in master/slave mode
318      */
319     virtual Status setLastOptimeForSlave(const OID& rid, const Timestamp& ts) = 0;
320 
321     /**
322      * Updates our internal tracking of the last OpTime applied to this node.
323      *
324      * The new value of "opTime" must be no less than any prior value passed to this method, and
325      * it is the caller's job to properly synchronize this behavior.  The exception to this rule
326      * is that after calls to resetLastOpTimesFromOplog(), the minimum acceptable value for
327      * "opTime" is reset based on the contents of the oplog, and may go backwards due to
328      * rollback. Additionally, the optime given MUST represent a consistent database state.
329      */
330     virtual void setMyLastAppliedOpTime(const OpTime& opTime) = 0;
331 
332     /**
333      * Updates our internal tracking of the last OpTime durable to this node.
334      *
335      * The new value of "opTime" must be no less than any prior value passed to this method, and
336      * it is the caller's job to properly synchronize this behavior.  The exception to this rule
337      * is that after calls to resetLastOpTimesFromOplog(), the minimum acceptable value for
338      * "opTime" is reset based on the contents of the oplog, and may go backwards due to
339      * rollback.
340      */
341     virtual void setMyLastDurableOpTime(const OpTime& opTime) = 0;
342 
343     /**
344      * This type is used to represent the "consistency" of a current database state. In
345      * replication, there may be times when our database data is not represented by a single optime,
346      * because we have fetched remote data from different points in time. For example, when we are
347      * in RECOVERING following a refetch based rollback. We never allow external clients to read
348      * from the database if it is not consistent.
349      */
350     enum class DataConsistency { Consistent, Inconsistent };
351 
352     /**
353      * Updates our internal tracking of the last OpTime applied to this node, but only
354      * if the supplied optime is later than the current last OpTime known to the replication
355      * coordinator. The 'consistency' argument must tell whether or not the optime argument
356      * represents a consistent database state.
357      *
358      * This function is used by logOp() on a primary, since the ops in the oplog do not
359      * necessarily commit in sequential order. It is also used when we finish oplog batch
360      * application on secondaries, to avoid any potential race conditions around setting the
361      * applied optime from more than one thread.
362      */
363     virtual void setMyLastAppliedOpTimeForward(const OpTime& opTime,
364                                                DataConsistency consistency) = 0;
365 
366     /**
367      * Updates our internal tracking of the last OpTime durable to this node, but only
368      * if the supplied optime is later than the current last OpTime known to the replication
369      * coordinator.
370      *
371      * This function is used by logOp() on a primary, since the ops in the oplog do not
372      * necessarily commit in sequential order.
373      */
374     virtual void setMyLastDurableOpTimeForward(const OpTime& opTime) = 0;
375 
376     /**
377      * Same as above, but used during places we need to zero our last optime.
378      */
379     virtual void resetMyLastOpTimes() = 0;
380 
381     /**
     * Updates the message we include in heartbeat responses.
383      */
384     virtual void setMyHeartbeatMessage(const std::string& msg) = 0;
385 
386     /**
387      * Returns the last optime recorded by setMyLastAppliedOpTime.
388      */
389     virtual OpTime getMyLastAppliedOpTime() const = 0;
390 
391     /**
392      * Returns the last optime recorded by setMyLastDurableOpTime.
393      */
394     virtual OpTime getMyLastDurableOpTime() const = 0;
395 
396     /**
397      * Waits until the optime of the current node is at least the opTime specified in 'settings'.
398      *
399      * Returns whether the wait was successful.
400      */
401     virtual Status waitUntilOpTimeForRead(OperationContext* opCtx,
402                                           const ReadConcernArgs& settings) = 0;
403 
404     /**
405      * Waits until the deadline or until the optime of the current node is at least the opTime
406      * specified in 'settings'.
407      *
408      * Returns whether the wait was successful.
409      */
410     virtual Status waitUntilOpTimeForReadUntil(OperationContext* opCtx,
411                                                const ReadConcernArgs& settings,
412                                                boost::optional<Date_t> deadline) = 0;
413 
414     /**
415      * Retrieves and returns the current election id, which is a unique id that is local to
416      * this node and changes every time we become primary.
417      * TODO(spencer): Use term instead.
418      */
419     virtual OID getElectionId() = 0;
420 
421     /**
422      * Returns the RID for this node.  The RID is used to identify this node to our sync source
423      * when sending updates about our replication progress.
424      */
425     virtual OID getMyRID() const = 0;
426 
427     /**
428      * Returns the id for this node as specified in the current replica set configuration.
429      */
430     virtual int getMyId() const = 0;
431 
432     /**
433      * Sets this node into a specific follower mode.
434      *
435      * Returns OK if the follower mode was successfully set.  Returns NotSecondary if the
     * node is a leader when setFollowerMode is called and ElectionInProgress if the node is in the
437      * process of trying to elect itself primary.
438      *
439      * Follower modes are RS_STARTUP2 (initial sync), RS_SECONDARY, RS_ROLLBACK and
440      * RS_RECOVERING.  They are the valid states of a node whose topology coordinator has the
441      * follower role.
442      *
443      * This is essentially an interface that allows the applier to prevent the node from
444      * becoming a candidate or accepting reads, depending on circumstances in the oplog
445      * application process.
446      */
447     virtual Status setFollowerMode(const MemberState& newState) = 0;
448 
449     /**
450      * Step-up
451      * =======
452      * On stepup, repl coord enters catch-up mode. It's the same as the secondary mode from
453      * the perspective of producer and applier, so there's nothing to do with them.
454      * When a node enters drain mode, producer state = Stopped, applier state = Draining.
455      *
456      * If the applier state is Draining, it will signal repl coord when there's nothing to apply.
457      * The applier goes into Stopped state at the same time.
458      *
459      * The states go like the following:
460      * - secondary and during catchup mode
461      * (producer: Running, applier: Running)
462      *      |
463      *      | finish catch-up, enter drain mode
464      *      V
465      * - drain mode
466      * (producer: Stopped, applier: Draining)
467      *      |
468      *      | applier signals drain is complete
469      *      V
470      * - primary is in master mode
471      * (producer: Stopped, applier: Stopped)
472      *
473      *
474      * Step-down
475      * =========
476      * The state transitions become:
477      * - primary is in master mode
478      * (producer: Stopped, applier: Stopped)
479      *      |
480      *      | step down
481      *      V
482      * - secondary mode, starting bgsync
483      * (producer: Starting, applier: Running)
484      *      |
485      *      | bgsync runs start()
486      *      V
487      * - secondary mode, normal
488      * (producer: Running, applier: Running)
489      *
490      * When a node steps down during draining mode, it's OK to change from (producer: Stopped,
491      * applier: Draining) to (producer: Starting, applier: Running).
492      *
493      * When a node steps down during catchup mode, the states remain the same (producer: Running,
494      * applier: Running).
495      */
496     enum class ApplierState { Running, Draining, Stopped };
497 
498     /**
499      * In normal cases: Running -> Draining -> Stopped -> Running.
500      * Draining -> Running is also possible if a node steps down during drain mode.
501      *
502      * Only the applier can make the transition from Draining to Stopped by calling
503      * signalDrainComplete().
504      */
505     virtual ApplierState getApplierState() = 0;
506 
507     /**
508      * Signals that a previously requested pause and drain of the applier buffer
509      * has completed.
510      *
511      * This is an interface that allows the applier to reenable writes after
512      * a successful election triggers the draining of the applier buffer.
513      *
514      * The applier signals drain complete when the buffer is empty and it's in Draining
515      * state. We need to make sure the applier checks both conditions in the same term.
516      * Otherwise, it's possible that the applier confirms the empty buffer, but the node
517      * steps down and steps up so quickly that the applier signals drain complete in the wrong
518      * term.
519      */
520     virtual void signalDrainComplete(OperationContext* opCtx, long long termWhenBufferIsEmpty) = 0;
521 
522     /**
523      * Waits duration of 'timeout' for applier to finish draining its buffer of operations.
524      * Returns OK if we are not in drain mode.
525      * Returns ErrorCodes::ExceededTimeLimit if we timed out waiting for the applier to drain its
526      * buffer.
527      * Returns ErrorCodes::BadValue if timeout is negative.
528      */
529     virtual Status waitForDrainFinish(Milliseconds timeout) = 0;
530 
531     /**
532      * Signals the sync source feedback thread to wake up and send a handshake and
533      * replSetUpdatePosition command to our sync source.
534      */
535     virtual void signalUpstreamUpdater() = 0;
536 
537     /**
538      * Prepares a BSONObj describing an invocation of the replSetUpdatePosition command that can
539      * be sent to this node's sync source to update it about our progress in replication.
540      */
541     virtual StatusWith<BSONObj> prepareReplSetUpdatePositionCommand() const = 0;
542 
543     enum class ReplSetGetStatusResponseStyle { kBasic, kInitialSync };
544 
545     /**
546      * Handles an incoming replSetGetStatus command. Adds BSON to 'result'. If kInitialSync is
547      * requested but initial sync is not running, kBasic will be used.
548      */
549     virtual Status processReplSetGetStatus(BSONObjBuilder* result,
550                                            ReplSetGetStatusResponseStyle responseStyle) = 0;
551 
552     /**
553      * Does an initial sync of data, after dropping existing data.
554      */
555     virtual Status resyncData(OperationContext* opCtx, bool waitUntilCompleted) = 0;
556 
557     /**
558      * Handles an incoming isMaster command for a replica set node.  Should not be
559      * called on a master-slave or standalone node.
560      */
561     virtual void fillIsMasterForReplSet(IsMasterResponse* result,
562                                         const SplitHorizon::Parameters& horizonParams) = 0;
563 
564     /**
565      * Adds to "result" a description of the slaveInfo data structure used to map RIDs to their
566      * last known optimes.
567      */
568     virtual void appendSlaveInfoData(BSONObjBuilder* result) = 0;
569 
570     /**
571      * Returns a copy of the current ReplSetConfig.
572      */
573     virtual ReplSetConfig getConfig() const = 0;
574 
575     /**
576      * Handles an incoming replSetGetConfig command. Adds BSON to 'result'.
577      */
578     virtual void processReplSetGetConfig(BSONObjBuilder* result) = 0;
579 
580     /**
581      * Processes the ReplSetMetadata returned from a command run against another
582      * replica set member and so long as the config version in the metadata matches the replica set
583      * config version this node currently has, updates the current term.
584      *
585      * This does NOT update this node's notion of the commit point.
586      */
587     virtual void processReplSetMetadata(const rpc::ReplSetMetadata& replMetadata) = 0;
588 
589     /**
590      * This updates the node's notion of the commit point.
591      */
592     virtual void advanceCommitPoint(const OpTime& committedOptime) = 0;
593 
594     /**
595      * Elections under protocol version 1 are triggered by a timer.
596      * When a node is informed of the primary's liveness (either through heartbeats or
597      * while reading a sync source's oplog), it calls this function to postpone the
598      * election timer by a duration of at least 'electionTimeoutMillis' (see getConfig()).
599      * If the current node is not electable (secondary with priority > 0), this function
600      * cancels the existing timer but will not schedule a new one.
601      */
602     virtual void cancelAndRescheduleElectionTimeout() = 0;
603 
604     /**
605      * Toggles maintenanceMode to the value expressed by 'activate'
606      * return Status::OK if the change worked, NotSecondary if it failed because we are
607      * PRIMARY, and OperationFailed if we are not currently in maintenance mode
608      */
609     virtual Status setMaintenanceMode(bool activate) = 0;
610 
611     /**
612      * Retrieves the current count of maintenanceMode and returns 'true' if greater than 0.
613      */
614     virtual bool getMaintenanceMode() = 0;
615 
616     /**
617      * Handles an incoming replSetSyncFrom command. Adds BSON to 'result'
618      * returns Status::OK if the sync target could be set and an ErrorCode indicating why it
619      * couldn't otherwise.
620      */
621     virtual Status processReplSetSyncFrom(OperationContext* opCtx,
622                                           const HostAndPort& target,
623                                           BSONObjBuilder* resultObj) = 0;
624 
625     /**
626      * Handles an incoming replSetFreeze command. Adds BSON to 'resultObj'
627      * returns Status::OK() if the node is a member of a replica set with a config and an
628      * error Status otherwise
629      */
630     virtual Status processReplSetFreeze(int secs, BSONObjBuilder* resultObj) = 0;
631 
632     /**
633      * Handles an incoming heartbeat command with arguments 'args'. Populates 'response';
634      * returns a Status with either OK or an error message.
635      */
636     virtual Status processHeartbeat(const ReplSetHeartbeatArgs& args,
637                                     ReplSetHeartbeatResponse* response) = 0;
638     virtual Status processHeartbeatV1(const ReplSetHeartbeatArgsV1& args,
639                                       ReplSetHeartbeatResponse* response) = 0;
640 
641 
642     /**
643      * Arguments for the replSetReconfig command.
644      */
    struct ReplSetReconfigArgs {
        BSONObj newConfigObj;  // the new replica set configuration document to install
        bool force;            // whether this is a 'force' reconfig request
    };
649 
650     /**
651      * Handles an incoming replSetReconfig command. Adds BSON to 'resultObj';
652      * returns a Status with either OK or an error message.
653      */
654     virtual Status processReplSetReconfig(OperationContext* opCtx,
655                                           const ReplSetReconfigArgs& args,
656                                           BSONObjBuilder* resultObj) = 0;
657 
658     /*
659      * Handles an incoming replSetInitiate command. If "configObj" is empty, generates a default
660      * configuration to use.
661      * Adds BSON to 'resultObj'; returns a Status with either OK or an error message.
662      */
663     virtual Status processReplSetInitiate(OperationContext* opCtx,
664                                           const BSONObj& configObj,
665                                           BSONObjBuilder* resultObj) = 0;
666 
667     /**
668      * Arguments to the replSetFresh command.
669      */
    struct ReplSetFreshArgs {
        // All fields below are parsed from the body of an incoming replSetFresh command.
        std::string setName;  // Name of the replset
        HostAndPort who;      // host and port of the member that sent the replSetFresh command
        unsigned id;          // replSet id of the member that sent the replSetFresh command
        int cfgver;  // replSet config version that the member who sent the command thinks it has
        Timestamp opTime;  // last optime seen by the member who sent the replSetFresh command
    };
677 
678     /*
679      * Handles an incoming replSetFresh command.
680      * Adds BSON to 'resultObj'; returns a Status with either OK or an error message.
681      */
682     virtual Status processReplSetFresh(const ReplSetFreshArgs& args, BSONObjBuilder* resultObj) = 0;
683 
684     /**
685      * Arguments to the replSetElect command.
686      */
    struct ReplSetElectArgs {
        std::string set;  // Name of the replset
        int whoid;        // replSet id of the member that sent the replSetElect command
        int cfgver;  // replSet config version that the member who sent the command thinks it has
        OID round;   // unique ID for this election
    };
693 
694     /*
695      * Handles an incoming replSetElect command.
696      * Adds BSON to 'resultObj'; returns a Status with either OK or an error message.
697      */
698     virtual Status processReplSetElect(const ReplSetElectArgs& args, BSONObjBuilder* resultObj) = 0;
699 
700     /**
701      * Handles an incoming replSetUpdatePosition command, updating each node's oplog progress.
702      * Returns Status::OK() if all updates are processed correctly, NodeNotFound
703      * if any updating node cannot be found in the config, InvalidReplicaSetConfig if the
704      * "configVersion" sent in any of the updates doesn't match our config version, or
705      * NotMasterOrSecondary if we are in state REMOVED or otherwise don't have a valid
706      * replica set config.
707      * If a non-OK status is returned, it is unspecified whether none or some of the updates
708      * were applied.
709      * "configVersion" will be populated with our config version if and only if we return
710      * InvalidReplicaSetConfig.
711      */
712     virtual Status processReplSetUpdatePosition(const UpdatePositionArgs& updates,
713                                                 long long* configVersion) = 0;
714 
715     /**
716      * Handles an incoming Handshake command. Associates the node's 'remoteID' with its
717      * 'handshake' object. This association is used to update internal representation of
718      * replication progress and to forward the node's replication progress upstream when this
719      * node is being chained through in master/slave replication.
720      *
721      * Returns ErrorCodes::IllegalOperation if we're not running with master/slave replication.
722      */
723     virtual Status processHandshake(OperationContext* opCtx, const HandshakeArgs& handshake) = 0;
724 
725     /**
726      * Returns a bool indicating whether or not this node builds indexes.
727      */
728     virtual bool buildsIndexes() = 0;
729 
730     /**
731      * Returns a vector of members that have applied the operation with OpTime 'op'.
732      * "durablyWritten" indicates whether the operation has to be durably applied.
733      */
734     virtual std::vector<HostAndPort> getHostsWrittenTo(const OpTime& op, bool durablyWritten) = 0;
735 
736     /**
737      * Returns a vector of the members other than ourself in the replica set, as specified in
738      * the replica set config.  Invalid to call if we are not in replica set mode.  Returns
739      * an empty vector if we do not have a valid config.
740      */
741     virtual std::vector<HostAndPort> getOtherNodesInReplSet() const = 0;
742 
743     /**
744      * Returns a BSONObj containing a representation of the current default write concern.
745      */
746     virtual WriteConcernOptions getGetLastErrorDefault() = 0;
747 
748     /**
749      * Checks that the --replSet flag was passed when starting up the node and that the node
750      * has a valid replica set config.
751      *
752      * Returns a Status indicating whether those conditions are met with errorcode
753      * NoReplicationEnabled if --replSet was not present during start up or with errorcode
754      * NotYetInitialized in the absence of a valid config. Also adds error info to "result".
755      */
756     virtual Status checkReplEnabledForCommand(BSONObjBuilder* result) = 0;
757 
758     /**
759      * Loads the optime from the last op in the oplog into the coordinator's lastAppliedOpTime and
760      * lastDurableOpTime values. The 'consistency' argument must tell whether or not the optime of
761      * the op in the oplog represents a consistent database state.
762      */
763     virtual void resetLastOpTimesFromOplog(OperationContext* opCtx,
764                                            DataConsistency consistency) = 0;
765 
766     /**
767      * Returns the OpTime of the latest replica set-committed op known to this server.
768      * Committed means a majority of the voting nodes of the config are known to have the
769      * operation in their oplogs.  This implies such ops will never be rolled back.
770      */
771     virtual OpTime getLastCommittedOpTime() const = 0;
772 
773     /*
774     * Handles an incoming replSetRequestVotes command.
775     * Adds BSON to 'resultObj'; returns a Status with either OK or an error message.
776     */
777     virtual Status processReplSetRequestVotes(OperationContext* opCtx,
778                                               const ReplSetRequestVotesArgs& args,
779                                               ReplSetRequestVotesResponse* response) = 0;
780 
781     /**
782      * Prepares a metadata object with the ReplSetMetadata and the OplogQueryMetadata depending
783      * on what has been requested.
784      */
785     virtual void prepareReplMetadata(const BSONObj& metadataRequestObj,
786                                      const OpTime& lastOpTimeFromClient,
787                                      BSONObjBuilder* builder) const = 0;
788 
789     /**
790      * Returns true if the V1 election protocol is being used and false otherwise.
791      */
792     virtual bool isV1ElectionProtocol() const = 0;
793 
794     /**
795      * Returns whether or not majority write concerns should implicitly journal, if j has not been
796      * explicitly set.
797      */
798     virtual bool getWriteConcernMajorityShouldJournal() = 0;
799 
800     /**
801      * Writes into 'output' all the information needed to generate a summary of the current
802      * replication state for use by the web interface.
803      */
804     virtual void summarizeAsHtml(ReplSetHtmlSummary* output) = 0;
805 
806     /**
807      * Returns the current term.
808      */
809     virtual long long getTerm() = 0;
810 
811     /**
812      * Attempts to update the current term for the V1 election protocol. If the term changes and
813      * this node is primary, relinquishes primary.
814      * Returns a Status OK if the term was *not* updated (meaning, it is safe to proceed with
815      * the rest of the work, because the term is still the same).
816      * Returns StaleTerm if the supplied term was higher than the current term.
817      */
818     virtual Status updateTerm(OperationContext* opCtx, long long term) = 0;
819 
820     /**
821      * Reserves a unique SnapshotName.
822      *
823      * This name is guaranteed to compare > all names reserved before and < all names reserved
824      * after.
825      *
826      * This method will not take any locks or attempt to access storage using the passed-in
827      * OperationContext. It will only be used to track reserved SnapshotNames by each operation so
828      * that awaitReplicationOfLastOpForClient() can correctly wait for the reserved snapshot to be
829      * visible.
830      *
831      * A null OperationContext can be used in cases where the snapshot to wait for should not be
832      * adjusted.
833      */
834     virtual Timestamp reserveSnapshotName(OperationContext* opCtx) = 0;
835 
836     /**
837      * Blocks until either the current committed snapshot is at least as high as 'untilSnapshot',
838      * or we are interrupted for any reason, including shutdown or maxTimeMs expiration.
839      * 'opCtx' is used to checkForInterrupt and enforce maxTimeMS.
840      */
841     virtual void waitUntilSnapshotCommitted(OperationContext* opCtx,
842                                             const Timestamp& untilSnapshot) = 0;
843 
844     /**
845      * Resets all information related to snapshotting.
846      */
847     virtual void dropAllSnapshots() = 0;
848 
849     /**
850      * Gets the latest OpTime of the currentCommittedSnapshot.
851      */
852     virtual OpTime getCurrentCommittedSnapshotOpTime() const = 0;
853 
854     /**
855      * Appends diagnostics about the replication subsystem.
856      */
857     virtual void appendDiagnosticBSON(BSONObjBuilder* bob) = 0;
858 
859     /**
860      * Appends connection information to the provided BSONObjBuilder.
861      */
862     virtual void appendConnectionStats(executor::ConnectionPoolStats* stats) const = 0;
863 
864     /**
865      * Gets the number of uncommitted snapshots currently held.
866      * Warning: This value can change at any time and may not even be accurate at the time of
867      * return. It should not be used when an exact amount is needed.
868      */
869     virtual size_t getNumUncommittedSnapshots() = 0;
870 
871     /**
872      * Returns a new WriteConcernOptions based on "wc" but with UNSET syncMode reset to JOURNAL or
873      * NONE based on our rsConfig.
874      */
875     virtual WriteConcernOptions populateUnsetWriteConcernOptionsSyncMode(
876         WriteConcernOptions wc) = 0;
    /**
     * Returns the current index prefetch configuration (see ReplSettings::IndexPrefetchConfig).
     */
    virtual ReplSettings::IndexPrefetchConfig getIndexPrefetchConfig() const = 0;

    /**
     * Sets the index prefetch configuration to 'cfg'.
     */
    virtual void setIndexPrefetchConfig(const ReplSettings::IndexPrefetchConfig cfg) = 0;

    /**
     * Attempts to step this node up to primary. If 'skipDryRun' is true the dry-run election
     * round is skipped. NOTE(review): semantics inferred from the name — confirm against the
     * implementing class.
     */
    virtual Status stepUpIfEligible(bool skipDryRun) = 0;

    /**
     * Returns the ServiceContext associated with this coordinator.
     */
    virtual ServiceContext* getServiceContext() = 0;

    /**
     * Abort catchup if the node is in catchup mode.
     */
    virtual Status abortCatchupIfNeeded() = 0;

    /**
     * Returns true if logOp() should not append an entry to the oplog for the namespace for this
     * operation.
     */
    bool isOplogDisabledFor(OperationContext* opCtx, const NamespaceString& nss);

protected:
    // Protected: this is an abstract interface and may only be constructed by subclasses.
    ReplicationCoordinator();
897 };
898 
899 }  // namespace repl
900 }  // namespace mongo
901