1package raft
2
3import (
4	"fmt"
5	"io"
6	"time"
7
8	"github.com/hashicorp/go-hclog"
9)
10
// ProtocolVersion is the version of the protocol (which includes RPC messages
// as well as Raft-specific log entries) that this server can _understand_. Use
// the ProtocolVersion member of the Config object to control the version of
// the protocol to use when _speaking_ to other servers. Note that depending on
// the protocol version being spoken, some otherwise understood RPC messages
// may be refused. See dispositionRPC for details of this logic.
//
// There are notes about the upgrade path in the description of the versions
// below. If you are starting a fresh cluster then there's no reason not to
// jump right to the latest protocol version. If you need to interoperate with
// older, version 0 Raft servers you'll need to drive the cluster through the
// different versions in order.
//
// The version details are complicated, but here's a summary of what's required
// to get from a version 0 cluster to version 3:
//
// 1. In version N of your app that starts using the new Raft library with
//    versioning, set ProtocolVersion to 1.
// 2. Make version N+1 of your app require version N as a prerequisite (all
//    servers must be upgraded). For version N+1 of your app set ProtocolVersion
//    to 2.
// 3. Similarly, make version N+2 of your app require version N+1 as a
//    prerequisite. For version N+2 of your app, set ProtocolVersion to 3.
//
// During this upgrade, older cluster members will still have Server IDs equal
// to their network addresses. To upgrade an older member and give it an ID, it
// needs to leave the cluster and re-enter:
//
// 1. Remove the server from the cluster with RemoveServer, using its network
//    address as its ServerID.
// 2. Update the server's config to use a UUID or something else that is
//    not tied to the machine as the ServerID (restarting the server).
// 3. Add the server back to the cluster with AddVoter, using its new ID.
//
// You can do this during the rolling upgrade from N+1 to N+2 of your app, or
// as a rolling change at any time after the upgrade.
//
// Version History
//
// 0: Original Raft library before versioning was added. Servers running this
//    version of the Raft library use AddPeerDeprecated/RemovePeerDeprecated
//    for all configuration changes, and have no support for LogConfiguration.
// 1: First versioned protocol, used to interoperate with old servers, and begin
//    the migration path to newer versions of the protocol. Under this version
//    all configuration changes are propagated using the now-deprecated
//    RemovePeerDeprecated Raft log entry. This means that server IDs are always
//    set to be the same as the server addresses (since the old log entry type
//    cannot transmit an ID), and only AddPeer/RemovePeer APIs are supported.
//    Servers running this version of the protocol can understand the new
//    LogConfiguration Raft log entry but will never generate one so they can
//    remain compatible with version 0 Raft servers in the cluster.
// 2: Transitional protocol used when migrating an existing cluster to the new
//    server ID system. Server IDs are still set to be the same as server
//    addresses, but all configuration changes are propagated using the new
//    LogConfiguration Raft log entry type, which can carry full ID information.
//    This version supports the old AddPeer/RemovePeer APIs as well as the new
//    ID-based AddVoter/RemoveServer APIs which should be used when adding
//    version 3 servers to the cluster later. This version sheds all
//    interoperability with version 0 servers, but can interoperate with newer
//    Raft servers running with protocol version 1 since they can understand the
//    new LogConfiguration Raft log entry, and this version can still understand
//    their RemovePeerDeprecated Raft log entries. We need this protocol version
//    as an intermediate step between 1 and 3 so that servers will propagate the
//    ID information that will come from newly-added (or -rolled) servers using
//    protocol version 3, but since they are still using their address-based IDs
//    from the previous step they will still be able to track commitments and
//    their own voting status properly. If we skipped this step, servers would
//    be started with their new IDs, but they wouldn't see themselves in the old
//    address-based configuration, so none of the servers would think they had a
//    vote.
// 3: Protocol adding full support for server IDs and new ID-based server APIs
//    (AddVoter, AddNonvoter, etc.), old AddPeer/RemovePeer APIs are no longer
//    supported. Version 2 servers should be swapped out by removing them from
//    the cluster one-by-one and re-adding them with updated configuration for
//    this protocol version, along with their server ID. The remove/add cycle
//    is required to populate their server ID. Note that removing must be done
//    by ID, which will be the old server's address.
type ProtocolVersion int

const (
	// ProtocolVersionMin is the minimum protocol version that this server can
	// understand.
	ProtocolVersionMin ProtocolVersion = 0
	// ProtocolVersionMax is the maximum protocol version that this server can
	// understand. It is declared as an untyped constant, so it converts
	// implicitly to ProtocolVersion wherever one is expected.
	ProtocolVersionMax = 3
)
96
// SnapshotVersion is the version of snapshots that this server can understand.
// Currently, it is always assumed that the server generates the latest version,
// though this may be changed in the future to include a configurable version.
//
// Version History
//
// 0: Original Raft library before versioning was added. The peers portion of
//    these snapshots is encoded in the legacy format which requires decodePeers
//    to parse. This version of snapshots should only be produced by the
//    unversioned Raft library.
// 1: New format which adds support for a full configuration structure and its
//    associated log index, with support for server IDs and non-voting server
//    modes. To ease upgrades, this also includes the legacy peers structure but
//    that will never be used by servers that understand version 1 snapshots.
//    Since the original Raft library didn't enforce any versioning, we must
//    include the legacy peers structure for this version, but we can deprecate
//    it in the next snapshot version.
type SnapshotVersion int

const (
	// SnapshotVersionMin is the minimum snapshot version that this server can
	// understand.
	SnapshotVersionMin SnapshotVersion = 0
	// SnapshotVersionMax is the maximum snapshot version that this server can
	// understand. It is declared as an untyped constant, so it converts
	// implicitly to SnapshotVersion wherever one is expected.
	SnapshotVersionMax = 1
)
122
123// Config provides any necessary configuration for the Raft server.
124type Config struct {
125	// ProtocolVersion allows a Raft server to inter-operate with older
126	// Raft servers running an older version of the code. This is used to
127	// version the wire protocol as well as Raft-specific log entries that
128	// the server uses when _speaking_ to other servers. There is currently
129	// no auto-negotiation of versions so all servers must be manually
130	// configured with compatible versions. See ProtocolVersionMin and
131	// ProtocolVersionMax for the versions of the protocol that this server
132	// can _understand_.
133	ProtocolVersion ProtocolVersion
134
135	// HeartbeatTimeout specifies the time in follower state without
136	// a leader before we attempt an election.
137	HeartbeatTimeout time.Duration
138
139	// ElectionTimeout specifies the time in candidate state without
140	// a leader before we attempt an election.
141	ElectionTimeout time.Duration
142
143	// CommitTimeout controls the time without an Apply() operation
144	// before we heartbeat to ensure a timely commit. Due to random
145	// staggering, may be delayed as much as 2x this value.
146	CommitTimeout time.Duration
147
148	// MaxAppendEntries controls the maximum number of append entries
149	// to send at once. We want to strike a balance between efficiency
150	// and avoiding waste if the follower is going to reject because of
151	// an inconsistent log.
152	MaxAppendEntries int
153
154	// BatchApplyCh indicates whether we should buffer applyCh
155	// to size MaxAppendEntries. This enables batch log commitment,
156	// but breaks the timeout guarantee on Apply. Specifically,
157	// a log can be added to the applyCh buffer but not actually be
158	// processed until after the specified timeout.
159	BatchApplyCh bool
160
161	// If we are a member of a cluster, and RemovePeer is invoked for the
162	// local node, then we forget all peers and transition into the follower state.
163	// If ShutdownOnRemove is set, we additional shutdown Raft. Otherwise,
164	// we can become a leader of a cluster containing only this node.
165	ShutdownOnRemove bool
166
167	// TrailingLogs controls how many logs we leave after a snapshot. This is used
168	// so that we can quickly replay logs on a follower instead of being forced to
169	// send an entire snapshot. The value passed here is the initial setting used.
170	// This can be tuned during operation using ReloadConfig.
171	TrailingLogs uint64
172
173	// SnapshotInterval controls how often we check if we should perform a
174	// snapshot. We randomly stagger between this value and 2x this value to avoid
175	// the entire cluster from performing a snapshot at once. The value passed
176	// here is the initial setting used. This can be tuned during operation using
177	// ReloadConfig.
178	SnapshotInterval time.Duration
179
180	// SnapshotThreshold controls how many outstanding logs there must be before
181	// we perform a snapshot. This is to prevent excessive snapshotting by
182	// replaying a small set of logs instead. The value passed here is the initial
183	// setting used. This can be tuned during operation using ReloadConfig.
184	SnapshotThreshold uint64
185
186	// LeaderLeaseTimeout is used to control how long the "lease" lasts
187	// for being the leader without being able to contact a quorum
188	// of nodes. If we reach this interval without contact, we will
189	// step down as leader.
190	LeaderLeaseTimeout time.Duration
191
192	// LocalID is a unique ID for this server across all time. When running with
193	// ProtocolVersion < 3, you must set this to be the same as the network
194	// address of your transport.
195	LocalID ServerID
196
197	// NotifyCh is used to provide a channel that will be notified of leadership
198	// changes. Raft will block writing to this channel, so it should either be
199	// buffered or aggressively consumed.
200	NotifyCh chan<- bool
201
202	// LogOutput is used as a sink for logs, unless Logger is specified.
203	// Defaults to os.Stderr.
204	LogOutput io.Writer
205
206	// LogLevel represents a log level. If the value does not match a known
207	// logging level hclog.NoLevel is used.
208	LogLevel string
209
210	// Logger is a user-provided logger. If nil, a logger writing to
211	// LogOutput with LogLevel is used.
212	Logger hclog.Logger
213
214	// NoSnapshotRestoreOnStart controls if raft will restore a snapshot to the
215	// FSM on start. This is useful if your FSM recovers from other mechanisms
216	// than raft snapshotting. Snapshot metadata will still be used to initialize
217	// raft's configuration and index values.
218	NoSnapshotRestoreOnStart bool
219
220	// skipStartup allows NewRaft() to bypass all background work goroutines
221	skipStartup bool
222}
223
// ReloadableConfig is the subset of Config that may be reconfigured during
// runtime using raft.ReloadConfig. We choose to duplicate fields over embedding
// or accepting a Config but only using specific fields to keep the API clear.
// Reconfiguring some fields is potentially dangerous so we should only
// selectively enable it for fields where that is allowed.
type ReloadableConfig struct {
	// TrailingLogs controls how many logs we leave after a snapshot. This is used
	// so that we can quickly replay logs on a follower instead of being forced to
	// send an entire snapshot. The value passed here updates the setting at runtime
	// which will take effect as soon as the next snapshot completes and truncation
	// occurs.
	TrailingLogs uint64

	// SnapshotInterval controls how often we check if we should perform a snapshot.
	// We randomly stagger between this value and 2x this value to prevent the
	// entire cluster from performing a snapshot at once.
	SnapshotInterval time.Duration

	// SnapshotThreshold controls how many outstanding logs there must be before
	// we perform a snapshot. This is to prevent excessive snapshots when we can
	// just replay a small set of logs.
	SnapshotThreshold uint64
}
247
248// apply sets the reloadable fields on the passed Config to the values in
249// `ReloadableConfig`. It returns a copy of Config with the fields from this
250// ReloadableConfig set.
251func (rc *ReloadableConfig) apply(to Config) Config {
252	to.TrailingLogs = rc.TrailingLogs
253	to.SnapshotInterval = rc.SnapshotInterval
254	to.SnapshotThreshold = rc.SnapshotThreshold
255	return to
256}
257
258// fromConfig copies the reloadable fields from the passed Config.
259func (rc *ReloadableConfig) fromConfig(from Config) {
260	rc.TrailingLogs = from.TrailingLogs
261	rc.SnapshotInterval = from.SnapshotInterval
262	rc.SnapshotThreshold = from.SnapshotThreshold
263}
264
265// DefaultConfig returns a Config with usable defaults.
266func DefaultConfig() *Config {
267	return &Config{
268		ProtocolVersion:    ProtocolVersionMax,
269		HeartbeatTimeout:   1000 * time.Millisecond,
270		ElectionTimeout:    1000 * time.Millisecond,
271		CommitTimeout:      50 * time.Millisecond,
272		MaxAppendEntries:   64,
273		ShutdownOnRemove:   true,
274		TrailingLogs:       10240,
275		SnapshotInterval:   120 * time.Second,
276		SnapshotThreshold:  8192,
277		LeaderLeaseTimeout: 500 * time.Millisecond,
278		LogLevel:           "DEBUG",
279	}
280}
281
282// ValidateConfig is used to validate a sane configuration
283func ValidateConfig(config *Config) error {
284	// We don't actually support running as 0 in the library any more, but
285	// we do understand it.
286	protocolMin := ProtocolVersionMin
287	if protocolMin == 0 {
288		protocolMin = 1
289	}
290	if config.ProtocolVersion < protocolMin ||
291		config.ProtocolVersion > ProtocolVersionMax {
292		return fmt.Errorf("ProtocolVersion %d must be >= %d and <= %d",
293			config.ProtocolVersion, protocolMin, ProtocolVersionMax)
294	}
295	if len(config.LocalID) == 0 {
296		return fmt.Errorf("LocalID cannot be empty")
297	}
298	if config.HeartbeatTimeout < 5*time.Millisecond {
299		return fmt.Errorf("HeartbeatTimeout is too low")
300	}
301	if config.ElectionTimeout < 5*time.Millisecond {
302		return fmt.Errorf("ElectionTimeout is too low")
303	}
304	if config.CommitTimeout < time.Millisecond {
305		return fmt.Errorf("CommitTimeout is too low")
306	}
307	if config.MaxAppendEntries <= 0 {
308		return fmt.Errorf("MaxAppendEntries must be positive")
309	}
310	if config.MaxAppendEntries > 1024 {
311		return fmt.Errorf("MaxAppendEntries is too large")
312	}
313	if config.SnapshotInterval < 5*time.Millisecond {
314		return fmt.Errorf("SnapshotInterval is too low")
315	}
316	if config.LeaderLeaseTimeout < 5*time.Millisecond {
317		return fmt.Errorf("LeaderLeaseTimeout is too low")
318	}
319	if config.LeaderLeaseTimeout > config.HeartbeatTimeout {
320		return fmt.Errorf("LeaderLeaseTimeout cannot be larger than heartbeat timeout")
321	}
322	if config.ElectionTimeout < config.HeartbeatTimeout {
323		return fmt.Errorf("ElectionTimeout must be equal or greater than Heartbeat Timeout")
324	}
325	return nil
326}
327