1package raft 2 3import ( 4 "fmt" 5 "io" 6 "time" 7 8 "github.com/hashicorp/go-hclog" 9) 10 11// ProtocolVersion is the version of the protocol (which includes RPC messages 12// as well as Raft-specific log entries) that this server can _understand_. Use 13// the ProtocolVersion member of the Config object to control the version of 14// the protocol to use when _speaking_ to other servers. Note that depending on 15// the protocol version being spoken, some otherwise understood RPC messages 16// may be refused. See dispositionRPC for details of this logic. 17// 18// There are notes about the upgrade path in the description of the versions 19// below. If you are starting a fresh cluster then there's no reason not to 20// jump right to the latest protocol version. If you need to interoperate with 21// older, version 0 Raft servers you'll need to drive the cluster through the 22// different versions in order. 23// 24// The version details are complicated, but here's a summary of what's required 25// to get from a version 0 cluster to version 3: 26// 27// 1. In version N of your app that starts using the new Raft library with 28// versioning, set ProtocolVersion to 1. 29// 2. Make version N+1 of your app require version N as a prerequisite (all 30// servers must be upgraded). For version N+1 of your app set ProtocolVersion 31// to 2. 32// 3. Similarly, make version N+2 of your app require version N+1 as a 33// prerequisite. For version N+2 of your app, set ProtocolVersion to 3. 34// 35// During this upgrade, older cluster members will still have Server IDs equal 36// to their network addresses. To upgrade an older member and give it an ID, it 37// needs to leave the cluster and re-enter: 38// 39// 1. Remove the server from the cluster with RemoveServer, using its network 40// address as its ServerID. 41// 2. Update the server's config to use a UUID or something else that is 42// not tied to the machine as the ServerID (restarting the server). 43// 3. Add the server back to the cluster with AddVoter, using its new ID. 44// 45// You can do this during the rolling upgrade from N+1 to N+2 of your app, or 46// as a rolling change at any time after the upgrade. 47// 48// Version History 49// 50// 0: Original Raft library before versioning was added. Servers running this 51// version of the Raft library use AddPeerDeprecated/RemovePeerDeprecated 52// for all configuration changes, and have no support for LogConfiguration. 53// 1: First versioned protocol, used to interoperate with old servers, and begin 54// the migration path to newer versions of the protocol. Under this version 55// all configuration changes are propagated using the now-deprecated 56// RemovePeerDeprecated Raft log entry. This means that server IDs are always 57// set to be the same as the server addresses (since the old log entry type 58// cannot transmit an ID), and only AddPeer/RemovePeer APIs are supported. 59// Servers running this version of the protocol can understand the new 60// LogConfiguration Raft log entry but will never generate one so they can 61// remain compatible with version 0 Raft servers in the cluster. 62// 2: Transitional protocol used when migrating an existing cluster to the new 63// server ID system. Server IDs are still set to be the same as server 64// addresses, but all configuration changes are propagated using the new 65// LogConfiguration Raft log entry type, which can carry full ID information. 66// This version supports the old AddPeer/RemovePeer APIs as well as the new 67// ID-based AddVoter/RemoveServer APIs which should be used when adding 68// version 3 servers to the cluster later. This version sheds all 69// interoperability with version 0 servers, but can interoperate with newer 70// Raft servers running with protocol version 1 since they can understand the 71// new LogConfiguration Raft log entry, and this version can still understand 72// their RemovePeerDeprecated Raft log entries. We need this protocol version 73// as an intermediate step between 1 and 3 so that servers will propagate the 74// ID information that will come from newly-added (or -rolled) servers using 75// protocol version 3, but since they are still using their address-based IDs 76// from the previous step they will still be able to track commitments and 77// their own voting status properly. If we skipped this step, servers would 78// be started with their new IDs, but they wouldn't see themselves in the old 79// address-based configuration, so none of the servers would think they had a 80// vote. 81// 3: Protocol adding full support for server IDs and new ID-based server APIs 82// (AddVoter, AddNonvoter, etc.), old AddPeer/RemovePeer APIs are no longer 83// supported. Version 2 servers should be swapped out by removing them from 84// the cluster one-by-one and re-adding them with updated configuration for 85// this protocol version, along with their server ID. The remove/add cycle 86// is required to populate their server ID. Note that removing must be done 87// by ID, which will be the old server's address. 88type ProtocolVersion int 89 90const ( 91 // ProtocolVersionMin is the minimum protocol version 92 ProtocolVersionMin ProtocolVersion = 0 93 // ProtocolVersionMax is the maximum protocol version 94 ProtocolVersionMax = 3 95) 96 97// SnapshotVersion is the version of snapshots that this server can understand. 98// Currently, it is always assumed that the server generates the latest version, 99// though this may be changed in the future to include a configurable version. 100// 101// Version History 102// 103// 0: Original Raft library before versioning was added. The peers portion of 104// these snapshots is encoded in the legacy format which requires decodePeers 105// to parse. This version of snapshots should only be produced by the 106// unversioned Raft library. 107// 1: New format which adds support for a full configuration structure and its 108// associated log index, with support for server IDs and non-voting server 109// modes. To ease upgrades, this also includes the legacy peers structure but 110// that will never be used by servers that understand version 1 snapshots. 111// Since the original Raft library didn't enforce any versioning, we must 112// include the legacy peers structure for this version, but we can deprecate 113// it in the next snapshot version. 114type SnapshotVersion int 115 116const ( 117 // SnapshotVersionMin is the minimum snapshot version 118 SnapshotVersionMin SnapshotVersion = 0 119 // SnapshotVersionMax is the maximum snapshot version 120 SnapshotVersionMax = 1 121) 122 123// Config provides any necessary configuration for the Raft server. 124type Config struct { 125 // ProtocolVersion allows a Raft server to inter-operate with older 126 // Raft servers running an older version of the code. This is used to 127 // version the wire protocol as well as Raft-specific log entries that 128 // the server uses when _speaking_ to other servers. There is currently 129 // no auto-negotiation of versions so all servers must be manually 130 // configured with compatible versions. See ProtocolVersionMin and 131 // ProtocolVersionMax for the versions of the protocol that this server 132 // can _understand_. 133 ProtocolVersion ProtocolVersion 134 135 // HeartbeatTimeout specifies the time in follower state without 136 // a leader before we attempt an election. 137 HeartbeatTimeout time.Duration 138 139 // ElectionTimeout specifies the time in candidate state without 140 // a leader before we attempt an election. 141 ElectionTimeout time.Duration 142 143 // CommitTimeout controls the time without an Apply() operation 144 // before we heartbeat to ensure a timely commit. Due to random 145 // staggering, may be delayed as much as 2x this value. 146 CommitTimeout time.Duration 147 148 // MaxAppendEntries controls the maximum number of append entries 149 // to send at once. We want to strike a balance between efficiency 150 // and avoiding waste if the follower is going to reject because of 151 // an inconsistent log. 152 MaxAppendEntries int 153 154 // BatchApplyCh indicates whether we should buffer applyCh 155 // to size MaxAppendEntries. This enables batch log commitment, 156 // but breaks the timeout guarantee on Apply. Specifically, 157 // a log can be added to the applyCh buffer but not actually be 158 // processed until after the specified timeout. 159 BatchApplyCh bool 160 161 // If we are a member of a cluster, and RemovePeer is invoked for the 162 // local node, then we forget all peers and transition into the follower state. 163 // If ShutdownOnRemove is set, we additional shutdown Raft. Otherwise, 164 // we can become a leader of a cluster containing only this node. 165 ShutdownOnRemove bool 166 167 // TrailingLogs controls how many logs we leave after a snapshot. This is used 168 // so that we can quickly replay logs on a follower instead of being forced to 169 // send an entire snapshot. The value passed here is the initial setting used. 170 // This can be tuned during operation using ReloadConfig. 171 TrailingLogs uint64 172 173 // SnapshotInterval controls how often we check if we should perform a 174 // snapshot. We randomly stagger between this value and 2x this value to avoid 175 // the entire cluster from performing a snapshot at once. The value passed 176 // here is the initial setting used. This can be tuned during operation using 177 // ReloadConfig. 178 SnapshotInterval time.Duration 179 180 // SnapshotThreshold controls how many outstanding logs there must be before 181 // we perform a snapshot. This is to prevent excessive snapshotting by 182 // replaying a small set of logs instead. The value passed here is the initial 183 // setting used. This can be tuned during operation using ReloadConfig. 184 SnapshotThreshold uint64 185 186 // LeaderLeaseTimeout is used to control how long the "lease" lasts 187 // for being the leader without being able to contact a quorum 188 // of nodes. If we reach this interval without contact, we will 189 // step down as leader. 190 LeaderLeaseTimeout time.Duration 191 192 // LocalID is a unique ID for this server across all time. When running with 193 // ProtocolVersion < 3, you must set this to be the same as the network 194 // address of your transport. 195 LocalID ServerID 196 197 // NotifyCh is used to provide a channel that will be notified of leadership 198 // changes. Raft will block writing to this channel, so it should either be 199 // buffered or aggressively consumed. 200 NotifyCh chan<- bool 201 202 // LogOutput is used as a sink for logs, unless Logger is specified. 203 // Defaults to os.Stderr. 204 LogOutput io.Writer 205 206 // LogLevel represents a log level. If the value does not match a known 207 // logging level hclog.NoLevel is used. 208 LogLevel string 209 210 // Logger is a user-provided logger. If nil, a logger writing to 211 // LogOutput with LogLevel is used. 212 Logger hclog.Logger 213 214 // NoSnapshotRestoreOnStart controls if raft will restore a snapshot to the 215 // FSM on start. This is useful if your FSM recovers from other mechanisms 216 // than raft snapshotting. Snapshot metadata will still be used to initialize 217 // raft's configuration and index values. 218 NoSnapshotRestoreOnStart bool 219 220 // skipStartup allows NewRaft() to bypass all background work goroutines 221 skipStartup bool 222} 223 224// ReloadableConfig is the subset of Config that may be reconfigured during 225// runtime using raft.ReloadConfig. We choose to duplicate fields over embedding 226// or accepting a Config but only using specific fields to keep the API clear. 227// Reconfiguring some fields is potentially dangerous so we should only 228// selectively enable it for fields where that is allowed. 229type ReloadableConfig struct { 230 // TrailingLogs controls how many logs we leave after a snapshot. This is used 231 // so that we can quickly replay logs on a follower instead of being forced to 232 // send an entire snapshot. The value passed here updates the setting at runtime 233 // which will take effect as soon as the next snapshot completes and truncation 234 // occurs. 235 TrailingLogs uint64 236 237 // SnapshotInterval controls how often we check if we should perform a snapshot. 238 // We randomly stagger between this value and 2x this value to avoid the entire 239 // cluster from performing a snapshot at once. 240 SnapshotInterval time.Duration 241 242 // SnapshotThreshold controls how many outstanding logs there must be before 243 // we perform a snapshot. This is to prevent excessive snapshots when we can 244 // just replay a small set of logs. 245 SnapshotThreshold uint64 246} 247 248// apply sets the reloadable fields on the passed Config to the values in 249// `ReloadableConfig`. It returns a copy of Config with the fields from this 250// ReloadableConfig set. 251func (rc *ReloadableConfig) apply(to Config) Config { 252 to.TrailingLogs = rc.TrailingLogs 253 to.SnapshotInterval = rc.SnapshotInterval 254 to.SnapshotThreshold = rc.SnapshotThreshold 255 return to 256} 257 258// fromConfig copies the reloadable fields from the passed Config. 259func (rc *ReloadableConfig) fromConfig(from Config) { 260 rc.TrailingLogs = from.TrailingLogs 261 rc.SnapshotInterval = from.SnapshotInterval 262 rc.SnapshotThreshold = from.SnapshotThreshold 263} 264 265// DefaultConfig returns a Config with usable defaults. 266func DefaultConfig() *Config { 267 return &Config{ 268 ProtocolVersion: ProtocolVersionMax, 269 HeartbeatTimeout: 1000 * time.Millisecond, 270 ElectionTimeout: 1000 * time.Millisecond, 271 CommitTimeout: 50 * time.Millisecond, 272 MaxAppendEntries: 64, 273 ShutdownOnRemove: true, 274 TrailingLogs: 10240, 275 SnapshotInterval: 120 * time.Second, 276 SnapshotThreshold: 8192, 277 LeaderLeaseTimeout: 500 * time.Millisecond, 278 LogLevel: "DEBUG", 279 } 280} 281 282// ValidateConfig is used to validate a sane configuration 283func ValidateConfig(config *Config) error { 284 // We don't actually support running as 0 in the library any more, but 285 // we do understand it. 286 protocolMin := ProtocolVersionMin 287 if protocolMin == 0 { 288 protocolMin = 1 289 } 290 if config.ProtocolVersion < protocolMin || 291 config.ProtocolVersion > ProtocolVersionMax { 292 return fmt.Errorf("ProtocolVersion %d must be >= %d and <= %d", 293 config.ProtocolVersion, protocolMin, ProtocolVersionMax) 294 } 295 if len(config.LocalID) == 0 { 296 return fmt.Errorf("LocalID cannot be empty") 297 } 298 if config.HeartbeatTimeout < 5*time.Millisecond { 299 return fmt.Errorf("HeartbeatTimeout is too low") 300 } 301 if config.ElectionTimeout < 5*time.Millisecond { 302 return fmt.Errorf("ElectionTimeout is too low") 303 } 304 if config.CommitTimeout < time.Millisecond { 305 return fmt.Errorf("CommitTimeout is too low") 306 } 307 if config.MaxAppendEntries <= 0 { 308 return fmt.Errorf("MaxAppendEntries must be positive") 309 } 310 if config.MaxAppendEntries > 1024 { 311 return fmt.Errorf("MaxAppendEntries is too large") 312 } 313 if config.SnapshotInterval < 5*time.Millisecond { 314 return fmt.Errorf("SnapshotInterval is too low") 315 } 316 if config.LeaderLeaseTimeout < 5*time.Millisecond { 317 return fmt.Errorf("LeaderLeaseTimeout is too low") 318 } 319 if config.LeaderLeaseTimeout > config.HeartbeatTimeout { 320 return fmt.Errorf("LeaderLeaseTimeout cannot be larger than heartbeat timeout") 321 } 322 if config.ElectionTimeout < config.HeartbeatTimeout { 323 return fmt.Errorf("ElectionTimeout must be equal or greater than Heartbeat Timeout") 324 } 325 return nil 326} 327