1package serf 2 3import ( 4 "io" 5 "log" 6 "os" 7 "time" 8 9 "github.com/hashicorp/memberlist" 10) 11 12// ProtocolVersionMap is the mapping of Serf delegate protocol versions 13// to memberlist protocol versions. We mask the memberlist protocols using 14// our own protocol version. 15var ProtocolVersionMap map[uint8]uint8 16 17func init() { 18 ProtocolVersionMap = map[uint8]uint8{ 19 5: 2, 20 4: 2, 21 3: 2, 22 2: 2, 23 } 24} 25 26// Config is the configuration for creating a Serf instance. 27type Config struct { 28 // The name of this node. This must be unique in the cluster. If this 29 // is not set, Serf will set it to the hostname of the running machine. 30 NodeName string 31 32 // The tags for this role, if any. This is used to provide arbitrary 33 // key/value metadata per-node. For example, a "role" tag may be used to 34 // differentiate "load-balancer" from a "web" role as parts of the same cluster. 35 // Tags are deprecating 'Role', and instead it acts as a special key in this 36 // map. 37 Tags map[string]string 38 39 // EventCh is a channel that receives all the Serf events. The events 40 // are sent on this channel in proper ordering. Care must be taken that 41 // this channel doesn't block, either by processing the events quick 42 // enough or buffering the channel, otherwise it can block state updates 43 // within Serf itself. If no EventCh is specified, no events will be fired, 44 // but point-in-time snapshots of members can still be retrieved by 45 // calling Members on Serf. 46 EventCh chan<- Event 47 48 // ProtocolVersion is the protocol version to speak. This must be between 49 // ProtocolVersionMin and ProtocolVersionMax. 50 ProtocolVersion uint8 51 52 // BroadcastTimeout is the amount of time to wait for a broadcast 53 // message to be sent to the cluster. Broadcast messages are used for 54 // things like leave messages and force remove messages. If this is not 55 // set, a timeout of 5 seconds will be set. 56 BroadcastTimeout time.Duration 57 58 // LeavePropagateDelay is for our leave (node dead) message to propagate 59 // through the cluster. In particular, we want to stay up long enough to 60 // service any probes from other nodes before they learn about us 61 // leaving and stop probing. Otherwise, we risk getting node failures as 62 // we leave. 63 LeavePropagateDelay time.Duration 64 65 // The settings below relate to Serf's event coalescence feature. Serf 66 // is able to coalesce multiple events into single events in order to 67 // reduce the amount of noise that is sent along the EventCh. For example 68 // if five nodes quickly join, the EventCh will be sent one EventMemberJoin 69 // containing the five nodes rather than five individual EventMemberJoin 70 // events. Coalescence can mitigate potential flapping behavior. 71 // 72 // Coalescence is disabled by default and can be enabled by setting 73 // CoalescePeriod. 74 // 75 // CoalescePeriod specifies the time duration to coalesce events. 76 // For example, if this is set to 5 seconds, then all events received 77 // within 5 seconds that can be coalesced will be. 78 // 79 // QuiescentPeriod specifies the duration of time where if no events 80 // are received, coalescence immediately happens. For example, if 81 // CoalscePeriod is set to 10 seconds but QuiscentPeriod is set to 2 82 // seconds, then the events will be coalesced and dispatched if no 83 // new events are received within 2 seconds of the last event. Otherwise, 84 // every event will always be delayed by at least 10 seconds. 85 CoalescePeriod time.Duration 86 QuiescentPeriod time.Duration 87 88 // The settings below relate to Serf's user event coalescing feature. 89 // The settings operate like above but only affect user messages and 90 // not the Member* messages that Serf generates. 91 UserCoalescePeriod time.Duration 92 UserQuiescentPeriod time.Duration 93 94 // The settings below relate to Serf keeping track of recently 95 // failed/left nodes and attempting reconnects. 96 // 97 // ReapInterval is the interval when the reaper runs. If this is not 98 // set (it is zero), it will be set to a reasonable default. 99 // 100 // ReconnectInterval is the interval when we attempt to reconnect 101 // to failed nodes. If this is not set (it is zero), it will be set 102 // to a reasonable default. 103 // 104 // ReconnectTimeout is the amount of time to attempt to reconnect to 105 // a failed node before giving up and considering it completely gone. 106 // 107 // TombstoneTimeout is the amount of time to keep around nodes 108 // that gracefully left as tombstones for syncing state with other 109 // Serf nodes. 110 ReapInterval time.Duration 111 ReconnectInterval time.Duration 112 ReconnectTimeout time.Duration 113 TombstoneTimeout time.Duration 114 115 // FlapTimeout is the amount of time less than which we consider a node 116 // being failed and rejoining looks like a flap for telemetry purposes. 117 // This should be set less than a typical reboot time, but large enough 118 // to see actual events, given our expected detection times for a failed 119 // node. 120 FlapTimeout time.Duration 121 122 // QueueCheckInterval is the interval at which we check the message 123 // queue to apply the warning and max depth. 124 QueueCheckInterval time.Duration 125 126 // QueueDepthWarning is used to generate warning message if the 127 // number of queued messages to broadcast exceeds this number. This 128 // is to provide the user feedback if events are being triggered 129 // faster than they can be disseminated 130 QueueDepthWarning int 131 132 // MaxQueueDepth is used to start dropping messages if the number 133 // of queued messages to broadcast exceeds this number. This is to 134 // prevent an unbounded growth of memory utilization 135 MaxQueueDepth int 136 137 // MinQueueDepth, if >0 will enforce a lower limit for dropping messages 138 // and then the max will be max(MinQueueDepth, 2*SizeOfCluster). This 139 // defaults to 0 which disables this dynamic sizing feature. If this is 140 // >0 then MaxQueueDepth will be ignored. 141 MinQueueDepth int 142 143 // RecentIntentTimeout is used to determine how long we store recent 144 // join and leave intents. This is used to guard against the case where 145 // Serf broadcasts an intent that arrives before the Memberlist event. 146 // It is important that this not be too short to avoid continuous 147 // rebroadcasting of dead events. 148 RecentIntentTimeout time.Duration 149 150 // EventBuffer is used to control how many events are buffered. 151 // This is used to prevent re-delivery of events to a client. The buffer 152 // must be large enough to handle all "recent" events, since Serf will 153 // not deliver messages that are older than the oldest entry in the buffer. 154 // Thus if a client is generating too many events, it's possible that the 155 // buffer gets overrun and messages are not delivered. 156 EventBuffer int 157 158 // QueryBuffer is used to control how many queries are buffered. 159 // This is used to prevent re-delivery of queries to a client. The buffer 160 // must be large enough to handle all "recent" events, since Serf will not 161 // deliver queries older than the oldest entry in the buffer. 162 // Thus if a client is generating too many queries, it's possible that the 163 // buffer gets overrun and messages are not delivered. 164 QueryBuffer int 165 166 // QueryTimeoutMult configures the default timeout multipler for a query to run if no 167 // specific value is provided. Queries are real-time by nature, where the 168 // reply is time sensitive. As a result, results are collected in an async 169 // fashion, however the query must have a bounded duration. We want the timeout 170 // to be long enough that all nodes have time to receive the message, run a handler, 171 // and generate a reply. Once the timeout is exceeded, any further replies are ignored. 172 // The default value is 173 // 174 // Timeout = GossipInterval * QueryTimeoutMult * log(N+1) 175 // 176 QueryTimeoutMult int 177 178 // QueryResponseSizeLimit and QuerySizeLimit limit the inbound and 179 // outbound payload sizes for queries, respectively. These must fit 180 // in a UDP packet with some additional overhead, so tuning these 181 // past the default values of 1024 will depend on your network 182 // configuration. 183 QueryResponseSizeLimit int 184 QuerySizeLimit int 185 186 // MemberlistConfig is the memberlist configuration that Serf will 187 // use to do the underlying membership management and gossip. Some 188 // fields in the MemberlistConfig will be overwritten by Serf no 189 // matter what: 190 // 191 // * Name - This will always be set to the same as the NodeName 192 // in this configuration. 193 // 194 // * Events - Serf uses a custom event delegate. 195 // 196 // * Delegate - Serf uses a custom delegate. 197 // 198 MemberlistConfig *memberlist.Config 199 200 // LogOutput is the location to write logs to. If this is not set, 201 // logs will go to stderr. 202 LogOutput io.Writer 203 204 // Logger is a custom logger which you provide. If Logger is set, it will use 205 // this for the internal logger. If Logger is not set, it will fall back to the 206 // behavior for using LogOutput. You cannot specify both LogOutput and Logger 207 // at the same time. 208 Logger *log.Logger 209 210 // SnapshotPath if provided is used to snapshot live nodes as well 211 // as lamport clock values. When Serf is started with a snapshot, 212 // it will attempt to join all the previously known nodes until one 213 // succeeds and will also avoid replaying old user events. 214 SnapshotPath string 215 216 // RejoinAfterLeave controls our interaction with the snapshot file. 217 // When set to false (default), a leave causes a Serf to not rejoin 218 // the cluster until an explicit join is received. If this is set to 219 // true, we ignore the leave, and rejoin the cluster on start. 220 RejoinAfterLeave bool 221 222 // EnableNameConflictResolution controls if Serf will actively attempt 223 // to resolve a name conflict. Since each Serf member must have a unique 224 // name, a cluster can run into issues if multiple nodes claim the same 225 // name. Without automatic resolution, Serf merely logs some warnings, but 226 // otherwise does not take any action. Automatic resolution detects the 227 // conflict and issues a special query which asks the cluster for the 228 // Name -> IP:Port mapping. If there is a simple majority of votes, that 229 // node stays while the other node will leave the cluster and exit. 230 EnableNameConflictResolution bool 231 232 // DisableCoordinates controls if Serf will maintain an estimate of this 233 // node's network coordinate internally. A network coordinate is useful 234 // for estimating the network distance (i.e. round trip time) between 235 // two nodes. Enabling this option adds some overhead to ping messages. 236 DisableCoordinates bool 237 238 // KeyringFile provides the location of a writable file where Serf can 239 // persist changes to the encryption keyring. 240 KeyringFile string 241 242 // Merge can be optionally provided to intercept a cluster merge 243 // and conditionally abort the merge. 244 Merge MergeDelegate 245 246 // UserEventSizeLimit is maximum byte size limit of user event `name` + `payload` in bytes. 247 // It's optimal to be relatively small, since it's going to be gossiped through the cluster. 248 UserEventSizeLimit int 249 250 // messageDropper is a callback used for selectively ignoring inbound 251 // gossip messages. This should only be used in unit tests needing careful 252 // control over sequencing of gossip arrival 253 // 254 // WARNING: this should ONLY be used in tests 255 messageDropper func(typ messageType) bool 256 257 // ReconnectTimeoutOverride is an optional interface which when present allows 258 // the application to cause reaping of a node to happen when it otherwise wouldn't 259 ReconnectTimeoutOverride ReconnectTimeoutOverrider 260 261 // ValidateNodeNames controls whether nodenames only 262 // contain alphanumeric, dashes and '.'characters 263 // and sets maximum length to 128 characters 264 ValidateNodeNames bool 265} 266 267// Init allocates the subdata structures 268func (c *Config) Init() { 269 if c.Tags == nil { 270 c.Tags = make(map[string]string) 271 } 272 if c.messageDropper == nil { 273 c.messageDropper = func(typ messageType) bool { 274 return false 275 } 276 } 277} 278 279// DefaultConfig returns a Config struct that contains reasonable defaults 280// for most of the configurations. 281func DefaultConfig() *Config { 282 hostname, err := os.Hostname() 283 if err != nil { 284 panic(err) 285 } 286 287 return &Config{ 288 NodeName: hostname, 289 BroadcastTimeout: 5 * time.Second, 290 LeavePropagateDelay: 1 * time.Second, 291 EventBuffer: 512, 292 QueryBuffer: 512, 293 LogOutput: os.Stderr, 294 ProtocolVersion: 4, 295 ReapInterval: 15 * time.Second, 296 RecentIntentTimeout: 5 * time.Minute, 297 ReconnectInterval: 30 * time.Second, 298 ReconnectTimeout: 24 * time.Hour, 299 QueueCheckInterval: 30 * time.Second, 300 QueueDepthWarning: 128, 301 MaxQueueDepth: 4096, 302 TombstoneTimeout: 24 * time.Hour, 303 FlapTimeout: 60 * time.Second, 304 MemberlistConfig: memberlist.DefaultLANConfig(), 305 QueryTimeoutMult: 16, 306 QueryResponseSizeLimit: 1024, 307 QuerySizeLimit: 1024, 308 EnableNameConflictResolution: true, 309 DisableCoordinates: false, 310 ValidateNodeNames: false, 311 UserEventSizeLimit: 512, 312 } 313} 314