1package serf
2
3import (
4	"io"
5	"log"
6	"os"
7	"time"
8
9	"github.com/hashicorp/memberlist"
10)
11
// ProtocolVersionMap is the mapping of Serf delegate protocol versions
// to memberlist protocol versions. We mask the memberlist protocols using
// our own protocol version.
//
// The table is built at declaration time rather than inside an init
// function, so it is already populated when other package-level variables
// or init functions in this package consult it.
var ProtocolVersionMap = map[uint8]uint8{
	// Every Serf protocol version listed here speaks memberlist protocol 2.
	5: 2,
	4: 2,
	3: 2,
	2: 2,
}
25
26// Config is the configuration for creating a Serf instance.
27type Config struct {
28	// The name of this node. This must be unique in the cluster. If this
29	// is not set, Serf will set it to the hostname of the running machine.
30	NodeName string
31
32	// The tags for this role, if any. This is used to provide arbitrary
33	// key/value metadata per-node. For example, a "role" tag may be used to
34	// differentiate "load-balancer" from a "web" role as parts of the same cluster.
35	// Tags are deprecating 'Role', and instead it acts as a special key in this
36	// map.
37	Tags map[string]string
38
39	// EventCh is a channel that receives all the Serf events. The events
40	// are sent on this channel in proper ordering. Care must be taken that
41	// this channel doesn't block, either by processing the events quick
42	// enough or buffering the channel, otherwise it can block state updates
43	// within Serf itself. If no EventCh is specified, no events will be fired,
44	// but point-in-time snapshots of members can still be retrieved by
45	// calling Members on Serf.
46	EventCh chan<- Event
47
48	// ProtocolVersion is the protocol version to speak. This must be between
49	// ProtocolVersionMin and ProtocolVersionMax.
50	ProtocolVersion uint8
51
52	// BroadcastTimeout is the amount of time to wait for a broadcast
53	// message to be sent to the cluster. Broadcast messages are used for
54	// things like leave messages and force remove messages. If this is not
55	// set, a timeout of 5 seconds will be set.
56	BroadcastTimeout time.Duration
57
58	// LeavePropagateDelay is for our leave (node dead) message to propagate
59	// through the cluster. In particular, we want to stay up long enough to
60	// service any probes from other nodes before they learn about us
61	// leaving and stop probing. Otherwise, we risk getting node failures as
62	// we leave.
63	LeavePropagateDelay time.Duration
64
65	// The settings below relate to Serf's event coalescence feature. Serf
66	// is able to coalesce multiple events into single events in order to
67	// reduce the amount of noise that is sent along the EventCh. For example
68	// if five nodes quickly join, the EventCh will be sent one EventMemberJoin
69	// containing the five nodes rather than five individual EventMemberJoin
70	// events. Coalescence can mitigate potential flapping behavior.
71	//
72	// Coalescence is disabled by default and can be enabled by setting
73	// CoalescePeriod.
74	//
75	// CoalescePeriod specifies the time duration to coalesce events.
76	// For example, if this is set to 5 seconds, then all events received
77	// within 5 seconds that can be coalesced will be.
78	//
79	// QuiescentPeriod specifies the duration of time where if no events
80	// are received, coalescence immediately happens. For example, if
81	// CoalscePeriod is set to 10 seconds but QuiscentPeriod is set to 2
82	// seconds, then the events will be coalesced and dispatched if no
83	// new events are received within 2 seconds of the last event. Otherwise,
84	// every event will always be delayed by at least 10 seconds.
85	CoalescePeriod  time.Duration
86	QuiescentPeriod time.Duration
87
88	// The settings below relate to Serf's user event coalescing feature.
89	// The settings operate like above but only affect user messages and
90	// not the Member* messages that Serf generates.
91	UserCoalescePeriod  time.Duration
92	UserQuiescentPeriod time.Duration
93
94	// The settings below relate to Serf keeping track of recently
95	// failed/left nodes and attempting reconnects.
96	//
97	// ReapInterval is the interval when the reaper runs. If this is not
98	// set (it is zero), it will be set to a reasonable default.
99	//
100	// ReconnectInterval is the interval when we attempt to reconnect
101	// to failed nodes. If this is not set (it is zero), it will be set
102	// to a reasonable default.
103	//
104	// ReconnectTimeout is the amount of time to attempt to reconnect to
105	// a failed node before giving up and considering it completely gone.
106	//
107	// TombstoneTimeout is the amount of time to keep around nodes
108	// that gracefully left as tombstones for syncing state with other
109	// Serf nodes.
110	ReapInterval      time.Duration
111	ReconnectInterval time.Duration
112	ReconnectTimeout  time.Duration
113	TombstoneTimeout  time.Duration
114
115	// FlapTimeout is the amount of time less than which we consider a node
116	// being failed and rejoining looks like a flap for telemetry purposes.
117	// This should be set less than a typical reboot time, but large enough
118	// to see actual events, given our expected detection times for a failed
119	// node.
120	FlapTimeout time.Duration
121
122	// QueueCheckInterval is the interval at which we check the message
123	// queue to apply the warning and max depth.
124	QueueCheckInterval time.Duration
125
126	// QueueDepthWarning is used to generate warning message if the
127	// number of queued messages to broadcast exceeds this number. This
128	// is to provide the user feedback if events are being triggered
129	// faster than they can be disseminated
130	QueueDepthWarning int
131
132	// MaxQueueDepth is used to start dropping messages if the number
133	// of queued messages to broadcast exceeds this number. This is to
134	// prevent an unbounded growth of memory utilization
135	MaxQueueDepth int
136
137	// MinQueueDepth, if >0 will enforce a lower limit for dropping messages
138	// and then the max will be max(MinQueueDepth, 2*SizeOfCluster). This
139	// defaults to 0 which disables this dynamic sizing feature. If this is
140	// >0 then MaxQueueDepth will be ignored.
141	MinQueueDepth int
142
143	// RecentIntentTimeout is used to determine how long we store recent
144	// join and leave intents. This is used to guard against the case where
145	// Serf broadcasts an intent that arrives before the Memberlist event.
146	// It is important that this not be too short to avoid continuous
147	// rebroadcasting of dead events.
148	RecentIntentTimeout time.Duration
149
150	// EventBuffer is used to control how many events are buffered.
151	// This is used to prevent re-delivery of events to a client. The buffer
152	// must be large enough to handle all "recent" events, since Serf will
153	// not deliver messages that are older than the oldest entry in the buffer.
154	// Thus if a client is generating too many events, it's possible that the
155	// buffer gets overrun and messages are not delivered.
156	EventBuffer int
157
158	// QueryBuffer is used to control how many queries are buffered.
159	// This is used to prevent re-delivery of queries to a client. The buffer
160	// must be large enough to handle all "recent" events, since Serf will not
161	// deliver queries older than the oldest entry in the buffer.
162	// Thus if a client is generating too many queries, it's possible that the
163	// buffer gets overrun and messages are not delivered.
164	QueryBuffer int
165
166	// QueryTimeoutMult configures the default timeout multipler for a query to run if no
167	// specific value is provided. Queries are real-time by nature, where the
168	// reply is time sensitive. As a result, results are collected in an async
169	// fashion, however the query must have a bounded duration. We want the timeout
170	// to be long enough that all nodes have time to receive the message, run a handler,
171	// and generate a reply. Once the timeout is exceeded, any further replies are ignored.
172	// The default value is
173	//
174	// Timeout = GossipInterval * QueryTimeoutMult * log(N+1)
175	//
176	QueryTimeoutMult int
177
178	// QueryResponseSizeLimit and QuerySizeLimit limit the inbound and
179	// outbound payload sizes for queries, respectively. These must fit
180	// in a UDP packet with some additional overhead, so tuning these
181	// past the default values of 1024 will depend on your network
182	// configuration.
183	QueryResponseSizeLimit int
184	QuerySizeLimit         int
185
186	// MemberlistConfig is the memberlist configuration that Serf will
187	// use to do the underlying membership management and gossip. Some
188	// fields in the MemberlistConfig will be overwritten by Serf no
189	// matter what:
190	//
191	//   * Name - This will always be set to the same as the NodeName
192	//     in this configuration.
193	//
194	//   * Events - Serf uses a custom event delegate.
195	//
196	//   * Delegate - Serf uses a custom delegate.
197	//
198	MemberlistConfig *memberlist.Config
199
200	// LogOutput is the location to write logs to. If this is not set,
201	// logs will go to stderr.
202	LogOutput io.Writer
203
204	// Logger is a custom logger which you provide. If Logger is set, it will use
205	// this for the internal logger. If Logger is not set, it will fall back to the
206	// behavior for using LogOutput. You cannot specify both LogOutput and Logger
207	// at the same time.
208	Logger *log.Logger
209
210	// SnapshotPath if provided is used to snapshot live nodes as well
211	// as lamport clock values. When Serf is started with a snapshot,
212	// it will attempt to join all the previously known nodes until one
213	// succeeds and will also avoid replaying old user events.
214	SnapshotPath string
215
216	// RejoinAfterLeave controls our interaction with the snapshot file.
217	// When set to false (default), a leave causes a Serf to not rejoin
218	// the cluster until an explicit join is received. If this is set to
219	// true, we ignore the leave, and rejoin the cluster on start.
220	RejoinAfterLeave bool
221
222	// EnableNameConflictResolution controls if Serf will actively attempt
223	// to resolve a name conflict. Since each Serf member must have a unique
224	// name, a cluster can run into issues if multiple nodes claim the same
225	// name. Without automatic resolution, Serf merely logs some warnings, but
226	// otherwise does not take any action. Automatic resolution detects the
227	// conflict and issues a special query which asks the cluster for the
228	// Name -> IP:Port mapping. If there is a simple majority of votes, that
229	// node stays while the other node will leave the cluster and exit.
230	EnableNameConflictResolution bool
231
232	// DisableCoordinates controls if Serf will maintain an estimate of this
233	// node's network coordinate internally. A network coordinate is useful
234	// for estimating the network distance (i.e. round trip time) between
235	// two nodes. Enabling this option adds some overhead to ping messages.
236	DisableCoordinates bool
237
238	// KeyringFile provides the location of a writable file where Serf can
239	// persist changes to the encryption keyring.
240	KeyringFile string
241
242	// Merge can be optionally provided to intercept a cluster merge
243	// and conditionally abort the merge.
244	Merge MergeDelegate
245
246	// UserEventSizeLimit is maximum byte size limit of user event `name` + `payload` in bytes.
247	// It's optimal to be relatively small, since it's going to be gossiped through the cluster.
248	UserEventSizeLimit int
249
250	// messageDropper is a callback used for selectively ignoring inbound
251	// gossip messages. This should only be used in unit tests needing careful
252	// control over sequencing of gossip arrival
253	//
254	// WARNING: this should ONLY be used in tests
255	messageDropper func(typ messageType) bool
256
257	// ReconnectTimeoutOverride is an optional interface which when present allows
258	// the application to cause reaping of a node to happen when it otherwise wouldn't
259	ReconnectTimeoutOverride ReconnectTimeoutOverrider
260
261	// ValidateNodeNames controls whether nodenames only
262	// contain alphanumeric, dashes and '.'characters
263	// and sets maximum length to 128 characters
264	ValidateNodeNames bool
265}
266
267// Init allocates the subdata structures
268func (c *Config) Init() {
269	if c.Tags == nil {
270		c.Tags = make(map[string]string)
271	}
272	if c.messageDropper == nil {
273		c.messageDropper = func(typ messageType) bool {
274			return false
275		}
276	}
277}
278
279// DefaultConfig returns a Config struct that contains reasonable defaults
280// for most of the configurations.
281func DefaultConfig() *Config {
282	hostname, err := os.Hostname()
283	if err != nil {
284		panic(err)
285	}
286
287	return &Config{
288		NodeName:                     hostname,
289		BroadcastTimeout:             5 * time.Second,
290		LeavePropagateDelay:          1 * time.Second,
291		EventBuffer:                  512,
292		QueryBuffer:                  512,
293		LogOutput:                    os.Stderr,
294		ProtocolVersion:              4,
295		ReapInterval:                 15 * time.Second,
296		RecentIntentTimeout:          5 * time.Minute,
297		ReconnectInterval:            30 * time.Second,
298		ReconnectTimeout:             24 * time.Hour,
299		QueueCheckInterval:           30 * time.Second,
300		QueueDepthWarning:            128,
301		MaxQueueDepth:                4096,
302		TombstoneTimeout:             24 * time.Hour,
303		FlapTimeout:                  60 * time.Second,
304		MemberlistConfig:             memberlist.DefaultLANConfig(),
305		QueryTimeoutMult:             16,
306		QueryResponseSizeLimit:       1024,
307		QuerySizeLimit:               1024,
308		EnableNameConflictResolution: true,
309		DisableCoordinates:           false,
310		ValidateNodeNames:            false,
311		UserEventSizeLimit:           512,
312	}
313}
314