1package memberlist
2
3import (
4	"fmt"
5	"io"
6	"log"
7	"net"
8	"os"
9	"strings"
10	"time"
11
12	multierror "github.com/hashicorp/go-multierror"
13)
14
15type Config struct {
16	// The name of this node. This must be unique in the cluster.
17	Name string
18
19	// Transport is a hook for providing custom code to communicate with
20	// other nodes. If this is left nil, then memberlist will by default
21	// make a NetTransport using BindAddr and BindPort from this structure.
22	Transport Transport
23
24	// Configuration related to what address to bind to and ports to
25	// listen on. The port is used for both UDP and TCP gossip. It is
26	// assumed other nodes are running on this port, but they do not need
27	// to.
28	BindAddr string
29	BindPort int
30
31	// Configuration related to what address to advertise to other
32	// cluster members. Used for nat traversal.
33	AdvertiseAddr string
34	AdvertisePort int
35
36	// ProtocolVersion is the configured protocol version that we
37	// will _speak_. This must be between ProtocolVersionMin and
38	// ProtocolVersionMax.
39	ProtocolVersion uint8
40
41	// TCPTimeout is the timeout for establishing a stream connection with
42	// a remote node for a full state sync, and for stream read and write
43	// operations. This is a legacy name for backwards compatibility, but
44	// should really be called StreamTimeout now that we have generalized
45	// the transport.
46	TCPTimeout time.Duration
47
48	// IndirectChecks is the number of nodes that will be asked to perform
49	// an indirect probe of a node in the case a direct probe fails. Memberlist
50	// waits for an ack from any single indirect node, so increasing this
51	// number will increase the likelihood that an indirect probe will succeed
52	// at the expense of bandwidth.
53	IndirectChecks int
54
55	// RetransmitMult is the multiplier for the number of retransmissions
56	// that are attempted for messages broadcasted over gossip. The actual
57	// count of retransmissions is calculated using the formula:
58	//
59	//   Retransmits = RetransmitMult * log(N+1)
60	//
61	// This allows the retransmits to scale properly with cluster size. The
62	// higher the multiplier, the more likely a failed broadcast is to converge
63	// at the expense of increased bandwidth.
64	RetransmitMult int
65
66	// SuspicionMult is the multiplier for determining the time an
67	// inaccessible node is considered suspect before declaring it dead.
68	// The actual timeout is calculated using the formula:
69	//
70	//   SuspicionTimeout = SuspicionMult * log(N+1) * ProbeInterval
71	//
72	// This allows the timeout to scale properly with expected propagation
73	// delay with a larger cluster size. The higher the multiplier, the longer
74	// an inaccessible node is considered part of the cluster before declaring
75	// it dead, giving that suspect node more time to refute if it is indeed
76	// still alive.
77	SuspicionMult int
78
79	// SuspicionMaxTimeoutMult is the multiplier applied to the
80	// SuspicionTimeout used as an upper bound on detection time. This max
81	// timeout is calculated using the formula:
82	//
83	// SuspicionMaxTimeout = SuspicionMaxTimeoutMult * SuspicionTimeout
84	//
85	// If everything is working properly, confirmations from other nodes will
86	// accelerate suspicion timers in a manner which will cause the timeout
87	// to reach the base SuspicionTimeout before that elapses, so this value
88	// will typically only come into play if a node is experiencing issues
89	// communicating with other nodes. It should be set to a something fairly
90	// large so that a node having problems will have a lot of chances to
91	// recover before falsely declaring other nodes as failed, but short
92	// enough for a legitimately isolated node to still make progress marking
93	// nodes failed in a reasonable amount of time.
94	SuspicionMaxTimeoutMult int
95
96	// PushPullInterval is the interval between complete state syncs.
97	// Complete state syncs are done with a single node over TCP and are
98	// quite expensive relative to standard gossiped messages. Setting this
99	// to zero will disable state push/pull syncs completely.
100	//
101	// Setting this interval lower (more frequent) will increase convergence
102	// speeds across larger clusters at the expense of increased bandwidth
103	// usage.
104	PushPullInterval time.Duration
105
106	// ProbeInterval and ProbeTimeout are used to configure probing
107	// behavior for memberlist.
108	//
109	// ProbeInterval is the interval between random node probes. Setting
110	// this lower (more frequent) will cause the memberlist cluster to detect
111	// failed nodes more quickly at the expense of increased bandwidth usage.
112	//
113	// ProbeTimeout is the timeout to wait for an ack from a probed node
114	// before assuming it is unhealthy. This should be set to 99-percentile
115	// of RTT (round-trip time) on your network.
116	ProbeInterval time.Duration
117	ProbeTimeout  time.Duration
118
119	// DisableTcpPings will turn off the fallback TCP pings that are attempted
120	// if the direct UDP ping fails. These get pipelined along with the
121	// indirect UDP pings.
122	DisableTcpPings bool
123
124	// DisableTcpPingsForNode is like DisableTcpPings, but lets you control
125	// whether to perform TCP pings on a node-by-node basis.
126	DisableTcpPingsForNode func(nodeName string) bool
127
128	// AwarenessMaxMultiplier will increase the probe interval if the node
129	// becomes aware that it might be degraded and not meeting the soft real
130	// time requirements to reliably probe other nodes.
131	AwarenessMaxMultiplier int
132
133	// GossipInterval and GossipNodes are used to configure the gossip
134	// behavior of memberlist.
135	//
136	// GossipInterval is the interval between sending messages that need
137	// to be gossiped that haven't been able to piggyback on probing messages.
138	// If this is set to zero, non-piggyback gossip is disabled. By lowering
139	// this value (more frequent) gossip messages are propagated across
140	// the cluster more quickly at the expense of increased bandwidth.
141	//
142	// GossipNodes is the number of random nodes to send gossip messages to
143	// per GossipInterval. Increasing this number causes the gossip messages
144	// to propagate across the cluster more quickly at the expense of
145	// increased bandwidth.
146	//
147	// GossipToTheDeadTime is the interval after which a node has died that
148	// we will still try to gossip to it. This gives it a chance to refute.
149	GossipInterval      time.Duration
150	GossipNodes         int
151	GossipToTheDeadTime time.Duration
152
153	// GossipVerifyIncoming controls whether to enforce encryption for incoming
154	// gossip. It is used for upshifting from unencrypted to encrypted gossip on
155	// a running cluster.
156	GossipVerifyIncoming bool
157
158	// GossipVerifyOutgoing controls whether to enforce encryption for outgoing
159	// gossip. It is used for upshifting from unencrypted to encrypted gossip on
160	// a running cluster.
161	GossipVerifyOutgoing bool
162
163	// EnableCompression is used to control message compression. This can
164	// be used to reduce bandwidth usage at the cost of slightly more CPU
165	// utilization. This is only available starting at protocol version 1.
166	EnableCompression bool
167
168	// SecretKey is used to initialize the primary encryption key in a keyring.
169	// The primary encryption key is the only key used to encrypt messages and
170	// the first key used while attempting to decrypt messages. Providing a
171	// value for this primary key will enable message-level encryption and
172	// verification, and automatically install the key onto the keyring.
173	// The value should be either 16, 24, or 32 bytes to select AES-128,
174	// AES-192, or AES-256.
175	SecretKey []byte
176
177	// The keyring holds all of the encryption keys used internally. It is
178	// automatically initialized using the SecretKey and SecretKeys values.
179	Keyring *Keyring
180
181	// Delegate and Events are delegates for receiving and providing
182	// data to memberlist via callback mechanisms. For Delegate, see
183	// the Delegate interface. For Events, see the EventDelegate interface.
184	//
185	// The DelegateProtocolMin/Max are used to guarantee protocol-compatibility
186	// for any custom messages that the delegate might do (broadcasts,
187	// local/remote state, etc.). If you don't set these, then the protocol
188	// versions will just be zero, and version compliance won't be done.
189	Delegate                Delegate
190	DelegateProtocolVersion uint8
191	DelegateProtocolMin     uint8
192	DelegateProtocolMax     uint8
193	Events                  EventDelegate
194	Conflict                ConflictDelegate
195	Merge                   MergeDelegate
196	Ping                    PingDelegate
197	Alive                   AliveDelegate
198
199	// DNSConfigPath points to the system's DNS config file, usually located
200	// at /etc/resolv.conf. It can be overridden via config for easier testing.
201	DNSConfigPath string
202
203	// LogOutput is the writer where logs should be sent. If this is not
204	// set, logging will go to stderr by default. You cannot specify both LogOutput
205	// and Logger at the same time.
206	LogOutput io.Writer
207
208	// Logger is a custom logger which you provide. If Logger is set, it will use
209	// this for the internal logger. If Logger is not set, it will fall back to the
210	// behavior for using LogOutput. You cannot specify both LogOutput and Logger
211	// at the same time.
212	Logger *log.Logger
213
214	// Size of Memberlist's internal channel which handles UDP messages. The
215	// size of this determines the size of the queue which Memberlist will keep
216	// while UDP messages are handled.
217	HandoffQueueDepth int
218
219	// Maximum number of bytes that memberlist will put in a packet (this
220	// will be for UDP packets by default with a NetTransport). A safe value
221	// for this is typically 1400 bytes (which is the default). However,
222	// depending on your network's MTU (Maximum Transmission Unit) you may
223	// be able to increase this to get more content into each gossip packet.
224	// This is a legacy name for backward compatibility but should really be
225	// called PacketBufferSize now that we have generalized the transport.
226	UDPBufferSize int
227
228	// DeadNodeReclaimTime controls the time before a dead node's name can be
229	// reclaimed by one with a different address or port. By default, this is 0,
230	// meaning nodes cannot be reclaimed this way.
231	DeadNodeReclaimTime time.Duration
232
233	// RequireNodeNames controls if the name of a node is required when sending
234	// a message to that node.
235	RequireNodeNames bool
236	// CIDRsAllowed If nil, allow any connection (default), otherwise specify all networks
237	// allowed to connect (you must specify IPv6/IPv4 separately)
238	// Using [] will block all connections.
239	CIDRsAllowed []net.IPNet
240}
241
242// ParseCIDRs return a possible empty list of all Network that have been parsed
243// In case of error, it returns succesfully parsed CIDRs and the last error found
244func ParseCIDRs(v []string) ([]net.IPNet, error) {
245	nets := make([]net.IPNet, 0)
246	if v == nil {
247		return nets, nil
248	}
249	var errs error
250	hasErrors := false
251	for _, p := range v {
252		_, net, err := net.ParseCIDR(strings.TrimSpace(p))
253		if err != nil {
254			err = fmt.Errorf("invalid cidr: %s", p)
255			errs = multierror.Append(errs, err)
256			hasErrors = true
257		} else {
258			nets = append(nets, *net)
259		}
260	}
261	if !hasErrors {
262		errs = nil
263	}
264	return nets, errs
265}
266
267// DefaultLANConfig returns a sane set of configurations for Memberlist.
268// It uses the hostname as the node name, and otherwise sets very conservative
269// values that are sane for most LAN environments. The default configuration
270// errs on the side of caution, choosing values that are optimized
271// for higher convergence at the cost of higher bandwidth usage. Regardless,
272// these values are a good starting point when getting started with memberlist.
273func DefaultLANConfig() *Config {
274	hostname, _ := os.Hostname()
275	return &Config{
276		Name:                    hostname,
277		BindAddr:                "0.0.0.0",
278		BindPort:                7946,
279		AdvertiseAddr:           "",
280		AdvertisePort:           7946,
281		ProtocolVersion:         ProtocolVersion2Compatible,
282		TCPTimeout:              10 * time.Second,       // Timeout after 10 seconds
283		IndirectChecks:          3,                      // Use 3 nodes for the indirect ping
284		RetransmitMult:          4,                      // Retransmit a message 4 * log(N+1) nodes
285		SuspicionMult:           4,                      // Suspect a node for 4 * log(N+1) * Interval
286		SuspicionMaxTimeoutMult: 6,                      // For 10k nodes this will give a max timeout of 120 seconds
287		PushPullInterval:        30 * time.Second,       // Low frequency
288		ProbeTimeout:            500 * time.Millisecond, // Reasonable RTT time for LAN
289		ProbeInterval:           1 * time.Second,        // Failure check every second
290		DisableTcpPings:         false,                  // TCP pings are safe, even with mixed versions
291		AwarenessMaxMultiplier:  8,                      // Probe interval backs off to 8 seconds
292
293		GossipNodes:          3,                      // Gossip to 3 nodes
294		GossipInterval:       200 * time.Millisecond, // Gossip more rapidly
295		GossipToTheDeadTime:  30 * time.Second,       // Same as push/pull
296		GossipVerifyIncoming: true,
297		GossipVerifyOutgoing: true,
298
299		EnableCompression: true, // Enable compression by default
300
301		SecretKey: nil,
302		Keyring:   nil,
303
304		DNSConfigPath: "/etc/resolv.conf",
305
306		HandoffQueueDepth: 1024,
307		UDPBufferSize:     1400,
308		CIDRsAllowed:      nil, // same as allow all
309	}
310}
311
312// DefaultWANConfig works like DefaultConfig, however it returns a configuration
313// that is optimized for most WAN environments. The default configuration is
314// still very conservative and errs on the side of caution.
315func DefaultWANConfig() *Config {
316	conf := DefaultLANConfig()
317	conf.TCPTimeout = 30 * time.Second
318	conf.SuspicionMult = 6
319	conf.PushPullInterval = 60 * time.Second
320	conf.ProbeTimeout = 3 * time.Second
321	conf.ProbeInterval = 5 * time.Second
322	conf.GossipNodes = 4 // Gossip less frequently, but to an additional node
323	conf.GossipInterval = 500 * time.Millisecond
324	conf.GossipToTheDeadTime = 60 * time.Second
325	return conf
326}
327
328// IPMustBeChecked return true if IPAllowed must be called
329func (c *Config) IPMustBeChecked() bool {
330	return len(c.CIDRsAllowed) > 0
331}
332
333// IPAllowed return an error if access to memberlist is denied
334func (c *Config) IPAllowed(ip net.IP) error {
335	if !c.IPMustBeChecked() {
336		return nil
337	}
338	for _, n := range c.CIDRsAllowed {
339		if n.Contains(ip) {
340			return nil
341		}
342	}
343	return fmt.Errorf("%s is not allowed", ip)
344}
345
346// DefaultLocalConfig works like DefaultConfig, however it returns a configuration
347// that is optimized for a local loopback environments. The default configuration is
348// still very conservative and errs on the side of caution.
349func DefaultLocalConfig() *Config {
350	conf := DefaultLANConfig()
351	conf.TCPTimeout = time.Second
352	conf.IndirectChecks = 1
353	conf.RetransmitMult = 2
354	conf.SuspicionMult = 3
355	conf.PushPullInterval = 15 * time.Second
356	conf.ProbeTimeout = 200 * time.Millisecond
357	conf.ProbeInterval = time.Second
358	conf.GossipInterval = 100 * time.Millisecond
359	conf.GossipToTheDeadTime = 15 * time.Second
360	return conf
361}
362
363// Returns whether or not encryption is enabled
364func (c *Config) EncryptionEnabled() bool {
365	return c.Keyring != nil && len(c.Keyring.GetKeys()) > 0
366}
367