1package memberlist
2
3import (
4	"math"
5	"sync/atomic"
6	"time"
7)
8
9// suspicion manages the suspect timer for a node and provides an interface
10// to accelerate the timeout as we get more independent confirmations that
11// a node is suspect.
12type suspicion struct {
13	// n is the number of independent confirmations we've seen. This must
14	// be updated using atomic instructions to prevent contention with the
15	// timer callback.
16	n int32
17
18	// k is the number of independent confirmations we'd like to see in
19	// order to drive the timer to its minimum value.
20	k int32
21
22	// min is the minimum timer value.
23	min time.Duration
24
25	// max is the maximum timer value.
26	max time.Duration
27
28	// start captures the timestamp when we began the timer. This is used
29	// so we can calculate durations to feed the timer during updates in
30	// a way the achieves the overall time we'd like.
31	start time.Time
32
33	// timer is the underlying timer that implements the timeout.
34	timer *time.Timer
35
36	// f is the function to call when the timer expires. We hold on to this
37	// because there are cases where we call it directly.
38	timeoutFn func()
39
40	// confirmations is a map of "from" nodes that have confirmed a given
41	// node is suspect. This prevents double counting.
42	confirmations map[string]struct{}
43}
44
45// newSuspicion returns a timer started with the max time, and that will drive
46// to the min time after seeing k or more confirmations. The from node will be
47// excluded from confirmations since we might get our own suspicion message
48// gossiped back to us. The minimum time will be used if no confirmations are
49// called for (k <= 0).
50func newSuspicion(from string, k int, min time.Duration, max time.Duration, fn func(int)) *suspicion {
51	s := &suspicion{
52		k:             int32(k),
53		min:           min,
54		max:           max,
55		confirmations: make(map[string]struct{}),
56	}
57
58	// Exclude the from node from any confirmations.
59	s.confirmations[from] = struct{}{}
60
61	// Pass the number of confirmations into the timeout function for
62	// easy telemetry.
63	s.timeoutFn = func() {
64		fn(int(atomic.LoadInt32(&s.n)))
65	}
66
67	// If there aren't any confirmations to be made then take the min
68	// time from the start.
69	timeout := max
70	if k < 1 {
71		timeout = min
72	}
73	s.timer = time.AfterFunc(timeout, s.timeoutFn)
74
75	// Capture the start time right after starting the timer above so
76	// we should always err on the side of a little longer timeout if
77	// there's any preemption that separates this and the step above.
78	s.start = time.Now()
79	return s
80}
81
82// remainingSuspicionTime takes the state variables of the suspicion timer and
83// calculates the remaining time to wait before considering a node dead. The
84// return value can be negative, so be prepared to fire the timer immediately in
85// that case.
86func remainingSuspicionTime(n, k int32, elapsed time.Duration, min, max time.Duration) time.Duration {
87	frac := math.Log(float64(n)+1.0) / math.Log(float64(k)+1.0)
88	raw := max.Seconds() - frac*(max.Seconds()-min.Seconds())
89	timeout := time.Duration(math.Floor(1000.0*raw)) * time.Millisecond
90	if timeout < min {
91		timeout = min
92	}
93
94	// We have to take into account the amount of time that has passed so
95	// far, so we get the right overall timeout.
96	return timeout - elapsed
97}
98
99// Confirm registers that a possibly new peer has also determined the given
100// node is suspect. This returns true if this was new information, and false
101// if it was a duplicate confirmation, or if we've got enough confirmations to
102// hit the minimum.
103func (s *suspicion) Confirm(from string) bool {
104	// If we've got enough confirmations then stop accepting them.
105	if atomic.LoadInt32(&s.n) >= s.k {
106		return false
107	}
108
109	// Only allow one confirmation from each possible peer.
110	if _, ok := s.confirmations[from]; ok {
111		return false
112	}
113	s.confirmations[from] = struct{}{}
114
115	// Compute the new timeout given the current number of confirmations and
116	// adjust the timer. If the timeout becomes negative *and* we can cleanly
117	// stop the timer then we will call the timeout function directly from
118	// here.
119	n := atomic.AddInt32(&s.n, 1)
120	elapsed := time.Since(s.start)
121	remaining := remainingSuspicionTime(n, s.k, elapsed, s.min, s.max)
122	if s.timer.Stop() {
123		if remaining > 0 {
124			s.timer.Reset(remaining)
125		} else {
126			go s.timeoutFn()
127		}
128	}
129	return true
130}
131