1package memberlist 2 3import ( 4 "math" 5 "sync/atomic" 6 "time" 7) 8 9// suspicion manages the suspect timer for a node and provides an interface 10// to accelerate the timeout as we get more independent confirmations that 11// a node is suspect. 12type suspicion struct { 13 // n is the number of independent confirmations we've seen. This must 14 // be updated using atomic instructions to prevent contention with the 15 // timer callback. 16 n int32 17 18 // k is the number of independent confirmations we'd like to see in 19 // order to drive the timer to its minimum value. 20 k int32 21 22 // min is the minimum timer value. 23 min time.Duration 24 25 // max is the maximum timer value. 26 max time.Duration 27 28 // start captures the timestamp when we began the timer. This is used 29 // so we can calculate durations to feed the timer during updates in 30 // a way the achieves the overall time we'd like. 31 start time.Time 32 33 // timer is the underlying timer that implements the timeout. 34 timer *time.Timer 35 36 // f is the function to call when the timer expires. We hold on to this 37 // because there are cases where we call it directly. 38 timeoutFn func() 39 40 // confirmations is a map of "from" nodes that have confirmed a given 41 // node is suspect. This prevents double counting. 42 confirmations map[string]struct{} 43} 44 45// newSuspicion returns a timer started with the max time, and that will drive 46// to the min time after seeing k or more confirmations. The from node will be 47// excluded from confirmations since we might get our own suspicion message 48// gossiped back to us. The minimum time will be used if no confirmations are 49// called for (k <= 0). 50func newSuspicion(from string, k int, min time.Duration, max time.Duration, fn func(int)) *suspicion { 51 s := &suspicion{ 52 k: int32(k), 53 min: min, 54 max: max, 55 confirmations: make(map[string]struct{}), 56 } 57 58 // Exclude the from node from any confirmations. 59 s.confirmations[from] = struct{}{} 60 61 // Pass the number of confirmations into the timeout function for 62 // easy telemetry. 63 s.timeoutFn = func() { 64 fn(int(atomic.LoadInt32(&s.n))) 65 } 66 67 // If there aren't any confirmations to be made then take the min 68 // time from the start. 69 timeout := max 70 if k < 1 { 71 timeout = min 72 } 73 s.timer = time.AfterFunc(timeout, s.timeoutFn) 74 75 // Capture the start time right after starting the timer above so 76 // we should always err on the side of a little longer timeout if 77 // there's any preemption that separates this and the step above. 78 s.start = time.Now() 79 return s 80} 81 82// remainingSuspicionTime takes the state variables of the suspicion timer and 83// calculates the remaining time to wait before considering a node dead. The 84// return value can be negative, so be prepared to fire the timer immediately in 85// that case. 86func remainingSuspicionTime(n, k int32, elapsed time.Duration, min, max time.Duration) time.Duration { 87 frac := math.Log(float64(n)+1.0) / math.Log(float64(k)+1.0) 88 raw := max.Seconds() - frac*(max.Seconds()-min.Seconds()) 89 timeout := time.Duration(math.Floor(1000.0*raw)) * time.Millisecond 90 if timeout < min { 91 timeout = min 92 } 93 94 // We have to take into account the amount of time that has passed so 95 // far, so we get the right overall timeout. 96 return timeout - elapsed 97} 98 99// Confirm registers that a possibly new peer has also determined the given 100// node is suspect. This returns true if this was new information, and false 101// if it was a duplicate confirmation, or if we've got enough confirmations to 102// hit the minimum. 103func (s *suspicion) Confirm(from string) bool { 104 // If we've got enough confirmations then stop accepting them. 105 if atomic.LoadInt32(&s.n) >= s.k { 106 return false 107 } 108 109 // Only allow one confirmation from each possible peer. 110 if _, ok := s.confirmations[from]; ok { 111 return false 112 } 113 s.confirmations[from] = struct{}{} 114 115 // Compute the new timeout given the current number of confirmations and 116 // adjust the timer. If the timeout becomes negative *and* we can cleanly 117 // stop the timer then we will call the timeout function directly from 118 // here. 119 n := atomic.AddInt32(&s.n, 1) 120 elapsed := time.Since(s.start) 121 remaining := remainingSuspicionTime(n, s.k, elapsed, s.min, s.max) 122 if s.timer.Stop() { 123 if remaining > 0 { 124 s.timer.Reset(remaining) 125 } else { 126 go s.timeoutFn() 127 } 128 } 129 return true 130} 131