serf/serf/snapshot.go

package serf

import (
	"bufio"
	"fmt"
	"log"
	"math/rand"
	"net"
	"os"
	"strconv"
	"strings"
	"time"

	"github.com/armon/go-metrics"
)

/*
Serf supports using a "snapshot" file that contains various
transactional data that is used to help Serf recover quickly
and gracefully from a failure. We append member events, as well
as the latest clock values to the file during normal operation,
and periodically checkpoint and roll over the file. During a restore,
we can replay the various member events to recall a list of known
nodes to re-join, as well as restore our clock values to avoid replaying
old events.
*/

const (
	// flushInterval is how often we force a flush of the snapshot file
	flushInterval = 500 * time.Millisecond

	// clockUpdateInterval is how often we fetch the current lamport time of the cluster and write to the snapshot file
	clockUpdateInterval = 500 * time.Millisecond

	// tmpExt is the extension we use for the temporary file during compaction
	tmpExt = ".compact"

	// snapshotErrorRecoveryInterval is how often we attempt to recover from
	// errors writing to the snapshot file.
	snapshotErrorRecoveryInterval = 30 * time.Second

	// eventChSize is the size of the event buffers between Serf and the
	// consuming application. If this is exhausted we will block Serf and Memberlist.
	eventChSize = 2048

	// shutdownFlushTimeout is the time limit to write pending events to the snapshot during a shutdown
	shutdownFlushTimeout = 250 * time.Millisecond

	// snapshotBytesPerNode is an estimated bytes per node to snapshot
	snapshotBytesPerNode = 128

	// snapshotCompactionThreshold is the threshold we apply to
	// the snapshot size estimate (nodes * bytes per node) before compacting.
	snapshotCompactionThreshold = 2
)

// Snapshotter is responsible for ingesting events and persisting
// them to disk, and providing a recovery mechanism at start time.
type Snapshotter struct {
	// aliveNodes maps a member name to its address string for every member
	// currently believed alive; it is rebuilt by replay and kept current by
	// processMemberEvent.
	aliveNodes              map[string]string
	// clock is the Lamport clock sampled by updateClock.
	clock                   *LamportClock
	// fh is the open handle to the snapshot file at path.
	fh                      *os.File
	// buffered is the buffered writer wrapping fh; appendLine writes through it.
	buffered                *bufio.Writer
	// inCh is the receive side of the event channel handed out by
	// NewSnapshotter; teeStream reads from it.
	inCh                    <-chan Event
	// streamCh carries events from teeStream to stream for persistence.
	streamCh                chan Event
	// lastFlush is when buffered was last force-flushed by appendLine (or
	// reset by compact).
	lastFlush               time.Time
	// lastClock, lastEventClock and lastQueryClock are the most recent clock
	// values restored by replay or written to the snapshot.
	lastClock               LamportTime
	lastEventClock          LamportTime
	lastQueryClock          LamportTime
	// leaveCh is used by Leave to notify the stream goroutine.
	leaveCh                 chan struct{}
	// leaving is set by stream once a leave is processed; events are no
	// longer recorded afterwards.
	leaving                 bool
	logger                  *log.Logger
	// minCompactSize is the lower bound applied by snapshotMaxSize to the
	// compaction threshold.
	minCompactSize          int64
	// path is the location of the snapshot file; the compaction temp file is
	// path + tmpExt.
	path                    string
	// offset tracks the number of bytes accounted to the snapshot file and is
	// compared against snapshotMaxSize to trigger compaction.
	offset                  int64
	// outCh, when non-nil, receives a copy of every event seen by teeStream.
	outCh                   chan<- Event
	// rejoinAfterLeave keeps the known nodes across a leave, and makes replay
	// ignore a recorded leave.
	rejoinAfterLeave        bool
	// shutdownCh signals teeStream and stream to drain and exit.
	shutdownCh              <-chan struct{}
	// waitCh is closed by stream after the snapshot file has been flushed
	// and closed during shutdown.
	waitCh                  chan struct{}
	// lastAttemptedCompaction is when tryAppend last attempted a recovery
	// compaction after a write error.
	lastAttemptedCompaction time.Time
}

// PreviousNode describes a node that was recorded as alive in the snapshot:
// its member name and the address it was last known at.
type PreviousNode struct {
	Name string
	Addr string
}

// String renders the node as "<name>: <addr>".
func (p PreviousNode) String() string {
	// All operands are strings, so Sprint concatenates them without
	// inserting any extra spaces.
	return fmt.Sprint(p.Name, ": ", p.Addr)
}

// NewSnapshotter opens (creating it if necessary) the snapshot file at path,
// replays its contents to recover the previously known state, and starts the
// background goroutines that persist new events.
//
// It returns the channel on which events should be delivered, the
// Snapshotter itself, and an error if the file could not be opened, stat'ed
// or replayed. Events sent on the returned channel are forwarded to outCh
// (when non-nil) and the relevant ones are recorded on disk. minCompactSize
// is the minimum size the file must reach before it is compacted. Setting
// rejoinAfterLeave makes a leave keep the recorded state, which is useful
// when the node intends to rejoin the same cluster after leaving.
func NewSnapshotter(path string,
	minCompactSize int,
	rejoinAfterLeave bool,
	logger *log.Logger,
	clock *LamportClock,
	outCh chan<- Event,
	shutdownCh <-chan struct{}) (chan<- Event, *Snapshotter, error) {
	// Open the snapshot in append mode, creating it on first use.
	file, err := os.OpenFile(path, os.O_RDWR|os.O_APPEND|os.O_CREATE, 0644)
	if err != nil {
		return nil, nil, fmt.Errorf("failed to open snapshot: %v", err)
	}

	// The current size of the file is where our writes will start.
	stat, err := file.Stat()
	if err != nil {
		file.Close()
		return nil, nil, fmt.Errorf("failed to stat snapshot: %v", err)
	}

	// events is returned to the caller as send-only and kept internally as
	// receive-only. The clock fields start at their zero value and are
	// filled in by replay.
	events := make(chan Event, eventChSize)
	snap := &Snapshotter{
		path:             path,
		fh:               file,
		buffered:         bufio.NewWriter(file),
		offset:           stat.Size(),
		minCompactSize:   int64(minCompactSize),
		rejoinAfterLeave: rejoinAfterLeave,
		aliveNodes:       make(map[string]string),
		clock:            clock,
		logger:           logger,
		inCh:             events,
		streamCh:         make(chan Event, eventChSize),
		outCh:            outCh,
		leaveCh:          make(chan struct{}),
		shutdownCh:       shutdownCh,
		waitCh:           make(chan struct{}),
	}

	// Rebuild the last known state from whatever the file already holds.
	if err := snap.replay(); err != nil {
		file.Close()
		return nil, nil, err
	}

	// Begin forwarding and persisting incoming events.
	go snap.teeStream()
	go snap.stream()
	return events, snap, nil
}

// LastClock returns the last known clock time: the value restored by replay,
// subsequently advanced by updateClock.
//
// NOTE(review): the field is read here without synchronization while the
// stream goroutine may update it via updateClock — confirm callers only use
// this right after construction or can tolerate that.
func (s *Snapshotter) LastClock() LamportTime {
	return s.lastClock
}

// LastEventClock returns the last known event clock time: the value restored
// by replay, subsequently advanced by processUserEvent.
//
// NOTE(review): the field is read here without synchronization while the
// stream goroutine may update it — confirm callers only use this right after
// construction or can tolerate that.
func (s *Snapshotter) LastEventClock() LamportTime {
	return s.lastEventClock
}

// LastQueryClock returns the last known query clock time: the value restored
// by replay, subsequently advanced by processQuery.
//
// NOTE(review): the field is read here without synchronization while the
// stream goroutine may update it — confirm callers only use this right after
// construction or can tolerate that.
func (s *Snapshotter) LastQueryClock() LamportTime {
	return s.lastQueryClock
}

// AliveNodes returns a freshly allocated, randomly ordered list of the nodes
// currently recorded as alive.
func (s *Snapshotter) AliveNodes() []*PreviousNode {
	// Snapshot the map into a slice the caller owns.
	nodes := make([]*PreviousNode, 0, len(s.aliveNodes))
	for name, addr := range s.aliveNodes {
		nodes = append(nodes, &PreviousNode{Name: name, Addr: addr})
	}

	// Shuffle so that every caller does not contact the same nodes first.
	for i := 0; i < len(nodes); i++ {
		pick := rand.Intn(i + 1)
		nodes[i], nodes[pick] = nodes[pick], nodes[i]
	}
	return nodes
}

// Wait is used to wait until the snapshotter finishes shut down. It blocks
// until waitCh is closed, which the stream goroutine does after it has
// flushed, synced and closed the snapshot file in response to shutdownCh.
func (s *Snapshotter) Wait() {
	<-s.waitCh
}

// Leave tells the stream goroutine that this node is leaving so that the
// known nodes are dropped (unless rejoinAfterLeave is set) and a leave marker
// is written. Without it, a restart would re-join the previously known nodes.
// It blocks until the signal is accepted or shutdown has been requested.
func (s *Snapshotter) Leave() {
	select {
	case <-s.shutdownCh:
		// Shutdown was requested; do not wait on the stream goroutine.
	case s.leaveCh <- struct{}{}:
	}
}

// teeStream is a long running routine that fans every incoming event out to
// both the internal persistence channel and, if configured, the output
// channel. Neither send blocks: an event is dropped for a destination whose
// buffer is full.
func (s *Snapshotter) teeStream() {
	// forward offers the event to each destination without blocking.
	forward := func(ev Event) {
		// Internal stream first.
		select {
		case s.streamCh <- ev:
		default:
		}

		// Then the consumer, when there is one.
		if s.outCh == nil {
			return
		}
		select {
		case s.outCh <- ev:
		default:
		}
	}

	// Pump events until a shutdown is requested.
	for running := true; running; {
		select {
		case <-s.shutdownCh:
			running = false
		case ev := <-s.inCh:
			forward(ev)
		}
	}

	// Forward whatever is still queued, then exit once the channel is empty.
	for {
		select {
		case ev := <-s.inCh:
			forward(ev)
		default:
			return
		}
	}
}

// stream is a long running routine that persists events. It reacts to four
// inputs: a leave notification, an event from teeStream, the periodic clock
// tick, and the shutdown signal. On shutdown it writes out what is pending,
// closes the snapshot file, closes waitCh and returns.
func (s *Snapshotter) stream() {
	ticker := time.NewTicker(clockUpdateInterval)
	defer ticker.Stop()

	// persist records a single event according to its concrete type.
	persist := func(ev Event) {
		// Nothing is recorded once a leave has been issued.
		if s.leaving {
			return
		}
		switch v := ev.(type) {
		case MemberEvent:
			s.processMemberEvent(v)
		case UserEvent:
			s.processUserEvent(v)
		case *Query:
			s.processQuery(v)
		default:
			s.logger.Printf("[ERR] serf: Unknown event to snapshot: %#v", ev)
		}
	}

	// onLeave marks the snapshotter as leaving and durably records the leave.
	onLeave := func() {
		s.leaving = true

		// Forget the known nodes unless we intend to come back.
		if !s.rejoinAfterLeave {
			s.aliveNodes = make(map[string]string)
		}
		s.tryAppend("leave\n")
		if err := s.buffered.Flush(); err != nil {
			s.logger.Printf("[ERR] serf: failed to flush leave to snapshot: %v", err)
		}
		if err := s.fh.Sync(); err != nil {
			s.logger.Printf("[ERR] serf: failed to sync leave to snapshot: %v", err)
		}
	}

	// onShutdown records the clock, drains queued events, and closes the file.
	onShutdown := func() {
		// Bound the time spent draining.
		deadline := time.After(shutdownFlushTimeout)

		// Record the current clock.
		s.updateClock()

		// Persist queued events until the queue is empty or time runs out.
		for draining := true; draining; {
			select {
			case ev := <-s.streamCh:
				persist(ev)
			case <-deadline:
				draining = false
			default:
				draining = false
			}
		}

		if err := s.buffered.Flush(); err != nil {
			s.logger.Printf("[ERR] serf: failed to flush snapshot: %v", err)
		}
		if err := s.fh.Sync(); err != nil {
			s.logger.Printf("[ERR] serf: failed to sync snapshot: %v", err)
		}
		s.fh.Close()
		close(s.waitCh)
	}

	for {
		select {
		case <-s.leaveCh:
			onLeave()

		case ev := <-s.streamCh:
			persist(ev)

		case <-ticker.C:
			s.updateClock()

		case <-s.shutdownCh:
			onShutdown()
			return
		}
	}
}

// processMemberEvent records a membership change. Joins add each member to
// the alive set and append an "alive" line; leaves and failures remove the
// member and append a "not-alive" line. Other event types change nothing.
// The clock is refreshed afterwards in every case.
func (s *Snapshotter) processMemberEvent(e MemberEvent) {
	switch e.Type {
	case EventMemberJoin:
		for _, m := range e.Members {
			tcp := net.TCPAddr{IP: m.Addr, Port: int(m.Port)}
			hostPort := tcp.String()
			s.aliveNodes[m.Name] = hostPort
			s.tryAppend(fmt.Sprintf("alive: %s %s\n", m.Name, hostPort))
		}

	case EventMemberLeave, EventMemberFailed:
		for _, m := range e.Members {
			delete(s.aliveNodes, m.Name)
			s.tryAppend(fmt.Sprintf("not-alive: %s\n", m.Name))
		}
	}
	s.updateClock()
}

// updateClock checks whether the clock value should be updated and, if so,
// records it. The candidate is one less than the clock's current time; it is
// stored and appended to the snapshot only when it exceeds the last recorded
// value. This runs after member events and also periodically, due to race
// conditions with join and leave intents.
func (s *Snapshotter) updateClock() {
	seen := s.clock.Time() - 1
	if seen <= s.lastClock {
		return
	}
	s.lastClock = seen
	s.tryAppend(fmt.Sprintf("clock: %d\n", seen))
}

// processUserEvent records the Lamport time of a user event when it is newer
// than the last recorded event clock; older or equal times are ignored.
func (s *Snapshotter) processUserEvent(e UserEvent) {
	if e.LTime > s.lastEventClock {
		s.lastEventClock = e.LTime
		s.tryAppend(fmt.Sprintf("event-clock: %d\n", e.LTime))
	}
}

// processQuery records the Lamport time of a query when it is newer than the
// last recorded query clock; older or equal times are ignored.
func (s *Snapshotter) processQuery(q *Query) {
	if q.LTime > s.lastQueryClock {
		s.lastQueryClock = q.LTime
		s.tryAppend(fmt.Sprintf("query-clock: %d\n", q.LTime))
	}
}

// tryAppend appends a line to the snapshot without surfacing errors to the
// caller. A failure is logged and, at most once per
// snapshotErrorRecoveryInterval, a compaction is attempted in the hope of
// recovering from the error.
func (s *Snapshotter) tryAppend(l string) {
	err := s.appendLine(l)
	if err == nil {
		return
	}
	s.logger.Printf("[ERR] serf: Failed to update snapshot: %v", err)

	// Rate-limit recovery attempts.
	attempt := time.Now()
	if attempt.Sub(s.lastAttemptedCompaction) <= snapshotErrorRecoveryInterval {
		return
	}
	s.lastAttemptedCompaction = attempt

	s.logger.Printf("[INFO] serf: Attempting compaction to recover from error...")
	if err := s.compact(); err != nil {
		s.logger.Printf("[ERR] serf: Compaction failed, will reattempt after %v: %v", snapshotErrorRecoveryInterval, err)
		return
	}
	s.logger.Printf("[INFO] serf: Finished compaction, successfully recovered from error state")
}

// appendLine writes a line to the buffered snapshot writer. The buffer is
// flushed when more than flushInterval has passed since the previous flush,
// and a compaction is run once the tracked offset exceeds snapshotMaxSize.
// It returns the first write, flush or compaction error encountered.
func (s *Snapshotter) appendLine(l string) error {
	defer metrics.MeasureSince([]string{"serf", "snapshot", "appendLine"}, time.Now())

	written, err := s.buffered.WriteString(l)
	if err != nil {
		return err
	}

	// Force the data out if the last flush was too long ago.
	if ts := time.Now(); ts.Sub(s.lastFlush) > flushInterval {
		s.lastFlush = ts
		if flushErr := s.buffered.Flush(); flushErr != nil {
			return flushErr
		}
	}

	// Account for the new bytes and compact if the file has grown too large.
	s.offset += int64(written)
	if s.offset <= s.snapshotMaxSize() {
		return nil
	}
	return s.compact()
}

// snapshotMaxSize returns the file size beyond which a compaction is forced:
// the estimated size of the alive nodes (count * snapshotBytesPerNode)
// multiplied by snapshotCompactionThreshold, but never less than
// minCompactSize so that small clusters do not compact constantly.
func (s *Snapshotter) snapshotMaxSize() int64 {
	limit := int64(len(s.aliveNodes)) * snapshotBytesPerNode * snapshotCompactionThreshold
	if limit >= s.minCompactSize {
		return limit
	}
	return s.minCompactSize
}

// compact is used to compact the snapshot once it is too large. It writes a
// minimal snapshot (one "alive" line per known node plus the three clock
// values) to a temporary file at path + tmpExt, fsyncs it, and then swaps it
// in place of the current snapshot and re-opens the write handles.
//
// An error is returned if any step fails. The Snapshotter's file handle and
// buffered writer are only replaced once the new file is open; after a failed
// swap they refer to the closed old file, so later writes fail with an
// ordinary I/O error (which lets tryAppend retry the compaction) rather than
// dereferencing a nil writer.
func (s *Snapshotter) compact() error {
	defer metrics.MeasureSince([]string{"serf", "snapshot", "compact"}, time.Now())

	// Try to open the new file. Use the same permissions NewSnapshotter uses
	// for the snapshot itself; a data file has no need to be executable.
	newPath := s.path + tmpExt
	fh, err := os.OpenFile(newPath, os.O_RDWR|os.O_TRUNC|os.O_CREATE, 0644)
	if err != nil {
		return fmt.Errorf("failed to open new snapshot: %v", err)
	}

	// Create a buffered writer and a helper that tracks how much we wrote,
	// which becomes the offset of the compacted file.
	buf := bufio.NewWriter(fh)
	var offset int64
	writeLine := func(line string) error {
		n, werr := buf.WriteString(line)
		offset += int64(n)
		return werr
	}

	// Write out the live nodes
	for name, addr := range s.aliveNodes {
		if err := writeLine(fmt.Sprintf("alive: %s %s\n", name, addr)); err != nil {
			fh.Close()
			return err
		}
	}

	// Write out the clocks
	clocks := []string{
		fmt.Sprintf("clock: %d\n", s.lastClock),
		fmt.Sprintf("event-clock: %d\n", s.lastEventClock),
		fmt.Sprintf("query-clock: %d\n", s.lastQueryClock),
	}
	for _, line := range clocks {
		if err := writeLine(line); err != nil {
			fh.Close()
			return err
		}
	}

	// Flush and fsync the new snapshot, making sure the handle is released
	// on every failure path.
	if err := buf.Flush(); err != nil {
		fh.Close()
		return fmt.Errorf("failed to flush new snapshot: %v", err)
	}
	if err := fh.Sync(); err != nil {
		fh.Close()
		return fmt.Errorf("failed to fsync new snapshot: %v", err)
	}
	if err := fh.Close(); err != nil {
		return fmt.Errorf("failed to close new snapshot: %v", err)
	}

	// We now need to swap the old snapshot file with the new snapshot.
	// Turns out, Windows won't let us rename the files if we have
	// open handles to them or if the destination already exists. This
	// means we are forced to close the existing handles, delete the
	// old file, move the new one in place, and then re-open the file
	// handles.

	// Flush and close the existing snapshot, ignoring errors since we will
	// delete it momentarily. The fields are deliberately NOT set to nil: if
	// a later step fails, subsequent appends and the shutdown flush must see
	// a closed-file error instead of panicking on a nil writer.
	s.buffered.Flush()
	s.fh.Close()

	// Delete the old file. A missing file is tolerated so that a retry after
	// an interrupted swap (old file removed, rename failed) can still
	// install the new snapshot.
	if err := os.Remove(s.path); err != nil && !os.IsNotExist(err) {
		return fmt.Errorf("failed to remove old snapshot: %v", err)
	}

	// Move the new file into place
	if err := os.Rename(newPath, s.path); err != nil {
		return fmt.Errorf("failed to install new snapshot: %v", err)
	}

	// Open the new snapshot
	reopened, err := os.OpenFile(s.path, os.O_RDWR|os.O_APPEND|os.O_CREATE, 0644)
	if err != nil {
		return fmt.Errorf("failed to open snapshot: %v", err)
	}

	// Rotate our handles
	s.fh = reopened
	s.buffered = bufio.NewWriter(reopened)
	s.offset = offset
	s.lastFlush = time.Now()
	return nil
}

// replay rebuilds the in-memory state by reading the snapshot file from the
// beginning, line by line, and applying each recognized entry. It is used at
// initialization time. Reading stops at the first read error (including end
// of file), so a trailing line without a newline is not applied. Afterwards
// the file position is moved back to the end so new entries are appended.
func (s *Snapshotter) replay() error {
	// Start from the top of the file.
	if _, err := s.fh.Seek(0, os.SEEK_SET); err != nil {
		return err
	}

	// parseClock decodes the number following prefix. On failure it logs
	// using failFmt and reports false so the caller leaves its value alone.
	parseClock := func(entry, prefix, failFmt string) (LamportTime, bool) {
		raw, err := strconv.ParseUint(strings.TrimPrefix(entry, prefix), 10, 64)
		if err != nil {
			s.logger.Printf(failFmt, err)
			return 0, false
		}
		return LamportTime(raw), true
	}

	rd := bufio.NewReader(s.fh)
	for {
		raw, err := rd.ReadString('\n')
		if err != nil {
			break
		}

		// Drop the trailing newline.
		entry := raw[:len(raw)-1]

		switch {
		case strings.HasPrefix(entry, "alive: "):
			// The address is everything after the last space, the name is
			// everything before it.
			rest := strings.TrimPrefix(entry, "alive: ")
			sep := strings.LastIndex(rest, " ")
			if sep == -1 {
				s.logger.Printf("[WARN] serf: Failed to parse address: %v", entry)
				continue
			}
			s.aliveNodes[rest[:sep]] = rest[sep+1:]

		case strings.HasPrefix(entry, "not-alive: "):
			delete(s.aliveNodes, strings.TrimPrefix(entry, "not-alive: "))

		case strings.HasPrefix(entry, "clock: "):
			if t, ok := parseClock(entry, "clock: ", "[WARN] serf: Failed to convert clock time: %v"); ok {
				s.lastClock = t
			}

		case strings.HasPrefix(entry, "event-clock: "):
			if t, ok := parseClock(entry, "event-clock: ", "[WARN] serf: Failed to convert event clock time: %v"); ok {
				s.lastEventClock = t
			}

		case strings.HasPrefix(entry, "query-clock: "):
			if t, ok := parseClock(entry, "query-clock: ", "[WARN] serf: Failed to convert query clock time: %v"); ok {
				s.lastQueryClock = t
			}

		case strings.HasPrefix(entry, "coordinate: "):
			// Ignores any coordinate persistence from old snapshots, serf should re-converge

		case entry == "leave":
			// A recorded leave is ignored when we plan on re-joining.
			if s.rejoinAfterLeave {
				s.logger.Printf("[INFO] serf: Ignoring previous leave in snapshot")
				continue
			}
			s.aliveNodes = make(map[string]string)
			s.lastClock = 0
			s.lastEventClock = 0
			s.lastQueryClock = 0

		case strings.HasPrefix(entry, "#"):
			// Comment lines are skipped.

		default:
			s.logger.Printf("[WARN] serf: Unrecognized snapshot line: %v", entry)
		}
	}

	// Position at the end of the file for subsequent appends.
	_, err := s.fh.Seek(0, os.SEEK_END)
	return err
}