1package command
2
import (
	"flag"
	"fmt"
	"sort"
	"strings"
	"time"

	"github.com/hashicorp/serf/client"
	"github.com/hashicorp/serf/serf"
	"github.com/mitchellh/cli"
)
13
14const (
15	tooManyAcks        = `This could mean Serf is detecting false-failures due to a misconfiguration or network issue.`
16	tooFewAcks         = `This could mean Serf gossip packets are being lost due to a misconfiguration or network issue.`
17	duplicateResponses = `Duplicate responses means there is a misconfiguration. Verify that node names are unique.`
18	troubleshooting    = `
19Troubleshooting tips:
20* Ensure that the bind addr:port is accessible by all other nodes
21* If an advertise address is set, ensure it routes to the bind address
22* Check that no nodes are behind a NAT
23* If nodes are behind firewalls or iptables, check that Serf traffic is permitted (UDP and TCP)
24* Verify networking equipment is functional`
25)
26
27// ReachabilityCommand is a Command implementation that is used to trigger
28// a new reachability test
29type ReachabilityCommand struct {
30	ShutdownCh <-chan struct{}
31	Ui         cli.Ui
32}
33
34var _ cli.Command = &ReachabilityCommand{}
35
36func (c *ReachabilityCommand) Help() string {
37	helpText := `
38Usage: serf reachability [options]
39
40  Tests the network reachability of this node
41
42Options:
43
44  -rpc-addr=127.0.0.1:7373  RPC address of the Serf agent.
45  -rpc-auth=""              RPC auth token of the Serf agent.
46  -verbose                  Verbose mode
47`
48	return strings.TrimSpace(helpText)
49}
50
51func (c *ReachabilityCommand) Run(args []string) int {
52	var verbose bool
53	cmdFlags := flag.NewFlagSet("reachability", flag.ContinueOnError)
54	cmdFlags.Usage = func() { c.Ui.Output(c.Help()) }
55	cmdFlags.BoolVar(&verbose, "verbose", false, "verbose mode")
56	rpcAddr := RPCAddrFlag(cmdFlags)
57	rpcAuth := RPCAuthFlag(cmdFlags)
58	if err := cmdFlags.Parse(args); err != nil {
59		return 1
60	}
61
62	cl, err := RPCClient(*rpcAddr, *rpcAuth)
63	if err != nil {
64		c.Ui.Error(fmt.Sprintf("Error connecting to Serf agent: %s", err))
65		return 1
66	}
67	defer cl.Close()
68
69	ackCh := make(chan string, 128)
70
71	// Get the list of members
72	members, err := cl.Members()
73	if err != nil {
74		c.Ui.Error(fmt.Sprintf("Error getting members: %s", err))
75		return 1
76	}
77
78	// Get only the live members
79	liveMembers := make(map[string]struct{})
80	for _, m := range members {
81		if m.Status == "alive" {
82			liveMembers[m.Name] = struct{}{}
83		}
84	}
85	c.Ui.Output(fmt.Sprintf("Total members: %d, live members: %d", len(members), len(liveMembers)))
86
87	// Start the query
88	params := client.QueryParam{
89		RequestAck: true,
90		Name:       serf.InternalQueryPrefix + "ping",
91		AckCh:      ackCh,
92	}
93	if err := cl.Query(&params); err != nil {
94		c.Ui.Error(fmt.Sprintf("Error sending query: %s", err))
95		return 1
96	}
97	c.Ui.Output("Starting reachability test...")
98	start := time.Now()
99	last := time.Now()
100
101	// Track responses and acknowledgements
102	exit := 0
103	dups := false
104	numAcks := 0
105	acksFrom := make(map[string]struct{}, len(members))
106
107OUTER:
108	for {
109		select {
110		case a := <-ackCh:
111			if a == "" {
112				break OUTER
113			}
114			if verbose {
115				c.Ui.Output(fmt.Sprintf("\tAck from '%s'", a))
116			}
117			numAcks++
118			if _, ok := acksFrom[a]; ok {
119				dups = true
120				c.Ui.Output(fmt.Sprintf("Duplicate response from '%v'", a))
121			}
122			acksFrom[a] = struct{}{}
123			last = time.Now()
124
125		case <-c.ShutdownCh:
126			c.Ui.Error("Test interrupted")
127			return 1
128		}
129	}
130
131	if verbose {
132		total := float64(time.Now().Sub(start)) / float64(time.Second)
133		timeToLast := float64(last.Sub(start)) / float64(time.Second)
134		c.Ui.Output(fmt.Sprintf("Query time: %0.2f sec, time to last response: %0.2f sec", total, timeToLast))
135	}
136
137	// Print troubleshooting info for duplicate responses
138	if dups {
139		c.Ui.Output(duplicateResponses)
140		exit = 1
141	}
142
143	n := len(liveMembers)
144	if numAcks == n {
145		c.Ui.Output("Successfully contacted all live nodes")
146
147	} else if numAcks > n {
148		c.Ui.Output("Received more acks than live nodes! Acks from non-live nodes:")
149		for m := range acksFrom {
150			if _, ok := liveMembers[m]; !ok {
151				c.Ui.Output(fmt.Sprintf("\t%s", m))
152			}
153		}
154		c.Ui.Output(tooManyAcks)
155		c.Ui.Output(troubleshooting)
156		return 1
157
158	} else if numAcks < n {
159		c.Ui.Output("Received less acks than live nodes! Missing acks from:")
160		for m := range liveMembers {
161			if _, ok := acksFrom[m]; !ok {
162				c.Ui.Output(fmt.Sprintf("\t%s", m))
163			}
164		}
165		c.Ui.Output(tooFewAcks)
166		c.Ui.Output(troubleshooting)
167		return 1
168	}
169	return exit
170}
171
172func (c *ReachabilityCommand) Synopsis() string {
173	return "Test network reachability"
174}
175