1package command 2 3import ( 4 "flag" 5 "fmt" 6 "strings" 7 "time" 8 9 "github.com/hashicorp/serf/client" 10 "github.com/hashicorp/serf/serf" 11 "github.com/mitchellh/cli" 12) 13 14const ( 15 tooManyAcks = `This could mean Serf is detecting false-failures due to a misconfiguration or network issue.` 16 tooFewAcks = `This could mean Serf gossip packets are being lost due to a misconfiguration or network issue.` 17 duplicateResponses = `Duplicate responses means there is a misconfiguration. Verify that node names are unique.` 18 troubleshooting = ` 19Troubleshooting tips: 20* Ensure that the bind addr:port is accessible by all other nodes 21* If an advertise address is set, ensure it routes to the bind address 22* Check that no nodes are behind a NAT 23* If nodes are behind firewalls or iptables, check that Serf traffic is permitted (UDP and TCP) 24* Verify networking equipment is functional` 25) 26 27// ReachabilityCommand is a Command implementation that is used to trigger 28// a new reachability test 29type ReachabilityCommand struct { 30 ShutdownCh <-chan struct{} 31 Ui cli.Ui 32} 33 34var _ cli.Command = &ReachabilityCommand{} 35 36func (c *ReachabilityCommand) Help() string { 37 helpText := ` 38Usage: serf reachability [options] 39 40 Tests the network reachability of this node 41 42Options: 43 44 -rpc-addr=127.0.0.1:7373 RPC address of the Serf agent. 45 -rpc-auth="" RPC auth token of the Serf agent. 46 -verbose Verbose mode 47` 48 return strings.TrimSpace(helpText) 49} 50 51func (c *ReachabilityCommand) Run(args []string) int { 52 var verbose bool 53 cmdFlags := flag.NewFlagSet("reachability", flag.ContinueOnError) 54 cmdFlags.Usage = func() { c.Ui.Output(c.Help()) } 55 cmdFlags.BoolVar(&verbose, "verbose", false, "verbose mode") 56 rpcAddr := RPCAddrFlag(cmdFlags) 57 rpcAuth := RPCAuthFlag(cmdFlags) 58 if err := cmdFlags.Parse(args); err != nil { 59 return 1 60 } 61 62 cl, err := RPCClient(*rpcAddr, *rpcAuth) 63 if err != nil { 64 c.Ui.Error(fmt.Sprintf("Error connecting to Serf agent: %s", err)) 65 return 1 66 } 67 defer cl.Close() 68 69 ackCh := make(chan string, 128) 70 71 // Get the list of members 72 members, err := cl.Members() 73 if err != nil { 74 c.Ui.Error(fmt.Sprintf("Error getting members: %s", err)) 75 return 1 76 } 77 78 // Get only the live members 79 liveMembers := make(map[string]struct{}) 80 for _, m := range members { 81 if m.Status == "alive" { 82 liveMembers[m.Name] = struct{}{} 83 } 84 } 85 c.Ui.Output(fmt.Sprintf("Total members: %d, live members: %d", len(members), len(liveMembers))) 86 87 // Start the query 88 params := client.QueryParam{ 89 RequestAck: true, 90 Name: serf.InternalQueryPrefix + "ping", 91 AckCh: ackCh, 92 } 93 if err := cl.Query(¶ms); err != nil { 94 c.Ui.Error(fmt.Sprintf("Error sending query: %s", err)) 95 return 1 96 } 97 c.Ui.Output("Starting reachability test...") 98 start := time.Now() 99 last := time.Now() 100 101 // Track responses and acknowledgements 102 exit := 0 103 dups := false 104 numAcks := 0 105 acksFrom := make(map[string]struct{}, len(members)) 106 107OUTER: 108 for { 109 select { 110 case a := <-ackCh: 111 if a == "" { 112 break OUTER 113 } 114 if verbose { 115 c.Ui.Output(fmt.Sprintf("\tAck from '%s'", a)) 116 } 117 numAcks++ 118 if _, ok := acksFrom[a]; ok { 119 dups = true 120 c.Ui.Output(fmt.Sprintf("Duplicate response from '%v'", a)) 121 } 122 acksFrom[a] = struct{}{} 123 last = time.Now() 124 125 case <-c.ShutdownCh: 126 c.Ui.Error("Test interrupted") 127 return 1 128 } 129 } 130 131 if verbose { 132 total := float64(time.Now().Sub(start)) / float64(time.Second) 133 timeToLast := float64(last.Sub(start)) / float64(time.Second) 134 c.Ui.Output(fmt.Sprintf("Query time: %0.2f sec, time to last response: %0.2f sec", total, timeToLast)) 135 } 136 137 // Print troubleshooting info for duplicate responses 138 if dups { 139 c.Ui.Output(duplicateResponses) 140 exit = 1 141 } 142 143 n := len(liveMembers) 144 if numAcks == n { 145 c.Ui.Output("Successfully contacted all live nodes") 146 147 } else if numAcks > n { 148 c.Ui.Output("Received more acks than live nodes! Acks from non-live nodes:") 149 for m := range acksFrom { 150 if _, ok := liveMembers[m]; !ok { 151 c.Ui.Output(fmt.Sprintf("\t%s", m)) 152 } 153 } 154 c.Ui.Output(tooManyAcks) 155 c.Ui.Output(troubleshooting) 156 return 1 157 158 } else if numAcks < n { 159 c.Ui.Output("Received less acks than live nodes! Missing acks from:") 160 for m := range liveMembers { 161 if _, ok := acksFrom[m]; !ok { 162 c.Ui.Output(fmt.Sprintf("\t%s", m)) 163 } 164 } 165 c.Ui.Output(tooFewAcks) 166 c.Ui.Output(troubleshooting) 167 return 1 168 } 169 return exit 170} 171 172func (c *ReachabilityCommand) Synopsis() string { 173 return "Test network reachability" 174} 175