1package debug
2
3import (
4	"archive/tar"
5	"compress/gzip"
6	"encoding/json"
7	"errors"
8	"flag"
9	"fmt"
10	"golang.org/x/sync/errgroup"
11	"io"
12	"io/ioutil"
13	"os"
14	"path/filepath"
15	"strings"
16	"time"
17
18	"github.com/hashicorp/go-multierror"
19	"github.com/mitchellh/cli"
20
21	"github.com/hashicorp/consul/api"
22	"github.com/hashicorp/consul/command/flags"
23)
24
25const (
26	// debugInterval is the interval in which to capture dynamic information
27	// when running debug
28	debugInterval = 30 * time.Second
29
30	// debugDuration is the total duration that debug runs before being
31	// shut down
32	debugDuration = 2 * time.Minute
33
34	// debugDurationGrace is a period of time added to the specified
35	// duration to allow intervals to capture within that time
36	debugDurationGrace = 2 * time.Second
37
38	// debugMinInterval is the minimum a user can configure the interval
39	// to prevent accidental DOS
40	debugMinInterval = 5 * time.Second
41
42	// debugMinDuration is the minimum a user can configure the duration
43	// to ensure that all information can be collected in time
44	debugMinDuration = 10 * time.Second
45
46	// debugArchiveExtension is the extension for archive files
47	debugArchiveExtension = ".tar.gz"
48
49	// debugProtocolVersion is the version of the package that is
50	// generated. If this format changes interface, this version
51	// can be incremented so clients can selectively support packages
52	debugProtocolVersion = 1
53)
54
55func New(ui cli.Ui, shutdownCh <-chan struct{}) *cmd {
56	ui = &cli.PrefixedUi{
57		OutputPrefix: "==> ",
58		InfoPrefix:   "    ",
59		ErrorPrefix:  "==> ",
60		Ui:           ui,
61	}
62
63	c := &cmd{UI: ui, shutdownCh: shutdownCh}
64	c.init()
65	return c
66}
67
68type cmd struct {
69	UI    cli.Ui
70	flags *flag.FlagSet
71	http  *flags.HTTPFlags
72	help  string
73
74	shutdownCh <-chan struct{}
75
76	// flags
77	interval time.Duration
78	duration time.Duration
79	output   string
80	archive  bool
81	capture  []string
82	client   *api.Client
83	// validateTiming can be used to skip validation of interval, duration. This
84	// is primarily useful for testing
85	validateTiming bool
86
87	index *debugIndex
88}
89
90// debugIndex is used to manage the summary of all data recorded
91// during the debug, to be written to json at the end of the run
92// and stored at the root. Each attribute corresponds to a file or files.
93type debugIndex struct {
94	// Version of the debug package
95	Version int
96	// Version of the target Consul agent
97	AgentVersion string
98
99	Interval string
100	Duration string
101
102	Targets []string
103}
104
105func (c *cmd) init() {
106	c.flags = flag.NewFlagSet("", flag.ContinueOnError)
107
108	defaultFilename := fmt.Sprintf("consul-debug-%d", time.Now().Unix())
109
110	c.flags.Var((*flags.AppendSliceValue)(&c.capture), "capture",
111		fmt.Sprintf("One or more types of information to capture. This can be used "+
112			"to capture a subset of information, and defaults to capturing "+
113			"everything available. Possible information for capture: %s. "+
114			"This can be repeated multiple times.", strings.Join(c.defaultTargets(), ", ")))
115	c.flags.DurationVar(&c.interval, "interval", debugInterval,
116		fmt.Sprintf("The interval in which to capture dynamic information such as "+
117			"telemetry, and profiling. Defaults to %s.", debugInterval))
118	c.flags.DurationVar(&c.duration, "duration", debugDuration,
119		fmt.Sprintf("The total time to record information. "+
120			"Defaults to %s.", debugDuration))
121	c.flags.BoolVar(&c.archive, "archive", true, "Boolean value for if the files "+
122		"should be archived and compressed. Setting this to false will skip the "+
123		"archive step and leave the directory of information on the current path.")
124	c.flags.StringVar(&c.output, "output", defaultFilename, "The path "+
125		"to the compressed archive that will be created with the "+
126		"information after collection.")
127
128	c.http = &flags.HTTPFlags{}
129	flags.Merge(c.flags, c.http.ClientFlags())
130	c.help = flags.Usage(help, c.flags)
131
132	c.validateTiming = true
133}
134
135func (c *cmd) Run(args []string) int {
136	if err := c.flags.Parse(args); err != nil {
137		c.UI.Error(fmt.Sprintf("Error parsing flags: %s", err))
138		return 1
139	}
140
141	if len(c.flags.Args()) > 0 {
142		c.UI.Error("debug: Too many arguments provided, expected 0")
143		return 1
144	}
145
146	// Connect to the agent
147	client, err := c.http.APIClient()
148	if err != nil {
149		c.UI.Error(fmt.Sprintf("Error connecting to Consul agent: %s", err))
150		return 1
151	}
152	c.client = client
153
154	version, err := c.prepare()
155	if err != nil {
156		c.UI.Error(fmt.Sprintf("Capture validation failed: %v", err))
157		return 1
158	}
159
160	archiveName := c.output
161	// Show the user the final file path if archiving
162	if c.archive {
163		archiveName = archiveName + debugArchiveExtension
164	}
165
166	c.UI.Output("Starting debugger and capturing static information...")
167
168	// Output metadata about target agent
169	c.UI.Info(fmt.Sprintf(" Agent Version: '%s'", version))
170	c.UI.Info(fmt.Sprintf("      Interval: '%s'", c.interval))
171	c.UI.Info(fmt.Sprintf("      Duration: '%s'", c.duration))
172	c.UI.Info(fmt.Sprintf("        Output: '%s'", archiveName))
173	c.UI.Info(fmt.Sprintf("       Capture: '%s'", strings.Join(c.capture, ", ")))
174
175	// Record some information for the index at the root of the archive
176	index := &debugIndex{
177		Version:      debugProtocolVersion,
178		AgentVersion: version,
179		Interval:     c.interval.String(),
180		Duration:     c.duration.String(),
181		Targets:      c.capture,
182	}
183
184	// Add the extra grace period to ensure
185	// all intervals will be captured within the time allotted
186	c.duration = c.duration + debugDurationGrace
187
188	// Capture static information from the target agent
189	err = c.captureStatic()
190	if err != nil {
191		c.UI.Warn(fmt.Sprintf("Static capture failed: %v", err))
192	}
193
194	// Capture dynamic information from the target agent, blocking for duration
195	if c.configuredTarget("metrics") || c.configuredTarget("logs") || c.configuredTarget("pprof") {
196		g := new(errgroup.Group)
197		g.Go(c.captureInterval)
198		g.Go(c.captureLongRunning)
199		err = g.Wait()
200		if err != nil {
201			c.UI.Error(fmt.Sprintf("Error encountered during collection: %v", err))
202		}
203	}
204
205	// Write the index document
206	idxMarshalled, err := json.MarshalIndent(index, "", "\t")
207	if err != nil {
208		c.UI.Error(fmt.Sprintf("Error marshalling index document: %v", err))
209		return 1
210	}
211
212	err = ioutil.WriteFile(fmt.Sprintf("%s/index.json", c.output), idxMarshalled, 0644)
213	if err != nil {
214		c.UI.Error(fmt.Sprintf("Error creating index document: %v", err))
215		return 1
216	}
217
218	// Archive the data if configured to
219	if c.archive {
220		err = c.createArchive()
221
222		if err != nil {
223			c.UI.Warn(fmt.Sprintf("Archive creation failed: %v", err))
224			return 1
225		}
226	}
227
228	c.UI.Info(fmt.Sprintf("Saved debug archive: %s", archiveName))
229
230	return 0
231}
232
233// prepare validates agent settings against targets and prepares the environment for capturing
234func (c *cmd) prepare() (version string, err error) {
235	// Ensure realistic duration and intervals exists
236	if c.validateTiming {
237		if c.duration < debugMinDuration {
238			return "", fmt.Errorf("duration must be longer than %s", debugMinDuration)
239		}
240
241		if c.interval < debugMinInterval {
242			return "", fmt.Errorf("interval must be longer than %s", debugMinDuration)
243		}
244
245		if c.duration < c.interval {
246			return "", fmt.Errorf("duration (%s) must be longer than interval (%s)", c.duration, c.interval)
247		}
248	}
249
250	// Retrieve and process agent information necessary to validate
251	self, err := c.client.Agent().Self()
252	if err != nil {
253		return "", fmt.Errorf("error querying target agent: %s. verify connectivity and agent address", err)
254	}
255
256	version, ok := self["Config"]["Version"].(string)
257	if !ok {
258		return "", fmt.Errorf("agent response did not contain version key")
259	}
260
261	// If none are specified we will collect information from
262	// all by default
263	if len(c.capture) == 0 {
264		c.capture = c.defaultTargets()
265	}
266
267	for _, t := range c.capture {
268		if !c.allowedTarget(t) {
269			return version, fmt.Errorf("target not found: %s", t)
270		}
271	}
272
273	if _, err := os.Stat(c.output); os.IsNotExist(err) {
274		err := os.MkdirAll(c.output, 0755)
275		if err != nil {
276			return version, fmt.Errorf("could not create output directory: %s", err)
277		}
278	} else {
279		return version, fmt.Errorf("output directory already exists: %s", c.output)
280	}
281
282	return version, nil
283}
284
285// captureStatic captures static target information and writes it
286// to the output path
287func (c *cmd) captureStatic() error {
288	// Collect errors via multierror as we want to gracefully
289	// fail if an API is inaccessible
290	var errs error
291
292	// Collect the named outputs here
293	outputs := make(map[string]interface{})
294
295	// Capture host information
296	if c.configuredTarget("host") {
297		host, err := c.client.Agent().Host()
298		if err != nil {
299			errs = multierror.Append(errs, err)
300		}
301		outputs["host"] = host
302	}
303
304	// Capture agent information
305	if c.configuredTarget("agent") {
306		agent, err := c.client.Agent().Self()
307		if err != nil {
308			errs = multierror.Append(errs, err)
309		}
310		outputs["agent"] = agent
311	}
312
313	// Capture cluster members information, including WAN
314	if c.configuredTarget("cluster") {
315		members, err := c.client.Agent().Members(true)
316		if err != nil {
317			errs = multierror.Append(errs, err)
318		}
319		outputs["cluster"] = members
320	}
321
322	// Write all outputs to disk as JSON
323	for output, v := range outputs {
324		marshaled, err := json.MarshalIndent(v, "", "\t")
325		if err != nil {
326			errs = multierror.Append(errs, err)
327		}
328
329		err = ioutil.WriteFile(fmt.Sprintf("%s/%s.json", c.output, output), marshaled, 0644)
330		if err != nil {
331			errs = multierror.Append(errs, err)
332		}
333	}
334
335	return errs
336}
337
338// captureInterval blocks for the duration of the command
339// specified by the duration flag, capturing the dynamic
340// targets at the interval specified
341func (c *cmd) captureInterval() error {
342	intervalChn := time.NewTicker(c.interval)
343	defer intervalChn.Stop()
344	durationChn := time.After(c.duration)
345	intervalCount := 0
346
347	c.UI.Output(fmt.Sprintf("Beginning capture interval %s (%d)", time.Now().Local().String(), intervalCount))
348
349	err := captureShortLived(c)
350	if err != nil {
351		return err
352	}
353	c.UI.Output(fmt.Sprintf("Capture successful %s (%d)", time.Now().Local().String(), intervalCount))
354	for {
355		select {
356		case t := <-intervalChn.C:
357			intervalCount++
358			err := captureShortLived(c)
359			if err != nil {
360				return err
361			}
362			c.UI.Output(fmt.Sprintf("Capture successful %s (%d)", t.Local().String(), intervalCount))
363		case <-durationChn:
364			intervalChn.Stop()
365			return nil
366		case <-c.shutdownCh:
367			return errors.New("stopping collection due to shutdown signal")
368		}
369	}
370}
371
372func captureShortLived(c *cmd) error {
373	g := new(errgroup.Group)
374	timestamp := time.Now().Local().Unix()
375
376	timestampDir, err := c.createTimestampDir(timestamp)
377	if err != nil {
378		return err
379	}
380	if c.configuredTarget("pprof") {
381		g.Go(func() error {
382			return c.captureHeap(timestampDir)
383		})
384
385		g.Go(func() error {
386			return c.captureGoRoutines(timestampDir)
387		})
388	}
389
390	// Capture metrics
391	if c.configuredTarget("metrics") {
392		g.Go(func() error {
393			return c.captureMetrics(timestampDir)
394		})
395	}
396
397	return g.Wait()
398}
399
400func (c *cmd) createTimestampDir(timestamp int64) (string, error) {
401	// Make the directory that will store all captured data
402	// for this interval
403	timestampDir := fmt.Sprintf("%s/%d", c.output, timestamp)
404	err := os.MkdirAll(timestampDir, 0755)
405	if err != nil {
406		return "", err
407	}
408	return timestampDir, nil
409}
410
411func (c *cmd) captureLongRunning() error {
412	timestamp := time.Now().Local().Unix()
413
414	timestampDir, err := c.createTimestampDir(timestamp)
415
416	if err != nil {
417		return err
418	}
419
420	g := new(errgroup.Group)
421	// Capture a profile/trace with a minimum of 1s
422	s := c.duration.Seconds()
423	if s < 1 {
424		s = 1
425	}
426	// Capture pprof
427	if c.configuredTarget("pprof") {
428		g.Go(func() error {
429			return c.captureProfile(s, timestampDir)
430		})
431
432		g.Go(func() error {
433			return c.captureTrace(s, timestampDir)
434		})
435	}
436	// Capture logs
437	if c.configuredTarget("logs") {
438		g.Go(func() error {
439			return c.captureLogs(timestampDir)
440		})
441	}
442
443	return g.Wait()
444}
445
446func (c *cmd) captureGoRoutines(timestampDir string) error {
447	gr, err := c.client.Debug().Goroutine()
448	if err != nil {
449		return fmt.Errorf("failed to collect goroutine profile: %w", err)
450	}
451
452	err = ioutil.WriteFile(fmt.Sprintf("%s/goroutine.prof", timestampDir), gr, 0644)
453	return err
454}
455
456func (c *cmd) captureTrace(s float64, timestampDir string) error {
457	trace, err := c.client.Debug().Trace(int(s))
458	if err != nil {
459		return fmt.Errorf("failed to collect trace: %w", err)
460	}
461
462	err = ioutil.WriteFile(fmt.Sprintf("%s/trace.out", timestampDir), trace, 0644)
463	return err
464}
465
466func (c *cmd) captureProfile(s float64, timestampDir string) error {
467	prof, err := c.client.Debug().Profile(int(s))
468	if err != nil {
469		return fmt.Errorf("failed to collect cpu profile: %w", err)
470	}
471
472	err = ioutil.WriteFile(fmt.Sprintf("%s/profile.prof", timestampDir), prof, 0644)
473	return err
474}
475
476func (c *cmd) captureHeap(timestampDir string) error {
477	heap, err := c.client.Debug().Heap()
478	if err != nil {
479		return fmt.Errorf("failed to collect heap profile: %w", err)
480	}
481
482	err = ioutil.WriteFile(fmt.Sprintf("%s/heap.prof", timestampDir), heap, 0644)
483	return err
484}
485
486func (c *cmd) captureLogs(timestampDir string) error {
487	endLogChn := make(chan struct{})
488	timeIsUp := time.After(c.duration)
489	logCh, err := c.client.Agent().Monitor("DEBUG", endLogChn, nil)
490	if err != nil {
491		return err
492	}
493	// Close the log stream
494	defer close(endLogChn)
495
496	// Create the log file for writing
497	f, err := os.Create(fmt.Sprintf("%s/%s", timestampDir, "consul.log"))
498	if err != nil {
499		return err
500	}
501	defer f.Close()
502
503	for {
504		select {
505		case log := <-logCh:
506			if log == "" {
507				return nil
508			}
509			if _, err = f.WriteString(log + "\n"); err != nil {
510				return err
511			}
512		case <-timeIsUp:
513			return nil
514		}
515	}
516}
517
518func (c *cmd) captureMetrics(timestampDir string) error {
519
520	metrics, err := c.client.Agent().Metrics()
521	if err != nil {
522		return err
523	}
524
525	marshaled, err := json.MarshalIndent(metrics, "", "\t")
526	if err != nil {
527		return err
528	}
529
530	err = ioutil.WriteFile(fmt.Sprintf("%s/%s.json", timestampDir, "metrics"), marshaled, 0644)
531	return err
532}
533
534// allowedTarget returns a boolean if the target is able to be captured
535func (c *cmd) allowedTarget(target string) bool {
536	for _, dt := range c.defaultTargets() {
537		if dt == target {
538			return true
539		}
540	}
541	return false
542}
543
544// configuredTarget returns a boolean if the target is configured to be
545// captured in the command
546func (c *cmd) configuredTarget(target string) bool {
547	for _, dt := range c.capture {
548		if dt == target {
549			return true
550		}
551	}
552	return false
553}
554
555// createArchive walks the files in the temporary directory
556// and creates a tar file that is gzipped with the contents
557func (c *cmd) createArchive() error {
558	path := c.output + debugArchiveExtension
559
560	tempName, err := c.createArchiveTemp(path)
561	if err != nil {
562		return err
563	}
564
565	if err := os.Rename(tempName, path); err != nil {
566		return err
567	}
568	// fsync the dir to make the rename stick
569	if err := syncParentDir(path); err != nil {
570		return err
571	}
572
573	// Remove directory that has been archived
574	if err := os.RemoveAll(c.output); err != nil {
575		return fmt.Errorf("failed to remove archived directory: %s", err)
576	}
577
578	return nil
579}
580
581func syncParentDir(name string) error {
582	f, err := os.Open(filepath.Dir(name))
583	if err != nil {
584		return err
585	}
586	defer f.Close()
587
588	return f.Sync()
589}
590
591func (c *cmd) createArchiveTemp(path string) (tempName string, err error) {
592	dir := filepath.Dir(path)
593	name := filepath.Base(path)
594
595	f, err := ioutil.TempFile(dir, name+".tmp")
596	if err != nil {
597		return "", fmt.Errorf("failed to create compressed temp archive: %s", err)
598	}
599
600	g := gzip.NewWriter(f)
601	t := tar.NewWriter(g)
602
603	tempName = f.Name()
604
605	cleanup := func(err error) (string, error) {
606		_ = t.Close()
607		_ = g.Close()
608		_ = f.Close()
609		_ = os.Remove(tempName)
610		return "", err
611	}
612
613	err = filepath.Walk(c.output, func(file string, fi os.FileInfo, err error) error {
614		if err != nil {
615			return fmt.Errorf("failed to walk filepath for archive: %s", err)
616		}
617
618		header, err := tar.FileInfoHeader(fi, fi.Name())
619		if err != nil {
620			return fmt.Errorf("failed to create compressed archive header: %s", err)
621		}
622
623		header.Name = filepath.Join(filepath.Base(c.output), strings.TrimPrefix(file, c.output))
624
625		if err := t.WriteHeader(header); err != nil {
626			return fmt.Errorf("failed to write compressed archive header: %s", err)
627		}
628
629		// Only copy files
630		if !fi.Mode().IsRegular() {
631			return nil
632		}
633
634		f, err := os.Open(file)
635		if err != nil {
636			return fmt.Errorf("failed to open target files for archive: %s", err)
637		}
638
639		if _, err := io.Copy(t, f); err != nil {
640			return fmt.Errorf("failed to copy files for archive: %s", err)
641		}
642
643		return f.Close()
644	})
645
646	if err != nil {
647		return cleanup(fmt.Errorf("failed to walk output path for archive: %s", err))
648	}
649
650	// Explicitly close things in the correct order (tar then gzip) so we
651	// know if they worked.
652	if err := t.Close(); err != nil {
653		return cleanup(err)
654	}
655	if err := g.Close(); err != nil {
656		return cleanup(err)
657	}
658
659	// Guarantee that the contents of the temp file are flushed to disk.
660	if err := f.Sync(); err != nil {
661		return cleanup(err)
662	}
663
664	// Close the temp file and go back to the wrapper function for the rest.
665	if err := f.Close(); err != nil {
666		return cleanup(err)
667	}
668
669	return tempName, nil
670}
671
672// defaultTargets specifies the list of all targets that
673// will be captured by default
674func (c *cmd) defaultTargets() []string {
675	return append(c.dynamicTargets(), c.staticTargets()...)
676}
677
678// dynamicTargets returns all the supported targets
679// that are retrieved at the interval specified
680func (c *cmd) dynamicTargets() []string {
681	return []string{"metrics", "logs", "pprof"}
682}
683
684// staticTargets returns all the supported targets
685// that are retrieved at the start of the command execution
686func (c *cmd) staticTargets() []string {
687	return []string{"host", "agent", "cluster"}
688}
689
690func (c *cmd) Synopsis() string {
691	return synopsis
692}
693
694func (c *cmd) Help() string {
695	return c.help
696}
697
698const synopsis = "Records a debugging archive for operators"
699const help = `
700Usage: consul debug [options]
701
702  Monitors a Consul agent for the specified period of time, recording
703  information about the agent, cluster, and environment to an archive
704  written to the specified path.
705
706  If ACLs are enabled, an 'operator:read' token must be supplied in order
707  to perform this operation.
708
709  To create a debug archive in the current directory for the default
710  duration and interval, capturing all information available:
711
712      $ consul debug
713
714  The command stores captured data at the configured output path
715  through the duration, and will archive the data at the same
716  path if interrupted.
717
718  Flags can be used to customize the duration and interval of the
719  operation. Duration is the total time to capture data for from the target
720  agent and interval controls how often dynamic data such as metrics
721  are scraped.
722
723      $ consul debug -interval=20s -duration=1m
724
725  The capture flag can be specified multiple times to limit information
726  retrieved.
727
728      $ consul debug -capture metrics -capture agent
729
730  By default, the archive containing the debugging information is
731  saved to the current directory as a .tar.gz file. The
732  output path can be specified, as well as an option to disable
733  archiving, leaving the directory intact.
734
735      $ consul debug -output=/foo/bar/my-debugging -archive=false
736
737  Note: Information collected by this command has the potential
738  to be highly sensitive. Sensitive material such as ACL tokens and
739  other commonly secret material are redacted automatically, but we
740  strongly recommend review of the data within the archive prior to
741  transmitting it.
742
743  For a full list of options and examples, please see the Consul
744  documentation.
745`
746