1package debug 2 3import ( 4 "archive/tar" 5 "compress/gzip" 6 "encoding/json" 7 "errors" 8 "flag" 9 "fmt" 10 "golang.org/x/sync/errgroup" 11 "io" 12 "io/ioutil" 13 "os" 14 "path/filepath" 15 "strings" 16 "time" 17 18 "github.com/hashicorp/go-multierror" 19 "github.com/mitchellh/cli" 20 21 "github.com/hashicorp/consul/api" 22 "github.com/hashicorp/consul/command/flags" 23) 24 25const ( 26 // debugInterval is the interval in which to capture dynamic information 27 // when running debug 28 debugInterval = 30 * time.Second 29 30 // debugDuration is the total duration that debug runs before being 31 // shut down 32 debugDuration = 2 * time.Minute 33 34 // debugDurationGrace is a period of time added to the specified 35 // duration to allow intervals to capture within that time 36 debugDurationGrace = 2 * time.Second 37 38 // debugMinInterval is the minimum a user can configure the interval 39 // to prevent accidental DOS 40 debugMinInterval = 5 * time.Second 41 42 // debugMinDuration is the minimum a user can configure the duration 43 // to ensure that all information can be collected in time 44 debugMinDuration = 10 * time.Second 45 46 // debugArchiveExtension is the extension for archive files 47 debugArchiveExtension = ".tar.gz" 48 49 // debugProtocolVersion is the version of the package that is 50 // generated. If this format changes interface, this version 51 // can be incremented so clients can selectively support packages 52 debugProtocolVersion = 1 53) 54 55func New(ui cli.Ui, shutdownCh <-chan struct{}) *cmd { 56 ui = &cli.PrefixedUi{ 57 OutputPrefix: "==> ", 58 InfoPrefix: " ", 59 ErrorPrefix: "==> ", 60 Ui: ui, 61 } 62 63 c := &cmd{UI: ui, shutdownCh: shutdownCh} 64 c.init() 65 return c 66} 67 68type cmd struct { 69 UI cli.Ui 70 flags *flag.FlagSet 71 http *flags.HTTPFlags 72 help string 73 74 shutdownCh <-chan struct{} 75 76 // flags 77 interval time.Duration 78 duration time.Duration 79 output string 80 archive bool 81 capture []string 82 client *api.Client 83 // validateTiming can be used to skip validation of interval, duration. This 84 // is primarily useful for testing 85 validateTiming bool 86 87 index *debugIndex 88} 89 90// debugIndex is used to manage the summary of all data recorded 91// during the debug, to be written to json at the end of the run 92// and stored at the root. Each attribute corresponds to a file or files. 93type debugIndex struct { 94 // Version of the debug package 95 Version int 96 // Version of the target Consul agent 97 AgentVersion string 98 99 Interval string 100 Duration string 101 102 Targets []string 103} 104 105func (c *cmd) init() { 106 c.flags = flag.NewFlagSet("", flag.ContinueOnError) 107 108 defaultFilename := fmt.Sprintf("consul-debug-%d", time.Now().Unix()) 109 110 c.flags.Var((*flags.AppendSliceValue)(&c.capture), "capture", 111 fmt.Sprintf("One or more types of information to capture. This can be used "+ 112 "to capture a subset of information, and defaults to capturing "+ 113 "everything available. Possible information for capture: %s. "+ 114 "This can be repeated multiple times.", strings.Join(c.defaultTargets(), ", "))) 115 c.flags.DurationVar(&c.interval, "interval", debugInterval, 116 fmt.Sprintf("The interval in which to capture dynamic information such as "+ 117 "telemetry, and profiling. Defaults to %s.", debugInterval)) 118 c.flags.DurationVar(&c.duration, "duration", debugDuration, 119 fmt.Sprintf("The total time to record information. "+ 120 "Defaults to %s.", debugDuration)) 121 c.flags.BoolVar(&c.archive, "archive", true, "Boolean value for if the files "+ 122 "should be archived and compressed. Setting this to false will skip the "+ 123 "archive step and leave the directory of information on the current path.") 124 c.flags.StringVar(&c.output, "output", defaultFilename, "The path "+ 125 "to the compressed archive that will be created with the "+ 126 "information after collection.") 127 128 c.http = &flags.HTTPFlags{} 129 flags.Merge(c.flags, c.http.ClientFlags()) 130 c.help = flags.Usage(help, c.flags) 131 132 c.validateTiming = true 133} 134 135func (c *cmd) Run(args []string) int { 136 if err := c.flags.Parse(args); err != nil { 137 c.UI.Error(fmt.Sprintf("Error parsing flags: %s", err)) 138 return 1 139 } 140 141 if len(c.flags.Args()) > 0 { 142 c.UI.Error("debug: Too many arguments provided, expected 0") 143 return 1 144 } 145 146 // Connect to the agent 147 client, err := c.http.APIClient() 148 if err != nil { 149 c.UI.Error(fmt.Sprintf("Error connecting to Consul agent: %s", err)) 150 return 1 151 } 152 c.client = client 153 154 version, err := c.prepare() 155 if err != nil { 156 c.UI.Error(fmt.Sprintf("Capture validation failed: %v", err)) 157 return 1 158 } 159 160 archiveName := c.output 161 // Show the user the final file path if archiving 162 if c.archive { 163 archiveName = archiveName + debugArchiveExtension 164 } 165 166 c.UI.Output("Starting debugger and capturing static information...") 167 168 // Output metadata about target agent 169 c.UI.Info(fmt.Sprintf(" Agent Version: '%s'", version)) 170 c.UI.Info(fmt.Sprintf(" Interval: '%s'", c.interval)) 171 c.UI.Info(fmt.Sprintf(" Duration: '%s'", c.duration)) 172 c.UI.Info(fmt.Sprintf(" Output: '%s'", archiveName)) 173 c.UI.Info(fmt.Sprintf(" Capture: '%s'", strings.Join(c.capture, ", "))) 174 175 // Record some information for the index at the root of the archive 176 index := &debugIndex{ 177 Version: debugProtocolVersion, 178 AgentVersion: version, 179 Interval: c.interval.String(), 180 Duration: c.duration.String(), 181 Targets: c.capture, 182 } 183 184 // Add the extra grace period to ensure 185 // all intervals will be captured within the time allotted 186 c.duration = c.duration + debugDurationGrace 187 188 // Capture static information from the target agent 189 err = c.captureStatic() 190 if err != nil { 191 c.UI.Warn(fmt.Sprintf("Static capture failed: %v", err)) 192 } 193 194 // Capture dynamic information from the target agent, blocking for duration 195 if c.configuredTarget("metrics") || c.configuredTarget("logs") || c.configuredTarget("pprof") { 196 g := new(errgroup.Group) 197 g.Go(c.captureInterval) 198 g.Go(c.captureLongRunning) 199 err = g.Wait() 200 if err != nil { 201 c.UI.Error(fmt.Sprintf("Error encountered during collection: %v", err)) 202 } 203 } 204 205 // Write the index document 206 idxMarshalled, err := json.MarshalIndent(index, "", "\t") 207 if err != nil { 208 c.UI.Error(fmt.Sprintf("Error marshalling index document: %v", err)) 209 return 1 210 } 211 212 err = ioutil.WriteFile(fmt.Sprintf("%s/index.json", c.output), idxMarshalled, 0644) 213 if err != nil { 214 c.UI.Error(fmt.Sprintf("Error creating index document: %v", err)) 215 return 1 216 } 217 218 // Archive the data if configured to 219 if c.archive { 220 err = c.createArchive() 221 222 if err != nil { 223 c.UI.Warn(fmt.Sprintf("Archive creation failed: %v", err)) 224 return 1 225 } 226 } 227 228 c.UI.Info(fmt.Sprintf("Saved debug archive: %s", archiveName)) 229 230 return 0 231} 232 233// prepare validates agent settings against targets and prepares the environment for capturing 234func (c *cmd) prepare() (version string, err error) { 235 // Ensure realistic duration and intervals exists 236 if c.validateTiming { 237 if c.duration < debugMinDuration { 238 return "", fmt.Errorf("duration must be longer than %s", debugMinDuration) 239 } 240 241 if c.interval < debugMinInterval { 242 return "", fmt.Errorf("interval must be longer than %s", debugMinDuration) 243 } 244 245 if c.duration < c.interval { 246 return "", fmt.Errorf("duration (%s) must be longer than interval (%s)", c.duration, c.interval) 247 } 248 } 249 250 // Retrieve and process agent information necessary to validate 251 self, err := c.client.Agent().Self() 252 if err != nil { 253 return "", fmt.Errorf("error querying target agent: %s. verify connectivity and agent address", err) 254 } 255 256 version, ok := self["Config"]["Version"].(string) 257 if !ok { 258 return "", fmt.Errorf("agent response did not contain version key") 259 } 260 261 // If none are specified we will collect information from 262 // all by default 263 if len(c.capture) == 0 { 264 c.capture = c.defaultTargets() 265 } 266 267 for _, t := range c.capture { 268 if !c.allowedTarget(t) { 269 return version, fmt.Errorf("target not found: %s", t) 270 } 271 } 272 273 if _, err := os.Stat(c.output); os.IsNotExist(err) { 274 err := os.MkdirAll(c.output, 0755) 275 if err != nil { 276 return version, fmt.Errorf("could not create output directory: %s", err) 277 } 278 } else { 279 return version, fmt.Errorf("output directory already exists: %s", c.output) 280 } 281 282 return version, nil 283} 284 285// captureStatic captures static target information and writes it 286// to the output path 287func (c *cmd) captureStatic() error { 288 // Collect errors via multierror as we want to gracefully 289 // fail if an API is inaccessible 290 var errs error 291 292 // Collect the named outputs here 293 outputs := make(map[string]interface{}) 294 295 // Capture host information 296 if c.configuredTarget("host") { 297 host, err := c.client.Agent().Host() 298 if err != nil { 299 errs = multierror.Append(errs, err) 300 } 301 outputs["host"] = host 302 } 303 304 // Capture agent information 305 if c.configuredTarget("agent") { 306 agent, err := c.client.Agent().Self() 307 if err != nil { 308 errs = multierror.Append(errs, err) 309 } 310 outputs["agent"] = agent 311 } 312 313 // Capture cluster members information, including WAN 314 if c.configuredTarget("cluster") { 315 members, err := c.client.Agent().Members(true) 316 if err != nil { 317 errs = multierror.Append(errs, err) 318 } 319 outputs["cluster"] = members 320 } 321 322 // Write all outputs to disk as JSON 323 for output, v := range outputs { 324 marshaled, err := json.MarshalIndent(v, "", "\t") 325 if err != nil { 326 errs = multierror.Append(errs, err) 327 } 328 329 err = ioutil.WriteFile(fmt.Sprintf("%s/%s.json", c.output, output), marshaled, 0644) 330 if err != nil { 331 errs = multierror.Append(errs, err) 332 } 333 } 334 335 return errs 336} 337 338// captureInterval blocks for the duration of the command 339// specified by the duration flag, capturing the dynamic 340// targets at the interval specified 341func (c *cmd) captureInterval() error { 342 intervalChn := time.NewTicker(c.interval) 343 defer intervalChn.Stop() 344 durationChn := time.After(c.duration) 345 intervalCount := 0 346 347 c.UI.Output(fmt.Sprintf("Beginning capture interval %s (%d)", time.Now().Local().String(), intervalCount)) 348 349 err := captureShortLived(c) 350 if err != nil { 351 return err 352 } 353 c.UI.Output(fmt.Sprintf("Capture successful %s (%d)", time.Now().Local().String(), intervalCount)) 354 for { 355 select { 356 case t := <-intervalChn.C: 357 intervalCount++ 358 err := captureShortLived(c) 359 if err != nil { 360 return err 361 } 362 c.UI.Output(fmt.Sprintf("Capture successful %s (%d)", t.Local().String(), intervalCount)) 363 case <-durationChn: 364 intervalChn.Stop() 365 return nil 366 case <-c.shutdownCh: 367 return errors.New("stopping collection due to shutdown signal") 368 } 369 } 370} 371 372func captureShortLived(c *cmd) error { 373 g := new(errgroup.Group) 374 timestamp := time.Now().Local().Unix() 375 376 timestampDir, err := c.createTimestampDir(timestamp) 377 if err != nil { 378 return err 379 } 380 if c.configuredTarget("pprof") { 381 g.Go(func() error { 382 return c.captureHeap(timestampDir) 383 }) 384 385 g.Go(func() error { 386 return c.captureGoRoutines(timestampDir) 387 }) 388 } 389 390 // Capture metrics 391 if c.configuredTarget("metrics") { 392 g.Go(func() error { 393 return c.captureMetrics(timestampDir) 394 }) 395 } 396 397 return g.Wait() 398} 399 400func (c *cmd) createTimestampDir(timestamp int64) (string, error) { 401 // Make the directory that will store all captured data 402 // for this interval 403 timestampDir := fmt.Sprintf("%s/%d", c.output, timestamp) 404 err := os.MkdirAll(timestampDir, 0755) 405 if err != nil { 406 return "", err 407 } 408 return timestampDir, nil 409} 410 411func (c *cmd) captureLongRunning() error { 412 timestamp := time.Now().Local().Unix() 413 414 timestampDir, err := c.createTimestampDir(timestamp) 415 416 if err != nil { 417 return err 418 } 419 420 g := new(errgroup.Group) 421 // Capture a profile/trace with a minimum of 1s 422 s := c.duration.Seconds() 423 if s < 1 { 424 s = 1 425 } 426 // Capture pprof 427 if c.configuredTarget("pprof") { 428 g.Go(func() error { 429 return c.captureProfile(s, timestampDir) 430 }) 431 432 g.Go(func() error { 433 return c.captureTrace(s, timestampDir) 434 }) 435 } 436 // Capture logs 437 if c.configuredTarget("logs") { 438 g.Go(func() error { 439 return c.captureLogs(timestampDir) 440 }) 441 } 442 443 return g.Wait() 444} 445 446func (c *cmd) captureGoRoutines(timestampDir string) error { 447 gr, err := c.client.Debug().Goroutine() 448 if err != nil { 449 return fmt.Errorf("failed to collect goroutine profile: %w", err) 450 } 451 452 err = ioutil.WriteFile(fmt.Sprintf("%s/goroutine.prof", timestampDir), gr, 0644) 453 return err 454} 455 456func (c *cmd) captureTrace(s float64, timestampDir string) error { 457 trace, err := c.client.Debug().Trace(int(s)) 458 if err != nil { 459 return fmt.Errorf("failed to collect trace: %w", err) 460 } 461 462 err = ioutil.WriteFile(fmt.Sprintf("%s/trace.out", timestampDir), trace, 0644) 463 return err 464} 465 466func (c *cmd) captureProfile(s float64, timestampDir string) error { 467 prof, err := c.client.Debug().Profile(int(s)) 468 if err != nil { 469 return fmt.Errorf("failed to collect cpu profile: %w", err) 470 } 471 472 err = ioutil.WriteFile(fmt.Sprintf("%s/profile.prof", timestampDir), prof, 0644) 473 return err 474} 475 476func (c *cmd) captureHeap(timestampDir string) error { 477 heap, err := c.client.Debug().Heap() 478 if err != nil { 479 return fmt.Errorf("failed to collect heap profile: %w", err) 480 } 481 482 err = ioutil.WriteFile(fmt.Sprintf("%s/heap.prof", timestampDir), heap, 0644) 483 return err 484} 485 486func (c *cmd) captureLogs(timestampDir string) error { 487 endLogChn := make(chan struct{}) 488 timeIsUp := time.After(c.duration) 489 logCh, err := c.client.Agent().Monitor("DEBUG", endLogChn, nil) 490 if err != nil { 491 return err 492 } 493 // Close the log stream 494 defer close(endLogChn) 495 496 // Create the log file for writing 497 f, err := os.Create(fmt.Sprintf("%s/%s", timestampDir, "consul.log")) 498 if err != nil { 499 return err 500 } 501 defer f.Close() 502 503 for { 504 select { 505 case log := <-logCh: 506 if log == "" { 507 return nil 508 } 509 if _, err = f.WriteString(log + "\n"); err != nil { 510 return err 511 } 512 case <-timeIsUp: 513 return nil 514 } 515 } 516} 517 518func (c *cmd) captureMetrics(timestampDir string) error { 519 520 metrics, err := c.client.Agent().Metrics() 521 if err != nil { 522 return err 523 } 524 525 marshaled, err := json.MarshalIndent(metrics, "", "\t") 526 if err != nil { 527 return err 528 } 529 530 err = ioutil.WriteFile(fmt.Sprintf("%s/%s.json", timestampDir, "metrics"), marshaled, 0644) 531 return err 532} 533 534// allowedTarget returns a boolean if the target is able to be captured 535func (c *cmd) allowedTarget(target string) bool { 536 for _, dt := range c.defaultTargets() { 537 if dt == target { 538 return true 539 } 540 } 541 return false 542} 543 544// configuredTarget returns a boolean if the target is configured to be 545// captured in the command 546func (c *cmd) configuredTarget(target string) bool { 547 for _, dt := range c.capture { 548 if dt == target { 549 return true 550 } 551 } 552 return false 553} 554 555// createArchive walks the files in the temporary directory 556// and creates a tar file that is gzipped with the contents 557func (c *cmd) createArchive() error { 558 path := c.output + debugArchiveExtension 559 560 tempName, err := c.createArchiveTemp(path) 561 if err != nil { 562 return err 563 } 564 565 if err := os.Rename(tempName, path); err != nil { 566 return err 567 } 568 // fsync the dir to make the rename stick 569 if err := syncParentDir(path); err != nil { 570 return err 571 } 572 573 // Remove directory that has been archived 574 if err := os.RemoveAll(c.output); err != nil { 575 return fmt.Errorf("failed to remove archived directory: %s", err) 576 } 577 578 return nil 579} 580 581func syncParentDir(name string) error { 582 f, err := os.Open(filepath.Dir(name)) 583 if err != nil { 584 return err 585 } 586 defer f.Close() 587 588 return f.Sync() 589} 590 591func (c *cmd) createArchiveTemp(path string) (tempName string, err error) { 592 dir := filepath.Dir(path) 593 name := filepath.Base(path) 594 595 f, err := ioutil.TempFile(dir, name+".tmp") 596 if err != nil { 597 return "", fmt.Errorf("failed to create compressed temp archive: %s", err) 598 } 599 600 g := gzip.NewWriter(f) 601 t := tar.NewWriter(g) 602 603 tempName = f.Name() 604 605 cleanup := func(err error) (string, error) { 606 _ = t.Close() 607 _ = g.Close() 608 _ = f.Close() 609 _ = os.Remove(tempName) 610 return "", err 611 } 612 613 err = filepath.Walk(c.output, func(file string, fi os.FileInfo, err error) error { 614 if err != nil { 615 return fmt.Errorf("failed to walk filepath for archive: %s", err) 616 } 617 618 header, err := tar.FileInfoHeader(fi, fi.Name()) 619 if err != nil { 620 return fmt.Errorf("failed to create compressed archive header: %s", err) 621 } 622 623 header.Name = filepath.Join(filepath.Base(c.output), strings.TrimPrefix(file, c.output)) 624 625 if err := t.WriteHeader(header); err != nil { 626 return fmt.Errorf("failed to write compressed archive header: %s", err) 627 } 628 629 // Only copy files 630 if !fi.Mode().IsRegular() { 631 return nil 632 } 633 634 f, err := os.Open(file) 635 if err != nil { 636 return fmt.Errorf("failed to open target files for archive: %s", err) 637 } 638 639 if _, err := io.Copy(t, f); err != nil { 640 return fmt.Errorf("failed to copy files for archive: %s", err) 641 } 642 643 return f.Close() 644 }) 645 646 if err != nil { 647 return cleanup(fmt.Errorf("failed to walk output path for archive: %s", err)) 648 } 649 650 // Explicitly close things in the correct order (tar then gzip) so we 651 // know if they worked. 652 if err := t.Close(); err != nil { 653 return cleanup(err) 654 } 655 if err := g.Close(); err != nil { 656 return cleanup(err) 657 } 658 659 // Guarantee that the contents of the temp file are flushed to disk. 660 if err := f.Sync(); err != nil { 661 return cleanup(err) 662 } 663 664 // Close the temp file and go back to the wrapper function for the rest. 665 if err := f.Close(); err != nil { 666 return cleanup(err) 667 } 668 669 return tempName, nil 670} 671 672// defaultTargets specifies the list of all targets that 673// will be captured by default 674func (c *cmd) defaultTargets() []string { 675 return append(c.dynamicTargets(), c.staticTargets()...) 676} 677 678// dynamicTargets returns all the supported targets 679// that are retrieved at the interval specified 680func (c *cmd) dynamicTargets() []string { 681 return []string{"metrics", "logs", "pprof"} 682} 683 684// staticTargets returns all the supported targets 685// that are retrieved at the start of the command execution 686func (c *cmd) staticTargets() []string { 687 return []string{"host", "agent", "cluster"} 688} 689 690func (c *cmd) Synopsis() string { 691 return synopsis 692} 693 694func (c *cmd) Help() string { 695 return c.help 696} 697 698const synopsis = "Records a debugging archive for operators" 699const help = ` 700Usage: consul debug [options] 701 702 Monitors a Consul agent for the specified period of time, recording 703 information about the agent, cluster, and environment to an archive 704 written to the specified path. 705 706 If ACLs are enabled, an 'operator:read' token must be supplied in order 707 to perform this operation. 708 709 To create a debug archive in the current directory for the default 710 duration and interval, capturing all information available: 711 712 $ consul debug 713 714 The command stores captured data at the configured output path 715 through the duration, and will archive the data at the same 716 path if interrupted. 717 718 Flags can be used to customize the duration and interval of the 719 operation. Duration is the total time to capture data for from the target 720 agent and interval controls how often dynamic data such as metrics 721 are scraped. 722 723 $ consul debug -interval=20s -duration=1m 724 725 The capture flag can be specified multiple times to limit information 726 retrieved. 727 728 $ consul debug -capture metrics -capture agent 729 730 By default, the archive containing the debugging information is 731 saved to the current directory as a .tar.gz file. The 732 output path can be specified, as well as an option to disable 733 archiving, leaving the directory intact. 734 735 $ consul debug -output=/foo/bar/my-debugging -archive=false 736 737 Note: Information collected by this command has the potential 738 to be highly sensitive. Sensitive material such as ACL tokens and 739 other commonly secret material are redacted automatically, but we 740 strongly recommend review of the data within the archive prior to 741 transmitting it. 742 743 For a full list of options and examples, please see the Consul 744 documentation. 745` 746