1// Copyright 2017 Google LLC 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15// Package profiler is a client for the Cloud Profiler service. 16// 17// Usage example: 18// 19// import "cloud.google.com/go/profiler" 20// ... 21// if err := profiler.Start(profiler.Config{Service: "my-service"}); err != nil { 22// // TODO: Handle error. 23// } 24// 25// Calling Start will start a goroutine to collect profiles and upload to 26// the profiler server, at the rhythm specified by the server. 27// 28// The caller must provide the service string in the config, and may provide 29// other information as well. See Config for details. 30// 31// Profiler has CPU, heap and goroutine profiling enabled by default. Mutex 32// profiling can be enabled in the config. Note that goroutine and mutex 33// profiles are shown as "threads" and "contention" profiles in the profiler 34// UI. 35package profiler 36 37import ( 38 "bytes" 39 "context" 40 "errors" 41 "fmt" 42 "log" 43 "os" 44 "regexp" 45 "runtime" 46 "runtime/pprof" 47 "sync" 48 "time" 49 50 gcemd "cloud.google.com/go/compute/metadata" 51 "cloud.google.com/go/internal/version" 52 "github.com/golang/protobuf/proto" 53 "github.com/golang/protobuf/ptypes" 54 "github.com/google/pprof/profile" 55 gax "github.com/googleapis/gax-go/v2" 56 "google.golang.org/api/option" 57 gtransport "google.golang.org/api/transport/grpc" 58 pb "google.golang.org/genproto/googleapis/devtools/cloudprofiler/v2" 59 edpb "google.golang.org/genproto/googleapis/rpc/errdetails" 60 "google.golang.org/grpc" 61 "google.golang.org/grpc/codes" 62 grpcmd "google.golang.org/grpc/metadata" 63 "google.golang.org/grpc/status" 64) 65 66var ( 67 config Config 68 startOnce allowUntilSuccess 69 mutexEnabled bool 70 // The functions below are stubbed to be overrideable for testing. 71 getProjectID = gcemd.ProjectID 72 getInstanceName = gcemd.InstanceName 73 getZone = gcemd.Zone 74 startCPUProfile = pprof.StartCPUProfile 75 stopCPUProfile = pprof.StopCPUProfile 76 writeHeapProfile = pprof.WriteHeapProfile 77 sleep = gax.Sleep 78 dialGRPC = gtransport.DialPool 79 onGCE = gcemd.OnGCE 80 serviceRegexp = regexp.MustCompile(`^[a-z]([-a-z0-9_.]{0,253}[a-z0-9])?$`) 81 82 // For testing only. 83 // When the profiling loop has exited without error and this channel is 84 // non-nil, "true" will be sent to this channel. 85 profilingDone chan bool 86) 87 88const ( 89 apiAddress = "cloudprofiler.googleapis.com:443" 90 xGoogAPIMetadata = "x-goog-api-client" 91 zoneNameLabel = "zone" 92 versionLabel = "version" 93 languageLabel = "language" 94 instanceLabel = "instance" 95 scope = "https://www.googleapis.com/auth/monitoring.write" 96 97 initialBackoff = time.Minute 98 // Ensure the agent will recover within 1 hour. 99 maxBackoff = time.Hour 100 backoffMultiplier = 1.3 // Backoff envelope increases by this factor on each retry. 101 retryInfoMetadata = "google.rpc.retryinfo-bin" 102) 103 104// Config is the profiler configuration. 105type Config struct { 106 // Service must be provided to start the profiler. It specifies the name of 107 // the service under which the profiled data will be recorded and exposed at 108 // the Profiler UI for the project. You can specify an arbitrary string, but 109 // see Deployment.target at 110 // https://github.com/googleapis/googleapis/blob/master/google/devtools/cloudprofiler/v2/profiler.proto 111 // for restrictions. If the parameter is not set, the agent will probe 112 // GAE_SERVICE environment variable which is present in Google App Engine 113 // environment. 114 // NOTE: The string should be the same across different replicas of 115 // your service so that the globally constant profiling rate is 116 // maintained. Do not put things like PID or unique pod ID in the name. 117 Service string 118 119 // ServiceVersion is an optional field specifying the version of the 120 // service. It can be an arbitrary string. Profiler profiles 121 // once per minute for each version of each service in each zone. 122 // ServiceVersion defaults to GAE_VERSION environment variable if that is 123 // set, or to empty string otherwise. 124 ServiceVersion string 125 126 // DebugLogging enables detailed debug logging from profiler. It 127 // defaults to false. 128 DebugLogging bool 129 130 // MutexProfiling enables mutex profiling. It defaults to false. 131 // Note that mutex profiling is not supported by Go versions older 132 // than Go 1.8. 133 MutexProfiling bool 134 135 // When true, collecting the CPU profiles is disabled. 136 NoCPUProfiling bool 137 138 // When true, collecting the allocation profiles is disabled. 139 NoAllocProfiling bool 140 141 // AllocForceGC forces garbage collection before the collection of each heap 142 // profile collected to produce the allocation profile. This increases the 143 // accuracy of allocation profiling. It defaults to false. 144 AllocForceGC bool 145 146 // When true, collecting the heap profiles is disabled. 147 NoHeapProfiling bool 148 149 // When true, collecting the goroutine profiles is disabled. 150 NoGoroutineProfiling bool 151 152 // When true, the agent sends all telemetries via OpenCensus exporter, which 153 // can be viewed in Cloud Trace and Cloud Monitoring. 154 // Default is false. 155 EnableOCTelemetry bool 156 157 // ProjectID is the Cloud Console project ID to use instead of the one set by 158 // GOOGLE_CLOUD_PROJECT environment variable or read from the VM metadata 159 // server. 160 // 161 // Set this if you are running the agent in your local environment 162 // or anywhere else outside of Google Cloud Platform. 163 ProjectID string 164 165 // APIAddr is the HTTP endpoint to use to connect to the profiler 166 // agent API. Defaults to the production environment, overridable 167 // for testing. 168 APIAddr string 169 170 // Instance is the name of Compute Engine instance the profiler agent runs 171 // on. This is normally determined from the Compute Engine metadata server 172 // and doesn't need to be initialized. It needs to be set in rare cases where 173 // the metadata server is present but is flaky or otherwise misbehave. 174 Instance string 175 176 // Zone is the zone of Compute Engine instance the profiler agent runs 177 // on. This is normally determined from the Compute Engine metadata server 178 // and doesn't need to be initialized. It needs to be set in rare cases where 179 // the metadata server is present but is flaky or otherwise misbehave. 180 Zone string 181 182 // numProfiles is the number of profiles which should be collected before 183 // the profile collection loop exits.When numProfiles is 0, profiles will 184 // be collected for the duration of the program. For testing only. 185 numProfiles int 186} 187 188// allowUntilSuccess is an object that will perform action till 189// it succeeds once. 190// This is a modified form of Go's sync.Once 191type allowUntilSuccess struct { 192 m sync.Mutex 193 done uint32 194} 195 196// do calls function f only if it hasnt returned nil previously. 197// Once f returns nil, do will not call function f any more. 198// This is a modified form of Go's sync.Once.Do 199func (o *allowUntilSuccess) do(f func() error) (err error) { 200 o.m.Lock() 201 defer o.m.Unlock() 202 if o.done == 0 { 203 if err = f(); err == nil { 204 o.done = 1 205 } 206 } else { 207 debugLog("profiler.Start() called again after it was previously called") 208 err = nil 209 } 210 return err 211} 212 213// Start starts a goroutine to collect and upload profiles. The 214// caller must provide the service string in the config. See 215// Config for details. Start should only be called once. Any 216// additional calls will be ignored. 217func Start(cfg Config, options ...option.ClientOption) error { 218 startError := startOnce.do(func() error { 219 return start(cfg, options...) 220 }) 221 return startError 222} 223 224func start(cfg Config, options ...option.ClientOption) error { 225 if err := initializeConfig(cfg); err != nil { 226 debugLog("failed to initialize config: %v", err) 227 return err 228 } 229 if config.MutexProfiling { 230 if mutexEnabled = enableMutexProfiling(); !mutexEnabled { 231 return fmt.Errorf("mutex profiling is not supported by %s, requires Go 1.8 or later", runtime.Version()) 232 } 233 } 234 235 ctx := context.Background() 236 237 opts := []option.ClientOption{ 238 option.WithEndpoint(config.APIAddr), 239 option.WithScopes(scope), 240 option.WithUserAgent(fmt.Sprintf("gcloud-go-profiler/%s", version.Repo)), 241 } 242 if !config.EnableOCTelemetry { 243 opts = append(opts, option.WithTelemetryDisabled()) 244 } 245 opts = append(opts, options...) 246 247 connPool, err := dialGRPC(ctx, opts...) 248 if err != nil { 249 debugLog("failed to dial GRPC: %v", err) 250 return err 251 } 252 253 a, err := initializeAgent(pb.NewProfilerServiceClient(connPool)) 254 if err != nil { 255 debugLog("failed to start the profiling agent: %v", err) 256 return err 257 } 258 go pollProfilerService(withXGoogHeader(ctx), a) 259 return nil 260} 261 262func debugLog(format string, e ...interface{}) { 263 if config.DebugLogging { 264 log.Printf(format, e...) 265 } 266} 267 268// agent polls the profiler server for instructions on behalf of a task, 269// and collects and uploads profiles as requested. 270type agent struct { 271 client pb.ProfilerServiceClient 272 deployment *pb.Deployment 273 profileLabels map[string]string 274 profileTypes []pb.ProfileType 275} 276 277// abortedBackoffDuration retrieves the retry duration from gRPC trailing 278// metadata, which is set by the profiler server. 279func abortedBackoffDuration(md grpcmd.MD) (time.Duration, error) { 280 elem := md[retryInfoMetadata] 281 if len(elem) <= 0 { 282 return 0, errors.New("no retry info") 283 } 284 285 var retryInfo edpb.RetryInfo 286 if err := proto.Unmarshal([]byte(elem[0]), &retryInfo); err != nil { 287 return 0, err 288 } else if time, err := ptypes.Duration(retryInfo.RetryDelay); err != nil { 289 return 0, err 290 } else { 291 if time < 0 { 292 return 0, errors.New("negative retry duration") 293 } 294 return time, nil 295 } 296} 297 298type retryer struct { 299 backoff gax.Backoff 300 md grpcmd.MD 301} 302 303func (r *retryer) Retry(err error) (time.Duration, bool) { 304 st, _ := status.FromError(err) 305 if st != nil && st.Code() == codes.Aborted { 306 dur, err := abortedBackoffDuration(r.md) 307 if err == nil { 308 return dur, true 309 } 310 debugLog("failed to get backoff duration: %v", err) 311 } 312 return r.backoff.Pause(), true 313} 314 315// createProfile talks to the profiler server to create profile. In 316// case of error, the goroutine will sleep and retry. Sleep duration may 317// be specified by the server. Otherwise it will be an exponentially 318// increasing value, bounded by maxBackoff. 319func (a *agent) createProfile(ctx context.Context) *pb.Profile { 320 req := pb.CreateProfileRequest{ 321 Parent: "projects/" + a.deployment.ProjectId, 322 Deployment: a.deployment, 323 ProfileType: a.profileTypes, 324 } 325 326 var p *pb.Profile 327 md := grpcmd.New(map[string]string{}) 328 329 gax.Invoke(ctx, func(ctx context.Context, settings gax.CallSettings) error { 330 debugLog("creating a new profile via profiler service") 331 var err error 332 p, err = a.client.CreateProfile(ctx, &req, grpc.Trailer(&md)) 333 if err != nil { 334 debugLog("failed to create profile, will retry: %v", err) 335 } 336 return err 337 }, gax.WithRetry(func() gax.Retryer { 338 return &retryer{ 339 backoff: gax.Backoff{ 340 Initial: initialBackoff, 341 Max: maxBackoff, 342 Multiplier: backoffMultiplier, 343 }, 344 md: md, 345 } 346 })) 347 348 debugLog("successfully created profile %v", p.GetProfileType()) 349 return p 350} 351 352func (a *agent) profileAndUpload(ctx context.Context, p *pb.Profile) { 353 var prof bytes.Buffer 354 pt := p.GetProfileType() 355 356 ptEnabled := false 357 for _, enabled := range a.profileTypes { 358 if enabled == pt { 359 ptEnabled = true 360 break 361 } 362 } 363 364 if !ptEnabled { 365 debugLog("skipping collection of disabled profile type: %v", pt) 366 return 367 } 368 369 switch pt { 370 case pb.ProfileType_CPU: 371 duration, err := ptypes.Duration(p.Duration) 372 if err != nil { 373 debugLog("failed to get profile duration for CPU profile: %v", err) 374 return 375 } 376 if err := startCPUProfile(&prof); err != nil { 377 debugLog("failed to start CPU profile: %v", err) 378 return 379 } 380 sleep(ctx, duration) 381 stopCPUProfile() 382 case pb.ProfileType_HEAP: 383 if err := heapProfile(&prof); err != nil { 384 debugLog("failed to write heap profile: %v", err) 385 return 386 } 387 case pb.ProfileType_HEAP_ALLOC: 388 duration, err := ptypes.Duration(p.Duration) 389 if err != nil { 390 debugLog("failed to get profile duration for allocation profile: %v", err) 391 return 392 } 393 if err := deltaAllocProfile(ctx, duration, config.AllocForceGC, &prof); err != nil { 394 debugLog("failed to collect allocation profile: %v", err) 395 return 396 } 397 case pb.ProfileType_THREADS: 398 if err := pprof.Lookup("goroutine").WriteTo(&prof, 0); err != nil { 399 debugLog("failed to collect goroutine profile: %v", err) 400 return 401 } 402 case pb.ProfileType_CONTENTION: 403 duration, err := ptypes.Duration(p.Duration) 404 if err != nil { 405 debugLog("failed to get profile duration: %v", err) 406 return 407 } 408 if err := deltaMutexProfile(ctx, duration, &prof); err != nil { 409 debugLog("failed to collect mutex profile: %v", err) 410 return 411 } 412 default: 413 debugLog("unexpected profile type: %v", pt) 414 return 415 } 416 417 p.ProfileBytes = prof.Bytes() 418 p.Labels = a.profileLabels 419 req := pb.UpdateProfileRequest{Profile: p} 420 421 // Upload profile, discard profile in case of error. 422 debugLog("start uploading profile") 423 if _, err := a.client.UpdateProfile(ctx, &req); err != nil { 424 debugLog("failed to upload profile: %v", err) 425 } 426} 427 428// deltaMutexProfile writes mutex profile changes over a time period specified 429// with 'duration' to 'prof'. 430func deltaMutexProfile(ctx context.Context, duration time.Duration, prof *bytes.Buffer) error { 431 if !mutexEnabled { 432 return errors.New("mutex profiling is not enabled") 433 } 434 p0, err := mutexProfile() 435 if err != nil { 436 return err 437 } 438 sleep(ctx, duration) 439 p, err := mutexProfile() 440 if err != nil { 441 return err 442 } 443 444 p0.Scale(-1) 445 p, err = profile.Merge([]*profile.Profile{p0, p}) 446 if err != nil { 447 return err 448 } 449 450 return p.Write(prof) 451} 452 453func mutexProfile() (*profile.Profile, error) { 454 p := pprof.Lookup("mutex") 455 if p == nil { 456 return nil, errors.New("mutex profiling is not supported") 457 } 458 var buf bytes.Buffer 459 if err := p.WriteTo(&buf, 0); err != nil { 460 return nil, err 461 } 462 return profile.Parse(&buf) 463} 464 465// withXGoogHeader sets the name and version of the application in 466// the `x-goog-api-client` header passed on each request. Intended for 467// use by Google-written clients. 468func withXGoogHeader(ctx context.Context, keyval ...string) context.Context { 469 kv := append([]string{"gl-go", version.Go(), "gccl", version.Repo}, keyval...) 470 kv = append(kv, "gax", gax.Version, "grpc", grpc.Version) 471 472 md, _ := grpcmd.FromOutgoingContext(ctx) 473 md = md.Copy() 474 md[xGoogAPIMetadata] = []string{gax.XGoogHeader(kv...)} 475 return grpcmd.NewOutgoingContext(ctx, md) 476} 477 478// initializeAgent initializes the profiling agent. It returns an error if 479// profile collection should not be started because collection is disabled 480// for all profile types. 481func initializeAgent(c pb.ProfilerServiceClient) (*agent, error) { 482 labels := map[string]string{languageLabel: "go"} 483 if config.Zone != "" { 484 labels[zoneNameLabel] = config.Zone 485 } 486 if config.ServiceVersion != "" { 487 labels[versionLabel] = config.ServiceVersion 488 } 489 d := &pb.Deployment{ 490 ProjectId: config.ProjectID, 491 Target: config.Service, 492 Labels: labels, 493 } 494 495 profileLabels := map[string]string{} 496 497 if config.Instance != "" { 498 profileLabels[instanceLabel] = config.Instance 499 } 500 501 var profileTypes []pb.ProfileType 502 if !config.NoCPUProfiling { 503 profileTypes = append(profileTypes, pb.ProfileType_CPU) 504 } 505 if !config.NoHeapProfiling { 506 profileTypes = append(profileTypes, pb.ProfileType_HEAP) 507 } 508 if !config.NoGoroutineProfiling { 509 profileTypes = append(profileTypes, pb.ProfileType_THREADS) 510 } 511 if !config.NoAllocProfiling { 512 profileTypes = append(profileTypes, pb.ProfileType_HEAP_ALLOC) 513 } 514 if mutexEnabled { 515 profileTypes = append(profileTypes, pb.ProfileType_CONTENTION) 516 } 517 518 if len(profileTypes) == 0 { 519 return nil, fmt.Errorf("collection is not enabled for any profile types") 520 } 521 522 return &agent{ 523 client: c, 524 deployment: d, 525 profileLabels: profileLabels, 526 profileTypes: profileTypes, 527 }, nil 528} 529 530func initializeConfig(cfg Config) error { 531 config = cfg 532 533 if config.Service == "" { 534 for _, ev := range []string{"GAE_SERVICE", "K_SERVICE"} { 535 if val := os.Getenv(ev); val != "" { 536 config.Service = val 537 break 538 } 539 } 540 } 541 if config.Service == "" { 542 return errors.New("service name must be configured") 543 } 544 if !serviceRegexp.MatchString(config.Service) { 545 return fmt.Errorf("service name %q does not match regular expression %v", config.Service, serviceRegexp) 546 } 547 548 if config.ServiceVersion == "" { 549 for _, ev := range []string{"GAE_VERSION", "K_REVISION"} { 550 if val := os.Getenv(ev); val != "" { 551 config.ServiceVersion = val 552 break 553 } 554 } 555 } 556 557 if projectID := os.Getenv("GOOGLE_CLOUD_PROJECT"); config.ProjectID == "" && projectID != "" { 558 // Cloud Shell and App Engine set this environment variable to the project 559 // ID, so use it if present. In case of App Engine the project ID is also 560 // available from the GCE metadata server, but by using the environment 561 // variable saves one request to the metadata server. The environment 562 // project ID is only used if no project ID is provided in the 563 // configuration. 564 config.ProjectID = projectID 565 } 566 if onGCE() { 567 var err error 568 if config.ProjectID == "" { 569 if config.ProjectID, err = getProjectID(); err != nil { 570 return fmt.Errorf("failed to get the project ID from Compute Engine metadata: %v", err) 571 } 572 } 573 574 if config.Zone == "" { 575 if config.Zone, err = getZone(); err != nil { 576 return fmt.Errorf("failed to get zone from Compute Engine metadata: %v", err) 577 } 578 } 579 580 if config.Instance == "" { 581 if config.Instance, err = getInstanceName(); err != nil { 582 if _, ok := err.(gcemd.NotDefinedError); !ok { 583 return fmt.Errorf("failed to get instance name from Compute Engine metadata: %v", err) 584 } 585 debugLog("failed to get instance name from Compute Engine metadata, will use empty name: %v", err) 586 } 587 } 588 } else { 589 if config.ProjectID == "" { 590 return fmt.Errorf("project ID must be specified in the configuration if running outside of GCP") 591 } 592 } 593 594 if config.APIAddr == "" { 595 config.APIAddr = apiAddress 596 } 597 return nil 598} 599 600// pollProfilerService starts an endless loop to poll the profiler 601// server for instructions, and collects and uploads profiles as 602// requested. 603func pollProfilerService(ctx context.Context, a *agent) { 604 debugLog("Cloud Profiler Go Agent version: %s", version.Repo) 605 debugLog("profiler has started") 606 for i := 0; config.numProfiles == 0 || i < config.numProfiles; i++ { 607 p := a.createProfile(ctx) 608 a.profileAndUpload(ctx, p) 609 } 610 611 if profilingDone != nil { 612 profilingDone <- true 613 } 614} 615