1// Copyright 2017 Google LLC
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//      http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15// Package profiler is a client for the Cloud Profiler service.
16//
17// Usage example:
18//
19//   import "cloud.google.com/go/profiler"
20//   ...
21//   if err := profiler.Start(profiler.Config{Service: "my-service"}); err != nil {
22//       // TODO: Handle error.
23//   }
24//
25// Calling Start will start a goroutine to collect profiles and upload to
26// the profiler server, at the rhythm specified by the server.
27//
28// The caller must provide the service string in the config, and may provide
29// other information as well. See Config for details.
30//
31// Profiler has CPU, heap and goroutine profiling enabled by default. Mutex
32// profiling can be enabled in the config. Note that goroutine and mutex
33// profiles are shown as "threads" and "contention" profiles in the profiler
34// UI.
35package profiler
36
37import (
38	"bytes"
39	"context"
40	"errors"
41	"fmt"
42	"log"
43	"os"
44	"regexp"
45	"runtime"
46	"runtime/pprof"
47	"sync"
48	"time"
49
50	gcemd "cloud.google.com/go/compute/metadata"
51	"cloud.google.com/go/internal/version"
52	"github.com/golang/protobuf/proto"
53	"github.com/golang/protobuf/ptypes"
54	"github.com/google/pprof/profile"
55	gax "github.com/googleapis/gax-go/v2"
56	"google.golang.org/api/option"
57	gtransport "google.golang.org/api/transport/grpc"
58	pb "google.golang.org/genproto/googleapis/devtools/cloudprofiler/v2"
59	edpb "google.golang.org/genproto/googleapis/rpc/errdetails"
60	"google.golang.org/grpc"
61	"google.golang.org/grpc/codes"
62	grpcmd "google.golang.org/grpc/metadata"
63	"google.golang.org/grpc/status"
64)
65
66var (
67	config       Config
68	startOnce    allowUntilSuccess
69	mutexEnabled bool
70	// The functions below are stubbed to be overrideable for testing.
71	getProjectID     = gcemd.ProjectID
72	getInstanceName  = gcemd.InstanceName
73	getZone          = gcemd.Zone
74	startCPUProfile  = pprof.StartCPUProfile
75	stopCPUProfile   = pprof.StopCPUProfile
76	writeHeapProfile = pprof.WriteHeapProfile
77	sleep            = gax.Sleep
78	dialGRPC         = gtransport.DialPool
79	onGCE            = gcemd.OnGCE
80	serviceRegexp    = regexp.MustCompile(`^[a-z]([-a-z0-9_.]{0,253}[a-z0-9])?$`)
81
82	// For testing only.
83	// When the profiling loop has exited without error and this channel is
84	// non-nil, "true" will be sent to this channel.
85	profilingDone chan bool
86)
87
88const (
89	apiAddress       = "cloudprofiler.googleapis.com:443"
90	xGoogAPIMetadata = "x-goog-api-client"
91	zoneNameLabel    = "zone"
92	versionLabel     = "version"
93	languageLabel    = "language"
94	instanceLabel    = "instance"
95	scope            = "https://www.googleapis.com/auth/monitoring.write"
96
97	initialBackoff = time.Minute
98	// Ensure the agent will recover within 1 hour.
99	maxBackoff        = time.Hour
100	backoffMultiplier = 1.3 // Backoff envelope increases by this factor on each retry.
101	retryInfoMetadata = "google.rpc.retryinfo-bin"
102)
103
104// Config is the profiler configuration.
105type Config struct {
106	// Service must be provided to start the profiler. It specifies the name of
107	// the service under which the profiled data will be recorded and exposed at
108	// the Profiler UI for the project. You can specify an arbitrary string, but
109	// see Deployment.target at
110	// https://github.com/googleapis/googleapis/blob/master/google/devtools/cloudprofiler/v2/profiler.proto
111	// for restrictions. If the parameter is not set, the agent will probe
112	// GAE_SERVICE environment variable which is present in Google App Engine
113	// environment.
114	// NOTE: The string should be the same across different replicas of
115	// your service so that the globally constant profiling rate is
116	// maintained. Do not put things like PID or unique pod ID in the name.
117	Service string
118
119	// ServiceVersion is an optional field specifying the version of the
120	// service. It can be an arbitrary string. Profiler profiles
121	// once per minute for each version of each service in each zone.
122	// ServiceVersion defaults to GAE_VERSION environment variable if that is
123	// set, or to empty string otherwise.
124	ServiceVersion string
125
126	// DebugLogging enables detailed debug logging from profiler. It
127	// defaults to false.
128	DebugLogging bool
129
130	// MutexProfiling enables mutex profiling. It defaults to false.
131	// Note that mutex profiling is not supported by Go versions older
132	// than Go 1.8.
133	MutexProfiling bool
134
135	// When true, collecting the CPU profiles is disabled.
136	NoCPUProfiling bool
137
138	// When true, collecting the allocation profiles is disabled.
139	NoAllocProfiling bool
140
141	// AllocForceGC forces garbage collection before the collection of each heap
142	// profile collected to produce the allocation profile. This increases the
143	// accuracy of allocation profiling. It defaults to false.
144	AllocForceGC bool
145
146	// When true, collecting the heap profiles is disabled.
147	NoHeapProfiling bool
148
149	// When true, collecting the goroutine profiles is disabled.
150	NoGoroutineProfiling bool
151
152	// When true, the agent sends all telemetries via OpenCensus exporter, which
153	// can be viewed in Cloud Trace and Cloud Monitoring.
154	// Default is false.
155	EnableOCTelemetry bool
156
157	// ProjectID is the Cloud Console project ID to use instead of the one set by
158	// GOOGLE_CLOUD_PROJECT environment variable or read from the VM metadata
159	// server.
160	//
161	// Set this if you are running the agent in your local environment
162	// or anywhere else outside of Google Cloud Platform.
163	ProjectID string
164
165	// APIAddr is the HTTP endpoint to use to connect to the profiler
166	// agent API. Defaults to the production environment, overridable
167	// for testing.
168	APIAddr string
169
170	// Instance is the name of Compute Engine instance the profiler agent runs
171	// on. This is normally determined from the Compute Engine metadata server
172	// and doesn't need to be initialized. It needs to be set in rare cases where
173	// the metadata server is present but is flaky or otherwise misbehave.
174	Instance string
175
176	// Zone is the zone of Compute Engine instance the profiler agent runs
177	// on. This is normally determined from the Compute Engine metadata server
178	// and doesn't need to be initialized. It needs to be set in rare cases where
179	// the metadata server is present but is flaky or otherwise misbehave.
180	Zone string
181
182	// numProfiles is the number of profiles which should be collected before
183	// the profile collection loop exits.When numProfiles is 0, profiles will
184	// be collected for the duration of the program. For testing only.
185	numProfiles int
186}
187
188// allowUntilSuccess is an object that will perform action till
189// it succeeds once.
190// This is a modified form of Go's sync.Once
191type allowUntilSuccess struct {
192	m    sync.Mutex
193	done uint32
194}
195
196// do calls function f only if it hasnt returned nil previously.
197// Once f returns nil, do will not call function f any more.
198// This is a modified form of Go's sync.Once.Do
199func (o *allowUntilSuccess) do(f func() error) (err error) {
200	o.m.Lock()
201	defer o.m.Unlock()
202	if o.done == 0 {
203		if err = f(); err == nil {
204			o.done = 1
205		}
206	} else {
207		debugLog("profiler.Start() called again after it was previously called")
208		err = nil
209	}
210	return err
211}
212
213// Start starts a goroutine to collect and upload profiles. The
214// caller must provide the service string in the config. See
215// Config for details. Start should only be called once. Any
216// additional calls will be ignored.
217func Start(cfg Config, options ...option.ClientOption) error {
218	startError := startOnce.do(func() error {
219		return start(cfg, options...)
220	})
221	return startError
222}
223
224func start(cfg Config, options ...option.ClientOption) error {
225	if err := initializeConfig(cfg); err != nil {
226		debugLog("failed to initialize config: %v", err)
227		return err
228	}
229	if config.MutexProfiling {
230		if mutexEnabled = enableMutexProfiling(); !mutexEnabled {
231			return fmt.Errorf("mutex profiling is not supported by %s, requires Go 1.8 or later", runtime.Version())
232		}
233	}
234
235	ctx := context.Background()
236
237	opts := []option.ClientOption{
238		option.WithEndpoint(config.APIAddr),
239		option.WithScopes(scope),
240		option.WithUserAgent(fmt.Sprintf("gcloud-go-profiler/%s", version.Repo)),
241	}
242	if !config.EnableOCTelemetry {
243		opts = append(opts, option.WithTelemetryDisabled())
244	}
245	opts = append(opts, options...)
246
247	connPool, err := dialGRPC(ctx, opts...)
248	if err != nil {
249		debugLog("failed to dial GRPC: %v", err)
250		return err
251	}
252
253	a, err := initializeAgent(pb.NewProfilerServiceClient(connPool))
254	if err != nil {
255		debugLog("failed to start the profiling agent: %v", err)
256		return err
257	}
258	go pollProfilerService(withXGoogHeader(ctx), a)
259	return nil
260}
261
262func debugLog(format string, e ...interface{}) {
263	if config.DebugLogging {
264		log.Printf(format, e...)
265	}
266}
267
268// agent polls the profiler server for instructions on behalf of a task,
269// and collects and uploads profiles as requested.
270type agent struct {
271	client        pb.ProfilerServiceClient
272	deployment    *pb.Deployment
273	profileLabels map[string]string
274	profileTypes  []pb.ProfileType
275}
276
277// abortedBackoffDuration retrieves the retry duration from gRPC trailing
278// metadata, which is set by the profiler server.
279func abortedBackoffDuration(md grpcmd.MD) (time.Duration, error) {
280	elem := md[retryInfoMetadata]
281	if len(elem) <= 0 {
282		return 0, errors.New("no retry info")
283	}
284
285	var retryInfo edpb.RetryInfo
286	if err := proto.Unmarshal([]byte(elem[0]), &retryInfo); err != nil {
287		return 0, err
288	} else if time, err := ptypes.Duration(retryInfo.RetryDelay); err != nil {
289		return 0, err
290	} else {
291		if time < 0 {
292			return 0, errors.New("negative retry duration")
293		}
294		return time, nil
295	}
296}
297
298type retryer struct {
299	backoff gax.Backoff
300	md      grpcmd.MD
301}
302
303func (r *retryer) Retry(err error) (time.Duration, bool) {
304	st, _ := status.FromError(err)
305	if st != nil && st.Code() == codes.Aborted {
306		dur, err := abortedBackoffDuration(r.md)
307		if err == nil {
308			return dur, true
309		}
310		debugLog("failed to get backoff duration: %v", err)
311	}
312	return r.backoff.Pause(), true
313}
314
315// createProfile talks to the profiler server to create profile. In
316// case of error, the goroutine will sleep and retry. Sleep duration may
317// be specified by the server. Otherwise it will be an exponentially
318// increasing value, bounded by maxBackoff.
319func (a *agent) createProfile(ctx context.Context) *pb.Profile {
320	req := pb.CreateProfileRequest{
321		Parent:      "projects/" + a.deployment.ProjectId,
322		Deployment:  a.deployment,
323		ProfileType: a.profileTypes,
324	}
325
326	var p *pb.Profile
327	md := grpcmd.New(map[string]string{})
328
329	gax.Invoke(ctx, func(ctx context.Context, settings gax.CallSettings) error {
330		debugLog("creating a new profile via profiler service")
331		var err error
332		p, err = a.client.CreateProfile(ctx, &req, grpc.Trailer(&md))
333		if err != nil {
334			debugLog("failed to create profile, will retry: %v", err)
335		}
336		return err
337	}, gax.WithRetry(func() gax.Retryer {
338		return &retryer{
339			backoff: gax.Backoff{
340				Initial:    initialBackoff,
341				Max:        maxBackoff,
342				Multiplier: backoffMultiplier,
343			},
344			md: md,
345		}
346	}))
347
348	debugLog("successfully created profile %v", p.GetProfileType())
349	return p
350}
351
352func (a *agent) profileAndUpload(ctx context.Context, p *pb.Profile) {
353	var prof bytes.Buffer
354	pt := p.GetProfileType()
355
356	ptEnabled := false
357	for _, enabled := range a.profileTypes {
358		if enabled == pt {
359			ptEnabled = true
360			break
361		}
362	}
363
364	if !ptEnabled {
365		debugLog("skipping collection of disabled profile type: %v", pt)
366		return
367	}
368
369	switch pt {
370	case pb.ProfileType_CPU:
371		duration, err := ptypes.Duration(p.Duration)
372		if err != nil {
373			debugLog("failed to get profile duration for CPU profile: %v", err)
374			return
375		}
376		if err := startCPUProfile(&prof); err != nil {
377			debugLog("failed to start CPU profile: %v", err)
378			return
379		}
380		sleep(ctx, duration)
381		stopCPUProfile()
382	case pb.ProfileType_HEAP:
383		if err := heapProfile(&prof); err != nil {
384			debugLog("failed to write heap profile: %v", err)
385			return
386		}
387	case pb.ProfileType_HEAP_ALLOC:
388		duration, err := ptypes.Duration(p.Duration)
389		if err != nil {
390			debugLog("failed to get profile duration for allocation profile: %v", err)
391			return
392		}
393		if err := deltaAllocProfile(ctx, duration, config.AllocForceGC, &prof); err != nil {
394			debugLog("failed to collect allocation profile: %v", err)
395			return
396		}
397	case pb.ProfileType_THREADS:
398		if err := pprof.Lookup("goroutine").WriteTo(&prof, 0); err != nil {
399			debugLog("failed to collect goroutine profile: %v", err)
400			return
401		}
402	case pb.ProfileType_CONTENTION:
403		duration, err := ptypes.Duration(p.Duration)
404		if err != nil {
405			debugLog("failed to get profile duration: %v", err)
406			return
407		}
408		if err := deltaMutexProfile(ctx, duration, &prof); err != nil {
409			debugLog("failed to collect mutex profile: %v", err)
410			return
411		}
412	default:
413		debugLog("unexpected profile type: %v", pt)
414		return
415	}
416
417	p.ProfileBytes = prof.Bytes()
418	p.Labels = a.profileLabels
419	req := pb.UpdateProfileRequest{Profile: p}
420
421	// Upload profile, discard profile in case of error.
422	debugLog("start uploading profile")
423	if _, err := a.client.UpdateProfile(ctx, &req); err != nil {
424		debugLog("failed to upload profile: %v", err)
425	}
426}
427
428// deltaMutexProfile writes mutex profile changes over a time period specified
429// with 'duration' to 'prof'.
430func deltaMutexProfile(ctx context.Context, duration time.Duration, prof *bytes.Buffer) error {
431	if !mutexEnabled {
432		return errors.New("mutex profiling is not enabled")
433	}
434	p0, err := mutexProfile()
435	if err != nil {
436		return err
437	}
438	sleep(ctx, duration)
439	p, err := mutexProfile()
440	if err != nil {
441		return err
442	}
443
444	p0.Scale(-1)
445	p, err = profile.Merge([]*profile.Profile{p0, p})
446	if err != nil {
447		return err
448	}
449
450	return p.Write(prof)
451}
452
453func mutexProfile() (*profile.Profile, error) {
454	p := pprof.Lookup("mutex")
455	if p == nil {
456		return nil, errors.New("mutex profiling is not supported")
457	}
458	var buf bytes.Buffer
459	if err := p.WriteTo(&buf, 0); err != nil {
460		return nil, err
461	}
462	return profile.Parse(&buf)
463}
464
465// withXGoogHeader sets the name and version of the application in
466// the `x-goog-api-client` header passed on each request. Intended for
467// use by Google-written clients.
468func withXGoogHeader(ctx context.Context, keyval ...string) context.Context {
469	kv := append([]string{"gl-go", version.Go(), "gccl", version.Repo}, keyval...)
470	kv = append(kv, "gax", gax.Version, "grpc", grpc.Version)
471
472	md, _ := grpcmd.FromOutgoingContext(ctx)
473	md = md.Copy()
474	md[xGoogAPIMetadata] = []string{gax.XGoogHeader(kv...)}
475	return grpcmd.NewOutgoingContext(ctx, md)
476}
477
478// initializeAgent initializes the profiling agent. It returns an error if
479// profile collection should not be started because collection is disabled
480// for all profile types.
481func initializeAgent(c pb.ProfilerServiceClient) (*agent, error) {
482	labels := map[string]string{languageLabel: "go"}
483	if config.Zone != "" {
484		labels[zoneNameLabel] = config.Zone
485	}
486	if config.ServiceVersion != "" {
487		labels[versionLabel] = config.ServiceVersion
488	}
489	d := &pb.Deployment{
490		ProjectId: config.ProjectID,
491		Target:    config.Service,
492		Labels:    labels,
493	}
494
495	profileLabels := map[string]string{}
496
497	if config.Instance != "" {
498		profileLabels[instanceLabel] = config.Instance
499	}
500
501	var profileTypes []pb.ProfileType
502	if !config.NoCPUProfiling {
503		profileTypes = append(profileTypes, pb.ProfileType_CPU)
504	}
505	if !config.NoHeapProfiling {
506		profileTypes = append(profileTypes, pb.ProfileType_HEAP)
507	}
508	if !config.NoGoroutineProfiling {
509		profileTypes = append(profileTypes, pb.ProfileType_THREADS)
510	}
511	if !config.NoAllocProfiling {
512		profileTypes = append(profileTypes, pb.ProfileType_HEAP_ALLOC)
513	}
514	if mutexEnabled {
515		profileTypes = append(profileTypes, pb.ProfileType_CONTENTION)
516	}
517
518	if len(profileTypes) == 0 {
519		return nil, fmt.Errorf("collection is not enabled for any profile types")
520	}
521
522	return &agent{
523		client:        c,
524		deployment:    d,
525		profileLabels: profileLabels,
526		profileTypes:  profileTypes,
527	}, nil
528}
529
530func initializeConfig(cfg Config) error {
531	config = cfg
532
533	if config.Service == "" {
534		for _, ev := range []string{"GAE_SERVICE", "K_SERVICE"} {
535			if val := os.Getenv(ev); val != "" {
536				config.Service = val
537				break
538			}
539		}
540	}
541	if config.Service == "" {
542		return errors.New("service name must be configured")
543	}
544	if !serviceRegexp.MatchString(config.Service) {
545		return fmt.Errorf("service name %q does not match regular expression %v", config.Service, serviceRegexp)
546	}
547
548	if config.ServiceVersion == "" {
549		for _, ev := range []string{"GAE_VERSION", "K_REVISION"} {
550			if val := os.Getenv(ev); val != "" {
551				config.ServiceVersion = val
552				break
553			}
554		}
555	}
556
557	if projectID := os.Getenv("GOOGLE_CLOUD_PROJECT"); config.ProjectID == "" && projectID != "" {
558		// Cloud Shell and App Engine set this environment variable to the project
559		// ID, so use it if present. In case of App Engine the project ID is also
560		// available from the GCE metadata server, but by using the environment
561		// variable saves one request to the metadata server. The environment
562		// project ID is only used if no project ID is provided in the
563		// configuration.
564		config.ProjectID = projectID
565	}
566	if onGCE() {
567		var err error
568		if config.ProjectID == "" {
569			if config.ProjectID, err = getProjectID(); err != nil {
570				return fmt.Errorf("failed to get the project ID from Compute Engine metadata: %v", err)
571			}
572		}
573
574		if config.Zone == "" {
575			if config.Zone, err = getZone(); err != nil {
576				return fmt.Errorf("failed to get zone from Compute Engine metadata: %v", err)
577			}
578		}
579
580		if config.Instance == "" {
581			if config.Instance, err = getInstanceName(); err != nil {
582				if _, ok := err.(gcemd.NotDefinedError); !ok {
583					return fmt.Errorf("failed to get instance name from Compute Engine metadata: %v", err)
584				}
585				debugLog("failed to get instance name from Compute Engine metadata, will use empty name: %v", err)
586			}
587		}
588	} else {
589		if config.ProjectID == "" {
590			return fmt.Errorf("project ID must be specified in the configuration if running outside of GCP")
591		}
592	}
593
594	if config.APIAddr == "" {
595		config.APIAddr = apiAddress
596	}
597	return nil
598}
599
600// pollProfilerService starts an endless loop to poll the profiler
601// server for instructions, and collects and uploads profiles as
602// requested.
603func pollProfilerService(ctx context.Context, a *agent) {
604	debugLog("Cloud Profiler Go Agent version: %s", version.Repo)
605	debugLog("profiler has started")
606	for i := 0; config.numProfiles == 0 || i < config.numProfiles; i++ {
607		p := a.createProfile(ctx)
608		a.profileAndUpload(ctx, p)
609	}
610
611	if profilingDone != nil {
612		profilingDone <- true
613	}
614}
615