1// Copyright 2019 The Bazel Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package starlark
6
7// This file defines a simple execution-time profiler for Starlark.
8// It measures the wall time spent executing Starlark code, and emits a
9// gzipped protocol message in pprof format (github.com/google/pprof).
10//
11// When profiling is enabled, the interpreter calls the profiler to
12// indicate the start and end of each "span" or time interval. A leaf
13// function (whether Go or Starlark) has a single span. A function that
14// calls another function has spans for each interval in which it is the
15// top of the stack. (A LOAD instruction also ends a span.)
16//
17// At the start of a span, the interpreter records the current time in
18// the thread's topmost frame. At the end of the span, it obtains the
19// time again and subtracts the span start time. The difference is added
20// to an accumulator variable in the thread. If the accumulator exceeds
21// some fixed quantum (10ms, say), the profiler records the current call
22// stack and sends it to the profiler goroutine, along with the number
23// of quanta, which are subtracted. For example, if the accumulator
24// holds 3ms and then a completed span adds 25ms to it, its value is 28ms,
25// which exceeeds 10ms. The profiler records a stack with the value 20ms
26// (2 quanta), and the accumulator is left with 8ms.
27//
28// The profiler goroutine converts the stacks into the pprof format and
29// emits a gzip-compressed protocol message to the designated output
30// file. We use a hand-written streaming proto encoder to avoid
31// dependencies on pprof and proto, and to avoid the need to
32// materialize the profile data structure in memory.
33//
34// A limitation of this profiler is that it measures wall time, which
35// does not necessarily correspond to CPU time. A CPU profiler requires
36// that only running (not runnable) threads are sampled; this is
37// commonly achieved by having the kernel deliver a (PROF) signal to an
38// arbitrary running thread, through setitimer(2). The CPU profiler in the
39// Go runtime uses this mechanism, but it is not possible for a Go
40// application to register a SIGPROF handler, nor is it possible for a
41// Go handler for some other signal to read the stack pointer of
42// the interrupted thread.
43//
44// Two caveats:
45// (1) it is tempting to send the leaf Frame directly to the profiler
46// goroutine instead of making a copy of the stack, since a Frame is a
47// spaghetti stack--a linked list. However, as soon as execution
48// resumes, the stack's Frame.pc values may be mutated, so Frames are
49// not safe to share with the asynchronous profiler goroutine.
50// (2) it is tempting to use Callables as keys in a map when tabulating
51// the pprof protocols's Function entities. However, we cannot assume
52// that Callables are valid map keys, and furthermore we must not
53// pin function values in memory indefinitely as this may cause lambda
54// values to keep their free variables live much longer than necessary.
55
56// TODO(adonovan):
57// - make Start/Stop fully thread-safe.
58// - fix the pc hack.
59// - experiment with other values of quantum.
60
61import (
62	"bufio"
63	"bytes"
64	"compress/gzip"
65	"encoding/binary"
66	"fmt"
67	"io"
68	"log"
69	"reflect"
70	"sync/atomic"
71	"time"
72	"unsafe"
73
74	"go.starlark.net/syntax"
75)
76
77// StartProfile enables time profiling of all Starlark threads,
78// and writes a profile in pprof format to w.
79// It must be followed by a call to StopProfiler to stop
80// the profiler and finalize the profile.
81//
82// StartProfile returns an error if profiling was already enabled.
83//
84// StartProfile must not be called concurrently with Starlark execution.
85func StartProfile(w io.Writer) error {
86	if !atomic.CompareAndSwapUint32(&profiler.on, 0, 1) {
87		return fmt.Errorf("profiler already running")
88	}
89
90	// TODO(adonovan): make the API fully concurrency-safe.
91	// The main challenge is racy reads/writes of profiler.events,
92	// and of send/close races on the channel it refers to.
93	// It's easy to solve them with a mutex but harder to do
94	// it efficiently.
95
96	profiler.events = make(chan *profEvent, 1)
97	profiler.done = make(chan error)
98
99	go profile(w)
100
101	return nil
102}
103
104// StopProfiler stops the profiler started by a prior call to
105// StartProfile and finalizes the profile. It returns an error if the
106// profile could not be completed.
107//
108// StopProfiler must not be called concurrently with Starlark execution.
109func StopProfile() error {
110	// Terminate the profiler goroutine and get its result.
111	close(profiler.events)
112	err := <-profiler.done
113
114	profiler.done = nil
115	profiler.events = nil
116	atomic.StoreUint32(&profiler.on, 0)
117
118	return err
119}
120
121// globals
122var profiler struct {
123	on     uint32          // nonzero => profiler running
124	events chan *profEvent // profile events from interpreter threads
125	done   chan error      // indicates profiler goroutine is ready
126}
127
128func (thread *Thread) beginProfSpan() {
129	if profiler.events == nil {
130		return // profiling not enabled
131	}
132
133	thread.frameAt(0).spanStart = nanotime()
134}
135
136// TODO(adonovan): experiment with smaller values,
137// which trade space and time for greater precision.
138const quantum = 10 * time.Millisecond
139
140func (thread *Thread) endProfSpan() {
141	if profiler.events == nil {
142		return // profiling not enabled
143	}
144
145	// Add the span to the thread's accumulator.
146	thread.proftime += time.Duration(nanotime() - thread.frameAt(0).spanStart)
147	if thread.proftime < quantum {
148		return
149	}
150
151	// Only record complete quanta.
152	n := thread.proftime / quantum
153	thread.proftime -= n * quantum
154
155	// Copy the stack.
156	// (We can't save thread.frame because its pc will change.)
157	ev := &profEvent{
158		thread: thread,
159		time:   n * quantum,
160	}
161	ev.stack = ev.stackSpace[:0]
162	for i := range thread.stack {
163		fr := thread.frameAt(i)
164		ev.stack = append(ev.stack, profFrame{
165			pos: fr.Position(),
166			fn:  fr.Callable(),
167			pc:  fr.pc,
168		})
169	}
170
171	profiler.events <- ev
172}
173
174type profEvent struct {
175	thread     *Thread // currently unused
176	time       time.Duration
177	stack      []profFrame
178	stackSpace [8]profFrame // initial space for stack
179}
180
181type profFrame struct {
182	fn  Callable        // don't hold this live for too long (prevents GC of lambdas)
183	pc  uint32          // program counter (Starlark frames only)
184	pos syntax.Position // position of pc within this frame
185}
186
187// profile is the profiler goroutine.
188// It runs until StopProfiler is called.
189func profile(w io.Writer) {
190	// Field numbers from pprof protocol.
191	// See https://github.com/google/pprof/blob/master/proto/profile.proto
192	const (
193		Profile_sample_type    = 1  // repeated ValueType
194		Profile_sample         = 2  // repeated Sample
195		Profile_mapping        = 3  // repeated Mapping
196		Profile_location       = 4  // repeated Location
197		Profile_function       = 5  // repeated Function
198		Profile_string_table   = 6  // repeated string
199		Profile_time_nanos     = 9  // int64
200		Profile_duration_nanos = 10 // int64
201		Profile_period_type    = 11 // ValueType
202		Profile_period         = 12 // int64
203
204		ValueType_type = 1 // int64
205		ValueType_unit = 2 // int64
206
207		Sample_location_id = 1 // repeated uint64
208		Sample_value       = 2 // repeated int64
209		Sample_label       = 3 // repeated Label
210
211		Label_key      = 1 // int64
212		Label_str      = 2 // int64
213		Label_num      = 3 // int64
214		Label_num_unit = 4 // int64
215
216		Location_id         = 1 // uint64
217		Location_mapping_id = 2 // uint64
218		Location_address    = 3 // uint64
219		Location_line       = 4 // repeated Line
220
221		Line_function_id = 1 // uint64
222		Line_line        = 2 // int64
223
224		Function_id          = 1 // uint64
225		Function_name        = 2 // int64
226		Function_system_name = 3 // int64
227		Function_filename    = 4 // int64
228		Function_start_line  = 5 // int64
229	)
230
231	bufw := bufio.NewWriter(w) // write file in 4KB (not 240B flate-sized) chunks
232	gz := gzip.NewWriter(bufw)
233	enc := protoEncoder{w: gz}
234
235	// strings
236	stringIndex := make(map[string]int64)
237	str := func(s string) int64 {
238		i, ok := stringIndex[s]
239		if !ok {
240			i = int64(len(stringIndex))
241			enc.string(Profile_string_table, s)
242			stringIndex[s] = i
243		}
244		return i
245	}
246	str("") // entry 0
247
248	// functions
249	//
250	// function returns the ID of a Callable for use in Line.FunctionId.
251	// The ID is the same as the function's logical address,
252	// which is supplied by the caller to avoid the need to recompute it.
253	functionId := make(map[uintptr]uint64)
254	function := func(fn Callable, addr uintptr) uint64 {
255		id, ok := functionId[addr]
256		if !ok {
257			id = uint64(addr)
258
259			var pos syntax.Position
260			if fn, ok := fn.(callableWithPosition); ok {
261				pos = fn.Position()
262			}
263
264			name := fn.Name()
265			if name == "<toplevel>" {
266				name = pos.Filename()
267			}
268
269			nameIndex := str(name)
270
271			fun := new(bytes.Buffer)
272			funenc := protoEncoder{w: fun}
273			funenc.uint(Function_id, id)
274			funenc.int(Function_name, nameIndex)
275			funenc.int(Function_system_name, nameIndex)
276			funenc.int(Function_filename, str(pos.Filename()))
277			funenc.int(Function_start_line, int64(pos.Line))
278			enc.bytes(Profile_function, fun.Bytes())
279
280			functionId[addr] = id
281		}
282		return id
283	}
284
285	// locations
286	//
287	// location returns the ID of the location denoted by fr.
288	// For Starlark frames, this is the Frame pc.
289	locationId := make(map[uintptr]uint64)
290	location := func(fr profFrame) uint64 {
291		fnAddr := profFuncAddr(fr.fn)
292
293		// For Starlark functions, the frame position
294		// represents the current PC value.
295		// Mix it into the low bits of the address.
296		// This is super hacky and may result in collisions
297		// in large functions or if functions are numerous.
298		// TODO(adonovan): fix: try making this cleaner by treating
299		// each bytecode segment as a Profile.Mapping.
300		pcAddr := fnAddr
301		if _, ok := fr.fn.(*Function); ok {
302			pcAddr = (pcAddr << 16) ^ uintptr(fr.pc)
303		}
304
305		id, ok := locationId[pcAddr]
306		if !ok {
307			id = uint64(pcAddr)
308
309			line := new(bytes.Buffer)
310			lineenc := protoEncoder{w: line}
311			lineenc.uint(Line_function_id, function(fr.fn, fnAddr))
312			lineenc.int(Line_line, int64(fr.pos.Line))
313			loc := new(bytes.Buffer)
314			locenc := protoEncoder{w: loc}
315			locenc.uint(Location_id, id)
316			locenc.uint(Location_address, uint64(pcAddr))
317			locenc.bytes(Location_line, line.Bytes())
318			enc.bytes(Profile_location, loc.Bytes())
319
320			locationId[pcAddr] = id
321		}
322		return id
323	}
324
325	wallNanos := new(bytes.Buffer)
326	wnenc := protoEncoder{w: wallNanos}
327	wnenc.int(ValueType_type, str("wall"))
328	wnenc.int(ValueType_unit, str("nanoseconds"))
329
330	// informational fields of Profile
331	enc.bytes(Profile_sample_type, wallNanos.Bytes())
332	enc.int(Profile_period, quantum.Nanoseconds())     // magnitude of sampling period
333	enc.bytes(Profile_period_type, wallNanos.Bytes())  // dimension and unit of period
334	enc.int(Profile_time_nanos, time.Now().UnixNano()) // start (real) time of profile
335
336	startNano := nanotime()
337
338	// Read profile events from the channel
339	// until it is closed by StopProfiler.
340	for e := range profiler.events {
341		sample := new(bytes.Buffer)
342		sampleenc := protoEncoder{w: sample}
343		sampleenc.int(Sample_value, e.time.Nanoseconds()) // wall nanoseconds
344		for _, fr := range e.stack {
345			sampleenc.uint(Sample_location_id, location(fr))
346		}
347		enc.bytes(Profile_sample, sample.Bytes())
348	}
349
350	endNano := nanotime()
351	enc.int(Profile_duration_nanos, endNano-startNano)
352
353	err := gz.Close() // Close reports any prior write error
354	if flushErr := bufw.Flush(); err == nil {
355		err = flushErr
356	}
357	profiler.done <- err
358}
359
360// nanotime returns the time in nanoseconds since epoch.
361// It is implemented by runtime.nanotime using the linkname hack;
362// runtime.nanotime is defined for all OSs/ARCHS and uses the
363// monotonic system clock, which there is no portable way to access.
364// Should that function ever go away, these alternatives exist:
365//
366// 	// POSIX only. REALTIME not MONOTONIC. 17ns.
367// 	var tv syscall.Timeval
368// 	syscall.Gettimeofday(&tv) // can't fail
369// 	return tv.Nano()
370//
371// 	// Portable. REALTIME not MONOTONIC. 46ns.
372// 	return time.Now().Nanoseconds()
373//
374//      // POSIX only. Adds a dependency.
375//	import "golang.org/x/sys/unix"
376//	var ts unix.Timespec
377// 	unix.ClockGettime(CLOCK_MONOTONIC, &ts) // can't fail
378//	return unix.TimespecToNsec(ts)
379//
380//go:linkname nanotime runtime.nanotime
381func nanotime() int64
382
383// profFuncAddr returns the canonical "address"
384// of a Callable for use by the profiler.
385func profFuncAddr(fn Callable) uintptr {
386	switch fn := fn.(type) {
387	case *Builtin:
388		return reflect.ValueOf(fn.fn).Pointer()
389	case *Function:
390		return uintptr(unsafe.Pointer(fn.funcode))
391	}
392
393	// User-defined callable types are typically of
394	// of kind pointer-to-struct. Handle them specially.
395	if v := reflect.ValueOf(fn); v.Type().Kind() == reflect.Ptr {
396		return v.Pointer()
397	}
398
399	// Address zero is reserved by the protocol.
400	// Use 1 for callables we don't recognize.
401	log.Printf("Starlark profiler: no address for Callable %T", fn)
402	return 1
403}
404
405// We encode the protocol message by hand to avoid making
406// the interpreter depend on both github.com/google/pprof
407// and github.com/golang/protobuf.
408//
409// This also avoids the need to materialize a protocol message object
410// tree of unbounded size and serialize it all at the end.
411// The pprof format appears to have been designed to
412// permit streaming implementations such as this one.
413//
414// See https://developers.google.com/protocol-buffers/docs/encoding.
415type protoEncoder struct {
416	w   io.Writer // *bytes.Buffer or *gzip.Writer
417	tmp [binary.MaxVarintLen64]byte
418}
419
420func (e *protoEncoder) uvarint(x uint64) {
421	n := binary.PutUvarint(e.tmp[:], x)
422	e.w.Write(e.tmp[:n])
423}
424
425func (e *protoEncoder) tag(field, wire uint) {
426	e.uvarint(uint64(field<<3 | wire))
427}
428
429func (e *protoEncoder) string(field uint, s string) {
430	e.tag(field, 2) // length-delimited
431	e.uvarint(uint64(len(s)))
432	io.WriteString(e.w, s)
433}
434
435func (e *protoEncoder) bytes(field uint, b []byte) {
436	e.tag(field, 2) // length-delimited
437	e.uvarint(uint64(len(b)))
438	e.w.Write(b)
439}
440
441func (e *protoEncoder) uint(field uint, x uint64) {
442	e.tag(field, 0) // varint
443	e.uvarint(x)
444}
445
446func (e *protoEncoder) int(field uint, x int64) {
447	e.tag(field, 0) // varint
448	e.uvarint(uint64(x))
449}
450