// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Go execution tracer.
// The tracer captures a wide range of execution events like goroutine
// creation/blocking/unblocking, syscall enter/exit/block, GC-related events,
// changes of heap size, processor start/stop, etc., and writes them to a buffer
// in a compact form. A precise nanosecond-precision timestamp and a stack
// trace are captured for most events.
// See https://golang.org/s/go15trace for more info.

package runtime

import (
	"runtime/internal/sys"
	"unsafe"
)

// Event types in the trace, args are given in square brackets.
const (
	traceEvNone              = 0  // unused
	traceEvBatch             = 1  // start of per-P batch of events [pid, timestamp]
	traceEvFrequency         = 2  // contains tracer timer frequency [frequency (ticks per second)]
	traceEvStack             = 3  // stack [stack id, number of PCs, array of {PC, func string ID, file string ID, line}]
	traceEvGomaxprocs        = 4  // current value of GOMAXPROCS [timestamp, GOMAXPROCS, stack id]
	traceEvProcStart         = 5  // start of P [timestamp, thread id]
	traceEvProcStop          = 6  // stop of P [timestamp]
	traceEvGCStart           = 7  // GC start [timestamp, seq, stack id]
	traceEvGCDone            = 8  // GC done [timestamp]
	traceEvGCSTWStart        = 9  // GC STW start [timestamp, kind]
	traceEvGCSTWDone         = 10 // GC STW done [timestamp]
	traceEvGCSweepStart      = 11 // GC sweep start [timestamp, stack id]
	traceEvGCSweepDone       = 12 // GC sweep done [timestamp, swept, reclaimed]
	traceEvGoCreate          = 13 // goroutine creation [timestamp, new goroutine id, new stack id, stack id]
	traceEvGoStart           = 14 // goroutine starts running [timestamp, goroutine id, seq]
	traceEvGoEnd             = 15 // goroutine ends [timestamp]
	traceEvGoStop            = 16 // goroutine stops (like in select{}) [timestamp, stack]
	traceEvGoSched           = 17 // goroutine calls Gosched [timestamp, stack]
	traceEvGoPreempt         = 18 // goroutine is preempted [timestamp, stack]
	traceEvGoSleep           = 19 // goroutine calls Sleep [timestamp, stack]
	traceEvGoBlock           = 20 // goroutine blocks [timestamp, stack]
	traceEvGoUnblock         = 21 // goroutine is unblocked [timestamp, goroutine id, seq, stack]
	traceEvGoBlockSend       = 22 // goroutine blocks on chan send [timestamp, stack]
	traceEvGoBlockRecv       = 23 // goroutine blocks on chan recv [timestamp, stack]
	traceEvGoBlockSelect     = 24 // goroutine blocks on select [timestamp, stack]
	traceEvGoBlockSync       = 25 // goroutine blocks on Mutex/RWMutex [timestamp, stack]
	traceEvGoBlockCond       = 26 // goroutine blocks on Cond [timestamp, stack]
	traceEvGoBlockNet        = 27 // goroutine blocks on network [timestamp, stack]
	traceEvGoSysCall         = 28 // syscall enter [timestamp, stack]
	traceEvGoSysExit         = 29 // syscall exit [timestamp, goroutine id, seq, real timestamp]
	traceEvGoSysBlock        = 30 // syscall blocks [timestamp]
	traceEvGoWaiting         = 31 // denotes that goroutine is blocked when tracing starts [timestamp, goroutine id]
	traceEvGoInSyscall       = 32 // denotes that goroutine is in syscall when tracing starts [timestamp, goroutine id]
	traceEvHeapAlloc         = 33 // memstats.heap_live change [timestamp, heap_alloc]
	traceEvNextGC            = 34 // memstats.next_gc change [timestamp, next_gc]
	traceEvTimerGoroutine    = 35 // denotes timer goroutine [timer goroutine id]
	traceEvFutileWakeup      = 36 // denotes that the previous wakeup of this goroutine was futile [timestamp]
	traceEvString            = 37 // string dictionary entry [ID, length, string]
	traceEvGoStartLocal      = 38 // goroutine starts running on the same P as the last event [timestamp, goroutine id]
	traceEvGoUnblockLocal    = 39 // goroutine is unblocked on the same P as the last event [timestamp, goroutine id, stack]
	traceEvGoSysExitLocal    = 40 // syscall exit on the same P as the last event [timestamp, goroutine id, real timestamp]
	traceEvGoStartLabel      = 41 // goroutine starts running with label [timestamp, goroutine id, seq, label string id]
	traceEvGoBlockGC         = 42 // goroutine blocks on GC assist [timestamp, stack]
	traceEvGCMarkAssistStart = 43 // GC mark assist start [timestamp, stack]
	traceEvGCMarkAssistDone  = 44 // GC mark assist done [timestamp]
	traceEvCount             = 45
)

const (
	// Timestamps in trace are cputicks/traceTickDiv.
	// This makes absolute values of timestamp diffs smaller,
	// and so they can be encoded in fewer bytes.
	// 64 on x86 is somewhat arbitrary (one tick is ~20ns on a 3GHz machine).
	// The suggested increment frequency for PowerPC's time base register is
	// 512 MHz according to Power ISA v2.07 section 6.2, so we use 16 on ppc64
	// and ppc64le.
	// Tracing won't work reliably for architectures where cputicks is emulated
	// by nanotime, so the value doesn't matter for those architectures.
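	// The sys.Goarch* constants below are 1 for the target architecture and 0
	// otherwise, so the expression evaluates to 64 on 386/amd64/amd64p32 and
	// to 16 everywhere else.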
	traceTickDiv = 16 + 48*(sys.Goarch386|sys.GoarchAmd64|sys.GoarchAmd64p32)
	// Maximum number of PCs in a single stack trace.
	// Since events contain only stack id rather than whole stack trace,
	// we can allow quite large values here.
	traceStackSize = 128
	// Identifier of a fake P that is used when we trace without a real P.
	traceGlobProc = -1
	// Maximum number of bytes to encode uint64 in base-128.
	traceBytesPerNumber = 10
	// Shift of the number of arguments in the first event byte.
	traceArgCountShift = 6
	// Flag passed to traceGoPark to denote that the previous wakeup of this
	// goroutine was futile. For example, a goroutine was unblocked on a mutex,
	// but another goroutine got ahead and acquired the mutex before the first
	// goroutine is scheduled, so the first goroutine has to block again.
	// Such wakeups happen on buffered channels and sync.Mutex,
	// but are generally not interesting to the end user.
	traceFutileWakeup byte = 128
)

// trace is global tracing context.
var trace struct {
	lock          mutex       // protects the following members
	lockOwner     *g          // to avoid deadlocks during recursive lock acquisition
	enabled       bool        // when set runtime traces events
	shutdown      bool        // set when we are waiting for trace reader to finish after setting enabled to false
	headerWritten bool        // whether ReadTrace has emitted trace header
	footerWritten bool        // whether ReadTrace has emitted trace footer
	shutdownSema  uint32      // used to wait for ReadTrace completion
	seqStart      uint64      // sequence number when tracing was started
	ticksStart    int64       // cputicks when tracing was started
	ticksEnd      int64       // cputicks when tracing was stopped
	timeStart     int64       // nanotime when tracing was started
	timeEnd       int64       // nanotime when tracing was stopped
	seqGC         uint64      // GC start/done sequencer
	reading       traceBufPtr // buffer currently handed off to user
	empty         traceBufPtr // stack of empty buffers
	fullHead      traceBufPtr // queue of full buffers
	fullTail      traceBufPtr
	reader        guintptr        // goroutine that called ReadTrace, or nil
	stackTab      traceStackTable // maps stack traces to unique ids

	// Dictionary for traceEvString.
	//
	// Currently this is used only at trace setup and for
	// func/file:line info after tracing session, so we assume
	// single-threaded access.
	strings   map[string]uint64
	stringSeq uint64

	// markWorkerLabels maps gcMarkWorkerMode to string ID.
	markWorkerLabels [len(gcMarkWorkerModeStrings)]uint64

	bufLock mutex       // protects buf
	buf     traceBufPtr // global trace buffer, used when running without a p
}

// traceBufHeader is per-P tracing buffer.
type traceBufHeader struct {
	link      traceBufPtr              // in trace.empty/full
	lastTicks uint64                   // when we wrote the last event
	pos       int                      // next write offset in arr
	stk       [traceStackSize]location // scratch buffer for traceback
}

// traceBuf is per-P tracing buffer.
//
//go:notinheap
type traceBuf struct {
	traceBufHeader
	arr [64<<10 - unsafe.Sizeof(traceBufHeader{})]byte // underlying buffer for trace event data
}

// traceBufPtr is a *traceBuf that is not traced by the garbage
// collector and doesn't have write barriers. traceBufs are not
// allocated from the GC'd heap, so this is safe, and are often
// manipulated in contexts where write barriers are not allowed, so
// this is necessary.
//
// TODO: Since traceBuf is now go:notinheap, this isn't necessary.
type traceBufPtr uintptr

func (tp traceBufPtr) ptr() *traceBuf   { return (*traceBuf)(unsafe.Pointer(tp)) }
func (tp *traceBufPtr) set(b *traceBuf) { *tp = traceBufPtr(unsafe.Pointer(b)) }
func traceBufPtrOf(b *traceBuf) traceBufPtr {
	return traceBufPtr(unsafe.Pointer(b))
}

// StartTrace enables tracing for the current process.
// While tracing, the data will be buffered and available via ReadTrace.
// StartTrace returns an error if tracing is already enabled.
// Most clients should use the runtime/trace package or the testing package's
// -test.trace flag instead of calling StartTrace directly.
func StartTrace() error {
	// Stop the world, so that we can take a consistent snapshot
	// of all goroutines at the beginning of the trace.
	stopTheWorld("start tracing")

	// We are in stop-the-world, but syscalls can finish and write to trace concurrently.
	// Exitsyscall could check trace.enabled long before and then suddenly wake up
	// and decide to write to trace at a random point in time.
	// However, such a syscall will use the global trace.buf buffer, because we've
	// acquired all p's by doing stop-the-world. So this protects us from such races.
	lock(&trace.bufLock)

	if trace.enabled || trace.shutdown {
		unlock(&trace.bufLock)
		startTheWorld()
		return errorString("tracing is already enabled")
	}

	// Can't set trace.enabled yet. While the world is stopped, exitsyscall could
	// already emit a delayed event (see exitTicks in exitsyscall) if we set trace.enabled here.
	// That would lead to an inconsistent trace:
	// - either GoSysExit appears before EvGoInSyscall,
	// - or GoSysExit appears for a goroutine for which we don't emit EvGoInSyscall below.
	// To instruct traceEvent that it must not ignore events below, we set startingtrace.
	// trace.enabled is set afterwards once we have emitted all preliminary events.
	_g_ := getg()
	_g_.m.startingtrace = true

	// Obtain current stack ID to use in all traceEvGoCreate events below.
	mp := acquirem()
	stkBuf := make([]location, traceStackSize)
	stackID := traceStackID(mp, stkBuf, 2)
	releasem(mp)

	for _, gp := range allgs {
		status := readgstatus(gp)
		if status != _Gdead {
			gp.traceseq = 0
			gp.tracelastp = getg().m.p
			// +PCQuantum because traceFrameForPC expects return PCs and subtracts PCQuantum.
			id := trace.stackTab.put([]location{location{pc: gp.startpc + sys.PCQuantum}})
			traceEvent(traceEvGoCreate, -1, uint64(gp.goid), uint64(id), stackID)
		}
		if status == _Gwaiting {
			// traceEvGoWaiting is implied to have seq=1.
			gp.traceseq++
			traceEvent(traceEvGoWaiting, -1, uint64(gp.goid))
		}
		if status == _Gsyscall {
			gp.traceseq++
			traceEvent(traceEvGoInSyscall, -1, uint64(gp.goid))
		} else {
			gp.sysblocktraced = false
		}
	}
	traceProcStart()
	traceGoStart()
	// Note: ticksStart needs to be set after we emit traceEvGoInSyscall events.
	// If we do it the other way around, it is possible that exitsyscall will
	// query sysexitticks after ticksStart but before traceEvGoInSyscall timestamp.
	// It will lead to a false conclusion that cputicks is broken.
	trace.ticksStart = cputicks()
	trace.timeStart = nanotime()
	trace.headerWritten = false
	trace.footerWritten = false

	// string to id mapping
	//  0 : reserved for an empty string
	//  remaining: other strings registered by traceString
	trace.stringSeq = 0
	trace.strings = make(map[string]uint64)

	trace.seqGC = 0
	_g_.m.startingtrace = false
	trace.enabled = true

	// Register runtime goroutine labels.
	_, pid, bufp := traceAcquireBuffer()
	for i, label := range gcMarkWorkerModeStrings[:] {
		trace.markWorkerLabels[i], bufp = traceString(bufp, pid, label)
	}
	traceReleaseBuffer(pid)

	unlock(&trace.bufLock)

	startTheWorld()
	return nil
}

// StopTrace stops tracing, if it was previously enabled.
// StopTrace only returns after all the reads for the trace have completed.
func StopTrace() {
	// Stop the world so that we can collect the trace buffers from all p's below,
	// and also to avoid races with traceEvent.
	stopTheWorld("stop tracing")

	// See the comment in StartTrace.
	lock(&trace.bufLock)

	if !trace.enabled {
		unlock(&trace.bufLock)
		startTheWorld()
		return
	}

	traceGoSched()

	// Loop over all allocated Ps because dead Ps may still have
	// trace buffers.
	for _, p := range allp[:cap(allp)] {
		buf := p.tracebuf
		if buf != 0 {
			traceFullQueue(buf)
			p.tracebuf = 0
		}
	}
	if trace.buf != 0 {
		buf := trace.buf
		trace.buf = 0
		if buf.ptr().pos != 0 {
			traceFullQueue(buf)
		}
	}

	for {
		trace.ticksEnd = cputicks()
		trace.timeEnd = nanotime()
		// Windows time can tick only every 15ms; wait for at least one tick.
		if trace.timeEnd != trace.timeStart {
			break
		}
		osyield()
	}

	trace.enabled = false
	trace.shutdown = true
	unlock(&trace.bufLock)

	startTheWorld()

	// The world is started but we've set trace.shutdown, so new tracing can't start.
	// Wait for the trace reader to flush pending buffers and stop.
	semacquire(&trace.shutdownSema)
	if raceenabled {
		raceacquire(unsafe.Pointer(&trace.shutdownSema))
	}

	// The lock protects us from races with StartTrace/StopTrace because they do stop-the-world.
	lock(&trace.lock)
	for _, p := range allp[:cap(allp)] {
		if p.tracebuf != 0 {
			throw("trace: non-empty trace buffer in proc")
		}
	}
	if trace.buf != 0 {
		throw("trace: non-empty global trace buffer")
	}
	if trace.fullHead != 0 || trace.fullTail != 0 {
		throw("trace: non-empty full trace buffer")
	}
	if trace.reading != 0 || trace.reader != 0 {
		throw("trace: reading after shutdown")
	}
	for trace.empty != 0 {
		buf := trace.empty
		trace.empty = buf.ptr().link
		sysFree(unsafe.Pointer(buf), unsafe.Sizeof(*buf.ptr()), &memstats.other_sys)
	}
	trace.strings = nil
	trace.shutdown = false
	unlock(&trace.lock)
}

// ReadTrace returns the next chunk of binary tracing data, blocking until data
// is available. If tracing is turned off and all the data accumulated while it
// was on has been returned, ReadTrace returns nil. The caller must copy the
// returned data before calling ReadTrace again.
// ReadTrace must be called from one goroutine at a time.
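//
// Successive calls yield the trace stream in order: first the header
// ("go 1.10 trace\x00\x00\x00"), then the full per-P batches as they become
// available, then a footer carrying the timer frequency and timer goroutines
// followed by the dumped stack table, and finally nil once tracing has been
// stopped and all buffers drained.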
func ReadTrace() []byte {
	// This function may need to lock trace.lock recursively
	// (goparkunlock -> traceGoPark -> traceEvent -> traceFlush).
	// To allow this we use trace.lockOwner.
	// Also this function must not allocate while holding trace.lock:
	// allocation can call heap allocate, which will try to emit a trace
	// event while holding heap lock.
	lock(&trace.lock)
	trace.lockOwner = getg()

	if trace.reader != 0 {
		// More than one goroutine reads trace. This is bad.
		// But we would rather not crash the program because of tracing,
		// because tracing can be enabled at runtime on prod servers.
		trace.lockOwner = nil
		unlock(&trace.lock)
		println("runtime: ReadTrace called from multiple goroutines simultaneously")
		return nil
	}
	// Recycle the old buffer.
	if buf := trace.reading; buf != 0 {
		buf.ptr().link = trace.empty
		trace.empty = buf
		trace.reading = 0
	}
	// Write trace header.
	if !trace.headerWritten {
		trace.headerWritten = true
		trace.lockOwner = nil
		unlock(&trace.lock)
		return []byte("go 1.10 trace\x00\x00\x00")
	}
	// Wait for new data.
	if trace.fullHead == 0 && !trace.shutdown {
		trace.reader.set(getg())
		goparkunlock(&trace.lock, "trace reader (blocked)", traceEvGoBlock, 2)
		lock(&trace.lock)
	}
	// Write a buffer.
	if trace.fullHead != 0 {
		buf := traceFullDequeue()
		trace.reading = buf
		trace.lockOwner = nil
		unlock(&trace.lock)
		return buf.ptr().arr[:buf.ptr().pos]
	}
	// Write footer with timer frequency.
	if !trace.footerWritten {
		trace.footerWritten = true
		// Use float64 because (trace.ticksEnd - trace.ticksStart) * 1e9 can overflow int64.
		freq := float64(trace.ticksEnd-trace.ticksStart) * 1e9 / float64(trace.timeEnd-trace.timeStart) / traceTickDiv
		trace.lockOwner = nil
		unlock(&trace.lock)
		var data []byte
		data = append(data, traceEvFrequency|0<<traceArgCountShift)
		data = traceAppend(data, uint64(freq))
		for i := range timers {
			tb := &timers[i]
			if tb.gp != nil {
				data = append(data, traceEvTimerGoroutine|0<<traceArgCountShift)
				data = traceAppend(data, uint64(tb.gp.goid))
			}
		}
		// This will emit a bunch of full buffers; we will pick them up
		// on the next iteration.
		trace.stackTab.dump()
		return data
	}
	// Done.
	if trace.shutdown {
		trace.lockOwner = nil
		unlock(&trace.lock)
		if raceenabled {
			// Model synchronization on trace.shutdownSema, which race
			// detector does not see. This is required to avoid false
			// race reports on writer passed to trace.Start.
			racerelease(unsafe.Pointer(&trace.shutdownSema))
		}
		// trace.enabled is already reset, so we can call traceable functions.
		semrelease(&trace.shutdownSema)
		return nil
	}
	// Also bad, but see the comment above.
	trace.lockOwner = nil
	unlock(&trace.lock)
	println("runtime: spurious wakeup of trace reader")
	return nil
}

// traceReader returns the trace reader that should be woken up, if any.
func traceReader() *g {
	if trace.reader == 0 || (trace.fullHead == 0 && !trace.shutdown) {
		return nil
	}
	lock(&trace.lock)
	if trace.reader == 0 || (trace.fullHead == 0 && !trace.shutdown) {
		unlock(&trace.lock)
		return nil
	}
	gp := trace.reader.ptr()
	trace.reader.set(nil)
	unlock(&trace.lock)
	return gp
}

// traceProcFree frees trace buffer associated with pp.
func traceProcFree(pp *p) {
	buf := pp.tracebuf
	pp.tracebuf = 0
	if buf == 0 {
		return
	}
	lock(&trace.lock)
	traceFullQueue(buf)
	unlock(&trace.lock)
}

// traceFullQueue queues buf into queue of full buffers.
func traceFullQueue(buf traceBufPtr) {
	buf.ptr().link = 0
	if trace.fullHead == 0 {
		trace.fullHead = buf
	} else {
		trace.fullTail.ptr().link = buf
	}
	trace.fullTail = buf
}

// traceFullDequeue dequeues from queue of full buffers.
func traceFullDequeue() traceBufPtr {
	buf := trace.fullHead
	if buf == 0 {
		return 0
	}
	trace.fullHead = buf.ptr().link
	if trace.fullHead == 0 {
		trace.fullTail = 0
	}
	buf.ptr().link = 0
	return buf
}

// traceEvent writes a single event to trace buffer, flushing the buffer if necessary.
// ev is event type.
// If skip > 0, write current stack id as the last argument (skipping skip top frames).
// If skip = 0, this event type should contain a stack, but we don't want
// to collect and remember it for this particular call.
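//
// The encoded event layout, as written by the code below, is: one byte holding
// the event type in the low 6 bits and the argument count (capped at 3) in the
// top 2 bits, an optional length byte when the capped count is 3, then the
// varint-encoded timestamp diff, the varint arguments, and finally the
// optional stack id.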
func traceEvent(ev byte, skip int, args ...uint64) {
	mp, pid, bufp := traceAcquireBuffer()
	// Double-check trace.enabled now that we've done m.locks++ and acquired bufLock.
	// This protects from races between traceEvent and StartTrace/StopTrace.

	// The caller checked that trace.enabled == true, but trace.enabled might have been
	// turned off between the check and now. Check again. traceAcquireBuffer did mp.locks++,
	// StopTrace does stopTheWorld, and stopTheWorld waits for mp.locks to go back to zero,
	// so if we see trace.enabled == true now, we know it's true for the rest of the function.
	// Exitsyscall can run even during stopTheWorld. The race with StartTrace/StopTrace
	// during tracing in exitsyscall is resolved by locking trace.bufLock in traceAcquireBuffer.
	if !trace.enabled && !mp.startingtrace {
		traceReleaseBuffer(pid)
		return
	}
	buf := (*bufp).ptr()
	const maxSize = 2 + 5*traceBytesPerNumber // event type, length, sequence, timestamp, stack id and two additional params
	if buf == nil || len(buf.arr)-buf.pos < maxSize {
		buf = traceFlush(traceBufPtrOf(buf), pid).ptr()
		(*bufp).set(buf)
	}

	ticks := uint64(cputicks()) / traceTickDiv
	tickDiff := ticks - buf.lastTicks
	buf.lastTicks = ticks
	narg := byte(len(args))
	if skip >= 0 {
		narg++
	}
	// We have only 2 bits for number of arguments.
	// If number is >= 3, then the event type is followed by event length in bytes.
	if narg > 3 {
		narg = 3
	}
	startPos := buf.pos
	buf.byte(ev | narg<<traceArgCountShift)
	var lenp *byte
	if narg == 3 {
		// Reserve the byte for length assuming that length < 128.
		buf.varint(0)
		lenp = &buf.arr[buf.pos-1]
	}
	buf.varint(tickDiff)
	for _, a := range args {
		buf.varint(a)
	}
	if skip == 0 {
		buf.varint(0)
	} else if skip > 0 {
		buf.varint(traceStackID(mp, buf.stk[:], skip))
	}
	evSize := buf.pos - startPos
	if evSize > maxSize {
		throw("invalid length of trace event")
	}
	if lenp != nil {
		// Fill in actual length.
		*lenp = byte(evSize - 2)
	}
	traceReleaseBuffer(pid)
}

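// traceStackID records a stack trace (skipping skip frames) into buf,
// registers it in trace.stackTab, and returns its unique id.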
func traceStackID(mp *m, buf []location, skip int) uint64 {
	_g_ := getg()
	gp := mp.curg
	var nstk int
	if gp == _g_ {
		nstk = callers(skip+1, buf[:])
	} else if gp != nil {
		// FIXME: get stack trace of different goroutine.
	}
	if nstk > 0 {
		nstk-- // skip runtime.goexit
	}
	if nstk > 0 && gp.goid == 1 {
		nstk-- // skip runtime.main
	}
	id := trace.stackTab.put(buf[:nstk])
	return uint64(id)
}

// traceAcquireBuffer returns trace buffer to use and, if necessary, locks it.
func traceAcquireBuffer() (mp *m, pid int32, bufp *traceBufPtr) {
	mp = acquirem()
	if p := mp.p.ptr(); p != nil {
		return mp, p.id, &p.tracebuf
	}
	lock(&trace.bufLock)
	return mp, traceGlobProc, &trace.buf
}

// traceReleaseBuffer releases a buffer previously acquired with traceAcquireBuffer.
func traceReleaseBuffer(pid int32) {
	if pid == traceGlobProc {
		unlock(&trace.bufLock)
	}
	releasem(getg().m)
}

// traceFlush puts buf onto stack of full buffers and returns an empty buffer.
func traceFlush(buf traceBufPtr, pid int32) traceBufPtr {
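	// The trace reader (ReadTrace) can reach this point while already holding
	// trace.lock, in which case trace.lockOwner is set to it; avoid re-locking
	// in that case to prevent a self-deadlock.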
	owner := trace.lockOwner
	dolock := owner == nil || owner != getg().m.curg
	if dolock {
		lock(&trace.lock)
	}
	if buf != 0 {
		traceFullQueue(buf)
	}
	if trace.empty != 0 {
		buf = trace.empty
		trace.empty = buf.ptr().link
	} else {
		buf = traceBufPtr(sysAlloc(unsafe.Sizeof(traceBuf{}), &memstats.other_sys))
		if buf == 0 {
			throw("trace: out of memory")
		}
	}
	bufp := buf.ptr()
	bufp.link.set(nil)
	bufp.pos = 0

	// initialize the buffer for a new batch
	ticks := uint64(cputicks()) / traceTickDiv
	bufp.lastTicks = ticks
	bufp.byte(traceEvBatch | 1<<traceArgCountShift)
	bufp.varint(uint64(pid))
	bufp.varint(ticks)

	if dolock {
		unlock(&trace.lock)
	}
	return buf
}

// traceString adds a string to trace.strings and returns its id.
func traceString(bufp *traceBufPtr, pid int32, s string) (uint64, *traceBufPtr) {
	if s == "" {
		return 0, bufp
	}
	if id, ok := trace.strings[s]; ok {
		return id, bufp
	}

	trace.stringSeq++
	id := trace.stringSeq
	trace.strings[s] = id

	// The memory allocation above may trigger tracing and cause *bufp to change.
	// The following code works with *bufp, so there must be no memory allocation
	// or any other activity that can trigger tracing after this point.

	buf := (*bufp).ptr()
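	// Worst case: one event-type byte plus two varints (id and length),
	// then the string bytes themselves.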
	size := 1 + 2*traceBytesPerNumber + len(s)
	if buf == nil || len(buf.arr)-buf.pos < size {
		buf = traceFlush(traceBufPtrOf(buf), pid).ptr()
		(*bufp).set(buf)
	}
	buf.byte(traceEvString)
	buf.varint(id)
	buf.varint(uint64(len(s)))
	buf.pos += copy(buf.arr[buf.pos:], s)

	(*bufp).set(buf)
	return id, bufp
}

// traceAppend appends v to buf in little-endian-base-128 encoding.
func traceAppend(buf []byte, v uint64) []byte {
	for ; v >= 0x80; v >>= 7 {
		buf = append(buf, 0x80|byte(v))
	}
	buf = append(buf, byte(v))
	return buf
}

// varint appends v to buf in little-endian-base-128 encoding.
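// For example, 300 (0b1_0010_1100) is written as the two bytes 0xac 0x02:
// the low seven bits with the continuation bit set, then the remaining bits.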
func (buf *traceBuf) varint(v uint64) {
	pos := buf.pos
	for ; v >= 0x80; v >>= 7 {
		buf.arr[pos] = 0x80 | byte(v)
		pos++
	}
	buf.arr[pos] = byte(v)
	pos++
	buf.pos = pos
}

// byte appends v to buf.
func (buf *traceBuf) byte(v byte) {
	buf.arr[buf.pos] = v
	buf.pos++
}

// traceStackTable maps stack traces (arrays of PC's) to unique uint32 ids.
// It is lock-free for reading.
type traceStackTable struct {
	lock mutex
	seq  uint32
	mem  traceAlloc
	tab  [1 << 13]traceStackPtr
}

// traceStack is a single stack in traceStackTable.
type traceStack struct {
	link traceStackPtr
	hash uintptr
	id   uint32
	n    int
	stk  [0]location // real type [n]location
}

type traceStackPtr uintptr

func (tp traceStackPtr) ptr() *traceStack { return (*traceStack)(unsafe.Pointer(tp)) }

// stack returns the slice of locations that make up the stack.
func (ts *traceStack) stack() []location {
	return (*[traceStackSize]location)(unsafe.Pointer(&ts.stk))[:ts.n]
}

// put returns a unique id for the stack trace pcs and caches it in the table,
// if it sees the trace for the first time.
func (tab *traceStackTable) put(pcs []location) uint32 {
	if len(pcs) == 0 {
		return 0
	}
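	// Mix the PCs with a Jenkins-style one-at-a-time hash.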
	var hash uintptr
	for _, loc := range pcs {
		hash += loc.pc
		hash += hash << 10
		hash ^= hash >> 6
	}
	// First, search the hashtable w/o the mutex.
	if id := tab.find(pcs, hash); id != 0 {
		return id
	}
	// Now, double check under the mutex.
	lock(&tab.lock)
	if id := tab.find(pcs, hash); id != 0 {
		unlock(&tab.lock)
		return id
	}
	// Create new record.
	tab.seq++
	stk := tab.newStack(len(pcs))
	stk.hash = hash
	stk.id = tab.seq
	stk.n = len(pcs)
	stkpc := stk.stack()
	for i, pc := range pcs {
		stkpc[i] = pc
	}
	part := int(hash % uintptr(len(tab.tab)))
	stk.link = tab.tab[part]
	atomicstorep(unsafe.Pointer(&tab.tab[part]), unsafe.Pointer(stk))
	unlock(&tab.lock)
	return stk.id
}

// find checks if the stack trace pcs is already present in the table.
func (tab *traceStackTable) find(pcs []location, hash uintptr) uint32 {
	part := int(hash % uintptr(len(tab.tab)))
Search:
	for stk := tab.tab[part].ptr(); stk != nil; stk = stk.link.ptr() {
		if stk.hash == hash && stk.n == len(pcs) {
			for i, stkpc := range stk.stack() {
				if stkpc != pcs[i] {
					continue Search
				}
			}
			return stk.id
		}
	}
	return 0
}

// newStack allocates a new stack of size n.
func (tab *traceStackTable) newStack(n int) *traceStack {
	return (*traceStack)(tab.mem.alloc(unsafe.Sizeof(traceStack{}) + uintptr(n)*unsafe.Sizeof(location{})))
}

// dump writes all previously cached stacks to trace buffers,
// releases all memory and resets state.
func (tab *traceStackTable) dump() {
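	// tmp must be able to hold one fully encoded stack: the stack id and
	// frame count, plus four numbers (PC, func ID, file ID, line) per frame,
	// each taking at most traceBytesPerNumber bytes.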
	var tmp [(2 + 4*traceStackSize) * traceBytesPerNumber]byte
	bufp := traceFlush(0, 0)
	for _, stk := range tab.tab {
		stk := stk.ptr()
		for ; stk != nil; stk = stk.link.ptr() {
			tmpbuf := tmp[:0]
			tmpbuf = traceAppend(tmpbuf, uint64(stk.id))
			frames := stk.stack()
			tmpbuf = traceAppend(tmpbuf, uint64(len(frames)))
			for _, f := range frames {
				var frame traceFrame
				frame, bufp = traceFrameForPC(bufp, 0, f)
				tmpbuf = traceAppend(tmpbuf, uint64(f.pc))
				tmpbuf = traceAppend(tmpbuf, uint64(frame.funcID))
				tmpbuf = traceAppend(tmpbuf, uint64(frame.fileID))
				tmpbuf = traceAppend(tmpbuf, uint64(frame.line))
			}
			// Now copy to the buffer.
			size := 1 + traceBytesPerNumber + len(tmpbuf)
			if buf := bufp.ptr(); len(buf.arr)-buf.pos < size {
				bufp = traceFlush(bufp, 0)
			}
			buf := bufp.ptr()
			buf.byte(traceEvStack | 3<<traceArgCountShift)
			buf.varint(uint64(len(tmpbuf)))
			buf.pos += copy(buf.arr[buf.pos:], tmpbuf)
		}
	}

	lock(&trace.lock)
	traceFullQueue(bufp)
	unlock(&trace.lock)

	tab.mem.drop()
	*tab = traceStackTable{}
}

type traceFrame struct {
	funcID uint64
	fileID uint64
	line   uint64
}

// traceFrameForPC records the frame information.
// It may allocate memory.
func traceFrameForPC(buf traceBufPtr, pid int32, f location) (traceFrame, traceBufPtr) {
	bufp := &buf
	var frame traceFrame

	fn := f.function
	const maxLen = 1 << 10
	if len(fn) > maxLen {
		fn = fn[len(fn)-maxLen:]
	}
	frame.funcID, bufp = traceString(bufp, pid, fn)
	frame.line = uint64(f.lineno)
	file := f.filename
	if len(file) > maxLen {
		file = file[len(file)-maxLen:]
	}
	frame.fileID, bufp = traceString(bufp, pid, file)
	return frame, (*bufp)
}

// traceAlloc is a non-thread-safe region allocator.
// It holds a linked list of traceAllocBlock.
type traceAlloc struct {
	head traceAllocBlockPtr
	off  uintptr
}

// traceAllocBlock is a block in traceAlloc.
//
// traceAllocBlock is allocated from non-GC'd memory, so it must not
// contain heap pointers. Writes to pointers to traceAllocBlocks do
// not need write barriers.
//
//go:notinheap
type traceAllocBlock struct {
	next traceAllocBlockPtr
	data [64<<10 - sys.PtrSize]byte
}

// TODO: Since traceAllocBlock is now go:notinheap, this isn't necessary.
type traceAllocBlockPtr uintptr

func (p traceAllocBlockPtr) ptr() *traceAllocBlock   { return (*traceAllocBlock)(unsafe.Pointer(p)) }
func (p *traceAllocBlockPtr) set(x *traceAllocBlock) { *p = traceAllocBlockPtr(unsafe.Pointer(x)) }

// alloc allocates an n-byte block.
func (a *traceAlloc) alloc(n uintptr) unsafe.Pointer {
	n = round(n, sys.PtrSize)
	if a.head == 0 || a.off+n > uintptr(len(a.head.ptr().data)) {
		if n > uintptr(len(a.head.ptr().data)) {
			throw("trace: alloc too large")
		}
		// This is only safe because the strings returned by callers
		// are stored in a location that is not in the Go heap.
		block := (*traceAllocBlock)(sysAlloc(unsafe.Sizeof(traceAllocBlock{}), &memstats.other_sys))
		if block == nil {
			throw("trace: out of memory")
		}
		block.next.set(a.head.ptr())
		a.head.set(block)
		a.off = 0
	}
	p := &a.head.ptr().data[a.off]
	a.off += n
	return unsafe.Pointer(p)
}

// drop frees all previously allocated memory and resets the allocator.
func (a *traceAlloc) drop() {
	for a.head != 0 {
		block := a.head.ptr()
		a.head.set(block.next.ptr())
		sysFree(unsafe.Pointer(block), unsafe.Sizeof(traceAllocBlock{}), &memstats.other_sys)
	}
}

// The following functions write specific events to trace.

func traceGomaxprocs(procs int32) {
	traceEvent(traceEvGomaxprocs, 1, uint64(procs))
}

func traceProcStart() {
	traceEvent(traceEvProcStart, -1, uint64(getg().m.id))
}

func traceProcStop(pp *p) {
	// Sysmon and stopTheWorld can stop Ps blocked in syscalls;
	// to handle this we temporarily employ the P.
	mp := acquirem()
	oldp := mp.p
	mp.p.set(pp)
	traceEvent(traceEvProcStop, -1)
	mp.p = oldp
	releasem(mp)
}

func traceGCStart() {
	traceEvent(traceEvGCStart, 3, trace.seqGC)
	trace.seqGC++
}

func traceGCDone() {
	traceEvent(traceEvGCDone, -1)
}

func traceGCSTWStart(kind int) {
	traceEvent(traceEvGCSTWStart, -1, uint64(kind))
}

func traceGCSTWDone() {
	traceEvent(traceEvGCSTWDone, -1)
}

// traceGCSweepStart prepares to trace a sweep loop. This does not
// emit any events until traceGCSweepSpan is called.
//
// traceGCSweepStart must be paired with traceGCSweepDone and there
// must be no preemption points between these two calls.
func traceGCSweepStart() {
	// Delay the actual GCSweepStart event until the first span
	// sweep. If we don't sweep anything, don't emit any events.
	_p_ := getg().m.p.ptr()
	if _p_.traceSweep {
		throw("double traceGCSweepStart")
	}
	_p_.traceSweep, _p_.traceSwept, _p_.traceReclaimed = true, 0, 0
}

// traceGCSweepSpan traces the sweep of a single page.
//
// This may be called outside a traceGCSweepStart/traceGCSweepDone
// pair; however, it will not emit any trace events in this case.
func traceGCSweepSpan(bytesSwept uintptr) {
	_p_ := getg().m.p.ptr()
	if _p_.traceSweep {
		if _p_.traceSwept == 0 {
			traceEvent(traceEvGCSweepStart, 1)
		}
		_p_.traceSwept += bytesSwept
	}
}

func traceGCSweepDone() {
	_p_ := getg().m.p.ptr()
	if !_p_.traceSweep {
		throw("missing traceGCSweepStart")
	}
	if _p_.traceSwept != 0 {
		traceEvent(traceEvGCSweepDone, -1, uint64(_p_.traceSwept), uint64(_p_.traceReclaimed))
	}
	_p_.traceSweep = false
}

func traceGCMarkAssistStart() {
	traceEvent(traceEvGCMarkAssistStart, 1)
}

func traceGCMarkAssistDone() {
	traceEvent(traceEvGCMarkAssistDone, -1)
}

func traceGoCreate(newg *g, pc uintptr) {
	newg.traceseq = 0
	newg.tracelastp = getg().m.p
	// +PCQuantum because traceFrameForPC expects return PCs and subtracts PCQuantum.
	id := trace.stackTab.put([]location{location{pc: pc + sys.PCQuantum}})
	traceEvent(traceEvGoCreate, 2, uint64(newg.goid), uint64(id))
}

func traceGoStart() {
	_g_ := getg().m.curg
	_p_ := _g_.m.p
	_g_.traceseq++
	if _g_ == _p_.ptr().gcBgMarkWorker.ptr() {
		traceEvent(traceEvGoStartLabel, -1, uint64(_g_.goid), _g_.traceseq, trace.markWorkerLabels[_p_.ptr().gcMarkWorkerMode])
	} else if _g_.tracelastp == _p_ {
		traceEvent(traceEvGoStartLocal, -1, uint64(_g_.goid))
	} else {
		_g_.tracelastp = _p_
		traceEvent(traceEvGoStart, -1, uint64(_g_.goid), _g_.traceseq)
	}
}

func traceGoEnd() {
	traceEvent(traceEvGoEnd, -1)
}

func traceGoSched() {
	_g_ := getg()
	_g_.tracelastp = _g_.m.p
	traceEvent(traceEvGoSched, 1)
}

func traceGoPreempt() {
	_g_ := getg()
	_g_.tracelastp = _g_.m.p
	traceEvent(traceEvGoPreempt, 1)
}

func traceGoPark(traceEv byte, skip int) {
	if traceEv&traceFutileWakeup != 0 {
		traceEvent(traceEvFutileWakeup, -1)
	}
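	// Strip the futile-wakeup flag bit so the real park event type is emitted.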
	traceEvent(traceEv & ^traceFutileWakeup, skip)
}

func traceGoUnpark(gp *g, skip int) {
	_p_ := getg().m.p
	gp.traceseq++
	if gp.tracelastp == _p_ {
		traceEvent(traceEvGoUnblockLocal, skip, uint64(gp.goid))
	} else {
		gp.tracelastp = _p_
		traceEvent(traceEvGoUnblock, skip, uint64(gp.goid), gp.traceseq)
	}
}

func traceGoSysCall() {
	traceEvent(traceEvGoSysCall, 1)
}

func traceGoSysExit(ts int64) {
	if ts != 0 && ts < trace.ticksStart {
		// There is a race between the code that initializes sysexitticks
		// (in exitsyscall, which runs without a P, and therefore is not
		// stopped with the rest of the world) and the code that initializes
		// a new trace. The recorded sysexitticks must therefore be treated
		// as "best effort". If they are valid for this trace, then great,
		// use them for greater accuracy. But if they're not valid for this
		// trace, assume that the trace was started after the actual syscall
		// exit (but before we actually managed to start the goroutine,
		// aka right now), and assign a fresh time stamp to keep the log consistent.
		ts = 0
	}
	_g_ := getg().m.curg
	_g_.traceseq++
	_g_.tracelastp = _g_.m.p
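	// ts is in raw cputicks; divide by traceTickDiv so it matches the scale
	// used for all other event timestamps.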
	traceEvent(traceEvGoSysExit, -1, uint64(_g_.goid), _g_.traceseq, uint64(ts)/traceTickDiv)
}

func traceGoSysBlock(pp *p) {
	// Sysmon and stopTheWorld can declare syscalls running on remote Ps as blocked;
	// to handle this we temporarily employ the P.
	mp := acquirem()
	oldp := mp.p
	mp.p.set(pp)
	traceEvent(traceEvGoSysBlock, -1)
	mp.p = oldp
	releasem(mp)
}

func traceHeapAlloc() {
	traceEvent(traceEvHeapAlloc, -1, memstats.heap_live)
}

func traceNextGC() {
	if memstats.next_gc == ^uint64(0) {
		// Heap-based triggering is disabled.
		traceEvent(traceEvNextGC, -1, 0)
	} else {
		traceEvent(traceEvNextGC, -1, memstats.next_gc)
	}
}
