1// Copyright 2011 The Go Authors.  All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// CPU profiling.
6// Based on algorithms and data structures used in
7// http://code.google.com/p/google-perftools/.
8//
9// The main difference between this code and the google-perftools
10// code is that this code is written to allow copying the profile data
11// to an arbitrary io.Writer, while the google-perftools code always
12// writes to an operating system file.
13//
14// The signal handler for the profiling clock tick adds a new stack trace
15// to a hash table tracking counts for recent traces.  Most clock ticks
16// hit in the cache.  In the event of a cache miss, an entry must be
17// evicted from the hash table, copied to a log that will eventually be
18// written as profile data.  The google-perftools code flushed the
19// log itself during the signal handler.  This code cannot do that, because
20// the io.Writer might block or need system calls or locks that are not
21// safe to use from within the signal handler.  Instead, we split the log
22// into two halves and let the signal handler fill one half while a goroutine
23// is writing out the other half.  When the signal handler fills its half, it
24// offers to swap with the goroutine.  If the writer is not done with its half,
25// we lose the stack trace for this clock tick (and record that loss).
26// The goroutine interacts with the signal handler by calling getprofile() to
27// get the next log piece to write, implicitly handing back the last log
28// piece it obtained.
29//
30// The state of this dance between the signal handler and the goroutine
31// is encoded in the Profile.handoff field.  If handoff == 0, then the goroutine
32// is not using either log half and is waiting (or will soon be waiting) for
33// a new piece by calling notesleep(&p->wait).  If the signal handler
34// changes handoff from 0 to non-zero, it must call notewakeup(&p->wait)
35// to wake the goroutine.  The value indicates the number of entries in the
36// log half being handed off.  The goroutine leaves the non-zero value in
37// place until it has finished processing the log half and then flips the number
38// back to zero.  Setting the high bit in handoff means that the profiling is over,
39// and the goroutine is now in charge of flushing the data left in the hash table
40// to the log and returning that data.
41//
42// The handoff field is manipulated using atomic operations.
43// For the most part, the manipulation of handoff is orderly: if handoff == 0
44// then the signal handler owns it and can change it to non-zero.
45// If handoff != 0 then the goroutine owns it and can change it to zero.
46// If that were the end of the story then we would not need to manipulate
47// handoff using atomic operations.  The operations are needed, however,
48// in order to let the log closer set the high bit to indicate "EOF" safely
49// in the situation when normally the goroutine "owns" handoff.
50
51package runtime
52#include "runtime.h"
53#include "arch.h"
54#include "malloc.h"
55
56#include "array.h"
57typedef struct __go_open_array Slice;
58#define array __values
59#define len __count
60#define cap __capacity
61
// Profiler sizing constants.
enum
{
	HashSize = 1<<10,	// number of buckets in the recent-trace hash table
	LogSize = 1<<17,	// total log capacity in uintptr words (split into two halves)
	Assoc = 4,		// entries per hash bucket (set associativity)
	MaxStack = 64,		// maximum number of PCs recorded per stack trace
};
69
typedef struct Profile Profile;
typedef struct Bucket Bucket;
typedef struct Entry Entry;

// Entry is one cached stack trace together with the number of
// clock ticks attributed to it since it was last evicted to the log.
struct Entry {
	uintptr count;			// ticks seen for this trace
	uintptr depth;			// number of valid entries in stack
	uintptr stack[MaxStack];	// program counters of the trace
};

// Bucket is one set-associative row of the trace hash table:
// up to Assoc distinct traces that hashed to the same slot.
struct Bucket {
	Entry entry[Assoc];
};
83
// Profile holds all state for an active CPU profile: the hash table of
// recent traces filled by the signal handler, and the double-buffered
// log drained by the writing goroutine via getprofile.  The handoff
// protocol between the two sides is described at the top of this file.
struct Profile {
	bool on;		// profiling is on
	Note wait;		// goroutine waits here
	uintptr count;		// tick count
	uintptr evicts;		// eviction count
	uintptr lost;		// lost ticks that need to be logged

	// Active recent stack traces.
	Bucket hash[HashSize];

	// Log of traces evicted from hash.
	// Signal handler has filled log[toggle][:nlog].
	// Goroutine is writing log[1-toggle][:handoff].
	uintptr log[2][LogSize/2];
	uintptr nlog;		// words used so far in log[toggle]
	int32 toggle;		// which log half the signal handler fills
	uint32 handoff;		// handoff word; high bit means profiling is over

	// Writer state.
	// Writer maintains its own toggle to avoid races
	// looking at signal handler's toggle.
	uint32 wtoggle;
	bool wholding;	// holding & need to release a log half
	bool flushing;	// flushing hash table - profile is over
	bool eod_sent;  // special end-of-data record sent; => flushing
};
110
static Lock lk;		// serializes SetCPUProfileRate (start/stop transitions)
static Profile *prof;	// the single global profile, lazily allocated

static void tick(uintptr*, int32);
static void add(Profile*, uintptr*, int32);
static bool evict(Profile*, Entry*);
static bool flushlog(Profile*);

// eod is the special end-of-data record appended once the hash table
// has been drained: count 0, depth 1, a single zero PC.
static uintptr eod[3] = {0, 1, 0};
120
// LostProfileData is a no-op function used in profiles
// to mark the number of profiling stack traces that were
// discarded due to slow data writers.
// Its address is written into the log as a fake PC (see flushlog),
// so profile viewers attribute the lost samples to this symbol.
static void
LostProfileData(void)
{
}
128
// Declaration binding the C implementation below to the Go-visible
// symbol runtime.SetCPUProfileRate via an asm name.
extern void runtime_SetCPUProfileRate(intgo)
     __asm__ (GOSYM_PREFIX "runtime.SetCPUProfileRate");
131
132// SetCPUProfileRate sets the CPU profiling rate.
133// The user documentation is in debug.go.
134void
135runtime_SetCPUProfileRate(intgo hz)
136{
137	uintptr *p;
138	uintptr n;
139
140	// Clamp hz to something reasonable.
141	if(hz < 0)
142		hz = 0;
143	if(hz > 1000000)
144		hz = 1000000;
145
146	runtime_lock(&lk);
147	if(hz > 0) {
148		if(prof == nil) {
149			prof = runtime_SysAlloc(sizeof *prof, &mstats.other_sys);
150			if(prof == nil) {
151				runtime_printf("runtime: cpu profiling cannot allocate memory\n");
152				runtime_unlock(&lk);
153				return;
154			}
155		}
156		if(prof->on || prof->handoff != 0) {
157			runtime_printf("runtime: cannot set cpu profile rate until previous profile has finished.\n");
158			runtime_unlock(&lk);
159			return;
160		}
161
162		prof->on = true;
163		p = prof->log[0];
164		// pprof binary header format.
165		// http://code.google.com/p/google-perftools/source/browse/trunk/src/profiledata.cc#117
166		*p++ = 0;  // count for header
167		*p++ = 3;  // depth for header
168		*p++ = 0;  // version number
169		*p++ = 1000000 / hz;  // period (microseconds)
170		*p++ = 0;
171		prof->nlog = p - prof->log[0];
172		prof->toggle = 0;
173		prof->wholding = false;
174		prof->wtoggle = 0;
175		prof->flushing = false;
176		prof->eod_sent = false;
177		runtime_noteclear(&prof->wait);
178
179		runtime_setcpuprofilerate(tick, hz);
180	} else if(prof != nil && prof->on) {
181		runtime_setcpuprofilerate(nil, 0);
182		prof->on = false;
183
184		// Now add is not running anymore, and getprofile owns the entire log.
185		// Set the high bit in prof->handoff to tell getprofile.
186		for(;;) {
187			n = prof->handoff;
188			if(n&0x80000000)
189				runtime_printf("runtime: setcpuprofile(off) twice");
190			if(runtime_cas(&prof->handoff, n, n|0x80000000))
191				break;
192		}
193		if(n == 0) {
194			// we did the transition from 0 -> nonzero so we wake getprofile
195			runtime_notewakeup(&prof->wait);
196		}
197	}
198	runtime_unlock(&lk);
199}
200
// tick is the callback installed via runtime_setcpuprofilerate.
// It runs in the profiling signal handler and records one sample
// (the stack trace pc[0:n]) into the global profile.
static void
tick(uintptr *pc, int32 n)
{
	add(prof, pc, n);
}
206
207// add adds the stack trace to the profile.
208// It is called from signal handlers and other limited environments
209// and cannot allocate memory or acquire locks that might be
210// held at the time of the signal, nor can it use substantial amounts
211// of stack.  It is allowed to call evict.
212static void
213add(Profile *p, uintptr *pc, int32 n)
214{
215	int32 i, j;
216	uintptr h, x;
217	Bucket *b;
218	Entry *e;
219
220	if(n > MaxStack)
221		n = MaxStack;
222
223	// Compute hash.
224	h = 0;
225	for(i=0; i<n; i++) {
226		h = h<<8 | (h>>(8*(sizeof(h)-1)));
227		x = pc[i];
228		h += x*31 + x*7 + x*3;
229	}
230	p->count++;
231
232	// Add to entry count if already present in table.
233	b = &p->hash[h%HashSize];
234	for(i=0; i<Assoc; i++) {
235		e = &b->entry[i];
236		if(e->depth != (uintptr)n)
237			continue;
238		for(j=0; j<n; j++)
239			if(e->stack[j] != pc[j])
240				goto ContinueAssoc;
241		e->count++;
242		return;
243	ContinueAssoc:;
244	}
245
246	// Evict entry with smallest count.
247	e = &b->entry[0];
248	for(i=1; i<Assoc; i++)
249		if(b->entry[i].count < e->count)
250			e = &b->entry[i];
251	if(e->count > 0) {
252		if(!evict(p, e)) {
253			// Could not evict entry.  Record lost stack.
254			p->lost++;
255			return;
256		}
257		p->evicts++;
258	}
259
260	// Reuse the newly evicted entry.
261	e->depth = n;
262	e->count = 1;
263	for(i=0; i<n; i++)
264		e->stack[i] = pc[i];
265}
266
267// evict copies the given entry's data into the log, so that
268// the entry can be reused.  evict is called from add, which
269// is called from the profiling signal handler, so it must not
270// allocate memory or block.  It is safe to call flushLog.
271// evict returns true if the entry was copied to the log,
272// false if there was no room available.
273static bool
274evict(Profile *p, Entry *e)
275{
276	int32 i, d, nslot;
277	uintptr *log, *q;
278
279	d = e->depth;
280	nslot = d+2;
281	log = p->log[p->toggle];
282	if(p->nlog+nslot > nelem(p->log[0])) {
283		if(!flushlog(p))
284			return false;
285		log = p->log[p->toggle];
286	}
287
288	q = log+p->nlog;
289	*q++ = e->count;
290	*q++ = d;
291	for(i=0; i<d; i++)
292		*q++ = e->stack[i];
293	p->nlog = q - log;
294	e->count = 0;
295	return true;
296}
297
// flushlog tries to flush the current log and switch to the other one.
// flushlog is called from evict, called from add, called from the signal handler,
// so it cannot allocate memory or block.  It can try to swap logs with
// the writing goroutine, as explained in the comment at the top of this file.
static bool
flushlog(Profile *p)
{
	uintptr *log, *q;

	// Offer the filled half (p->nlog words) to the goroutine.
	// If handoff is already non-zero the goroutine still holds the
	// other half, so the swap fails and the caller drops the sample.
	if(!runtime_cas(&p->handoff, 0, p->nlog))
		return false;
	runtime_notewakeup(&p->wait);

	// Start filling the other half.  If any samples were lost while
	// both halves were busy, open the new half with a synthetic
	// record attributed to LostProfileData so the loss is visible.
	p->toggle = 1 - p->toggle;
	log = p->log[p->toggle];
	q = log;
	if(p->lost > 0) {
		*q++ = p->lost;
		*q++ = 1;
		*q++ = (uintptr)LostProfileData;
		p->lost = 0;
	}
	p->nlog = q - log;
	return true;
}
323
// getprofile blocks until the next block of profiling data is available
// and returns it as a []byte.  It is called from the writing goroutine.
// It implements the reader side of the handoff protocol described at
// the top of this file: release the previously returned half, wait for
// the signal handler to hand over the next one, and — once the high
// bit of handoff signals end of profiling — drain the hash table and
// emit the end-of-data record.
Slice
getprofile(Profile *p)
{
	uint32 i, j, n;
	Slice ret;
	Bucket *b;
	Entry *e;

	ret.array = nil;
	ret.len = 0;
	ret.cap = 0;

	if(p == nil)
		return ret;

	if(p->wholding) {
		// Release previous log to signal handling side.
		// Loop because we are racing against SetCPUProfileRate(0).
		for(;;) {
			n = p->handoff;
			if(n == 0) {
				runtime_printf("runtime: phase error during cpu profile handoff\n");
				return ret;
			}
			if(n & 0x80000000) {
				// Profiling was turned off while we held the half;
				// switch to flush mode instead of releasing it.
				p->wtoggle = 1 - p->wtoggle;
				p->wholding = false;
				p->flushing = true;
				goto flush;
			}
			if(runtime_cas(&p->handoff, n, 0))
				break;
		}
		p->wtoggle = 1 - p->wtoggle;
		p->wholding = false;
	}

	if(p->flushing)
		goto flush;

	// Profiling stopped and nothing pending: report end of stream.
	if(!p->on && p->handoff == 0)
		return ret;

	// Wait for new log.
	runtime_notetsleepg(&p->wait, -1);
	runtime_noteclear(&p->wait);

	n = p->handoff;
	if(n == 0) {
		runtime_printf("runtime: phase error during cpu profile wait\n");
		return ret;
	}
	if(n == 0x80000000) {
		// High bit alone: profiling is over and no data rides along.
		p->flushing = true;
		goto flush;
	}
	n &= ~0x80000000;

	// Return new log to caller.
	p->wholding = true;

	ret.array = (byte*)p->log[p->wtoggle];
	ret.len = n*sizeof(uintptr);
	ret.cap = ret.len;
	return ret;

flush:
	// In flush mode.
	// Add is no longer being called.  We own the log.
	// Also, p->handoff is non-zero, so flushlog will return false.
	// Evict the hash table into the log and return it.
	for(i=0; i<HashSize; i++) {
		b = &p->hash[i];
		for(j=0; j<Assoc; j++) {
			e = &b->entry[j];
			if(e->count > 0 && !evict(p, e)) {
				// Filled the log.  Stop the loop and return what we've got.
				goto breakflush;
			}
		}
	}
breakflush:

	// Return pending log data.
	if(p->nlog > 0) {
		// Note that we're using toggle now, not wtoggle,
		// because we're working on the log directly.
		ret.array = (byte*)p->log[p->toggle];
		ret.len = p->nlog*sizeof(uintptr);
		ret.cap = ret.len;
		p->nlog = 0;
		return ret;
	}

	// Made it through the table without finding anything to log.
	if(!p->eod_sent) {
		// We may not have space to append this to the partial log buf,
		// so we always return a new slice for the end-of-data marker.
		p->eod_sent = true;
		ret.array = (byte*)eod;
		ret.len = sizeof eod;
		ret.cap = ret.len;
		return ret;
	}

	// Finally done.  Clean up and return nil.
	p->flushing = false;
	if(!runtime_cas(&p->handoff, p->handoff, 0))
		runtime_printf("runtime: profile flush racing with something\n");
	return ret;  // set to nil at top of function
}
437
// CPUProfile returns the next cpu profile block as a []byte.
// The user documentation is in debug.go.
// NOTE(review): goc2c-style declaration — presumably translated into a
// C function exported to Go by the build; confirm against the build rules.
func CPUProfile() (ret Slice) {
	ret = getprofile(prof);
}
443