1 /* Copyright (C) 2010 Wildfire Games.
2  *
3  * Permission is hereby granted, free of charge, to any person obtaining
4  * a copy of this software and associated documentation files (the
5  * "Software"), to deal in the Software without restriction, including
6  * without limitation the rights to use, copy, modify, merge, publish,
7  * distribute, sublicense, and/or sell copies of the Software, and to
8  * permit persons to whom the Software is furnished to do so, subject to
9  * the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be included
12  * in all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
17  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
18  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
19  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
20  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21  */
22 
23 #include "precompiled.h"
24 
25 #if 0
26 
27 // we need a means of measuring performance, since it is hard to predict and
28 // depends on many factors. to cover a wider range of configurations, this
29 // must also be possible on end-user systems lacking specialized developer
30 // tools. therefore, we must ship our own implementation; this complements
31 // Intel VTune et al.
32 //
33 // there are 3 approaches to the problem:
34 // - single-step analysis logs every executed instruction. very thorough, but
35 //   intolerably slow (~1000x) and not suitable for performance measurement.
36 // - intrusive measuring tracks execution time of explicitly marked
37 //   functions or 'zones'. more complex, requires adding code, and
38 //   inaccurate when thread switches are frequent.
39 // - IP sampling records the current instruction pointer at regular
40 //   intervals; slow sections of code will over time appear more often.
41 //   not exact, but simple and low-overhead.
42 //
43 // we implement IP sampling due to its simplicity. an intrusive approach
44 // might also be added later to account for performance per-module
45 // (helps spot the culprit in case hotspots are called from multiple sites).
46 
47 
48 // on Windows, we retrieve the current IP with GetThreadContext. dox require
49 // this to happen from another thread, and for the target to be suspended
50 // (now enforced by XP SP2). this leads to all sorts of problems:
51 // - if the suspended thread was dispatching an exception in the kernel,
52 //   register state may be a mix between the correct values and
53 //   those captured from the exception.
54 // - if running on Win9x with real-mode drivers, interrupts may interfere
55 //   with GetThreadContext. however, it's not supported anyway due to other
56 //   deficiencies (e.g. lack of proper mmap support).
57 // - the suspended thread may be holding locks; we need to be extremely
58 //   careful to avoid deadlock! many win api functions acquire locks in
59 //   non-obvious ways.
60 
61 static HANDLE prof_target_thread;
62 
63 static pthread_t prof_thread;
64 
65 // delay [ms] between samples. OS sleep timers usually provide only
66 // ms resolution. increasing interval reduces overhead and accuracy.
67 static const int PROFILE_INTERVAL_MS = 1;
68 
69 
70 static uintptr_t get_target_pc()
71 {
72 	DWORD ret;
73 	HANDLE hThread = prof_target_thread;	// convenience
74 
75 	ret = SuspendThread(hThread);
76 	if(ret == (DWORD)-1)
77 	{
78 		DEBUG_WARN_ERR(ERR::LOGIC);	// get_target_pc: SuspendThread failed
79 		return 0;
80 	}
81 	// note: we don't need to call more than once: this increments a DWORD
82 	// 'suspend count'; target is guaranteed to be suspended unless
83 	// the function failed.
84 
85 	/////////////////////////////////////////////
86 
87 	// be VERY CAREFUL to avoid anything that may acquire a lock until
88 	// after ResumeThread! this includes locks taken by the OS,
89 	// e.g. malloc -> heap or GetProcAddress -> loader.
90 	// reason is, if the target thread was holding a lock we try to
91 	// acquire here, a classic deadlock results.
92 
93 	uintptr_t pc = 0;	// => will return 0 if GetThreadContext fails
94 
95 	CONTEXT context;
96 	context.ContextFlags = CONTEXT_CONTROL;
97 	if(GetThreadContext(hThread, &context))
98 		pc = context.PC_;
99 
100 	/////////////////////////////////////////////
101 
102 	ret = ResumeThread(hThread);
103 	ENSURE(ret != 0);
104 	// don't fail (we have a valid PC), but warn
105 
106 	return pc;
107 }
108 
109 
110 static pthread_t thread;
111 static sem_t exit_flag;
112 
113 static void* prof_thread_func(void* UNUSED(data))
114 {
115 	debug_SetThreadName("eip_sampler");
116 
117 	const long _1e6 = 1000000;
118 	const long _1e9 = 1000000000;
119 
120 	for(;;)
121 	{
122 		// calculate absolute timeout for sem_timedwait
123 		struct timespec abs_timeout;
124 		clock_gettime(CLOCK_REALTIME, &abs_timeout);
125 		abs_timeout.tv_nsec += PROFILE_INTERVAL_MS * _1e6;
126 		// .. handle nanosecond wraparound (must not be > 1000m)
127 		if(abs_timeout.tv_nsec >= _1e9)
128 		{
129 			abs_timeout.tv_nsec -= _1e9;
130 			abs_timeout.tv_sec++;
131 		}
132 
133 		errno = 0;
134 		// if we acquire the semaphore, exit was requested.
135 		if(sem_timedwait(&exit_flag, &abs_timeout) == 0)
136 			break;
137 		// actual error: warn
138 		if(errno != ETIMEDOUT)
139 			DEBUG_WARN_ERR(ERR::LOGIC);	// wpcu prof_thread_func: sem_timedwait failed
140 
141 		uintptr_t pc = get_target_pc();
142 		UNUSED2(pc);
143 
144 		// ADD TO LIST
145 	}
146 
147 	return 0;
148 }
149 
150 
151 
152 // call from thread that is to be profiled
153 Status prof_start()
154 {
155 	// we need a real HANDLE to the target thread for use with
156 	// Suspend|ResumeThread and GetThreadContext.
157 	// alternative: DuplicateHandle on the current thread pseudo-HANDLE.
158 	// this way is a bit more obvious/simple.
159 	const DWORD access = THREAD_GET_CONTEXT|THREAD_SUSPEND_RESUME;
160 	HANDLE hThread = OpenThread(access, FALSE, GetCurrentThreadId());
161 	if(hThread == INVALID_HANDLE_VALUE)
162 		WARN_RETURN(ERR::FAIL);
163 
164 	prof_target_thread = hThread;
165 
166 	sem_init(&exit_flag, 0, 0);
167 	pthread_create(&thread, 0, prof_thread_func, 0);
168 	return INFO::OK;
169 }
170 
171 Status prof_shutdown()
172 {
173 	WARN_IF_FALSE(CloseHandle(prof_target_thread));
174 	return INFO::OK;
175 }
176 
177 
178 
179 /*
180 open question: how to store the EIP values returned? some background:
181 the mechanism above churns out an EIP value (may be in our process, but might
182 also be bogus); we need to store it somehow pending analysis.
183 
184 when done with the current run, we'd want to resolve EIP -> function name,
185 source file etc. (rather slow, so don't do it at runtime).
186 
187 so, how to store it in the meantime? 2 possibilities:
188 - simple array/vector of addresses (of course optimized to reduce allocs)
189 - fixed size array of 'bins' (range of addresses; may be as fine as 1 byte);
190 each bin has a counter which is incremented when the bin's corresponding
191 address has been hit.
192 
193 it's a size tradeoff here; for simple runs of < 1 min (60,000 ms), #1
194 would use 240kb of mem. #2 requires sizeof_whole_program * bytes_per_counter
195 up front, and has problems measuring DLLs (we'd have to explicitly map
196 the DLL address range into a bin - ugh). however, if we ever want to
197 test for say an hour (improves accuracy of profiling due to larger sample size),
198 #1 would guzzle 15mb of memory.
199 
200 hm, another idea would be to write out #1's list of addresses periodically.
201 to make sure the disk I/O doesn't come at a bad time, we could have the main
202 thread call into the profiler and request it write out at that time.
203 this would require extreme caution to avoid the deadlock problem, but looks
204 doable.
205 
206 -------- [2] ----------
207 
208 realistic profiler runs will take up to an hour.
209 
210 writing out to disk would work: could have main thread call back.
211 that and adding EIP to list would be atomic (locked).
212 BUT: large amount of data, that's bad (loading at 30mb/s => 500ms load time alone)
213 
214 problem with enumerating all symbols at startup: how do we enum all DLLs?
215 
216 hybrid idea: std::map of EIPs. we don't build the map at startup,
217 but add when first seen and subsequently increment counter stored there.
218 problem: uses more memory/slower access than list.
219 would have to make sure EIPs are reused.
220 to help that, could quantize down to 4 byte (or so) bins.
221 accessing debug information at runtime to determine function length is too slow.
222 
223 maybe some weird data structure: one bucket controls say 256 bytes of code
224 bucket is found by stripping off lower 8 bits. then, store only
225 the hit count for that byte. where's the savings over normal count?
226 
227 TODO: what if the thread is sleeping at the time we query EIP?
228 can't detect that - suspend count is only set by SuspendThread
229 do we want to report that point (it's good to know), or try to access other threads?
230 
231 TODO split off target thread / get PC into sysdep; profiler thread is portable!
232 
233 
234 at exit: resolve list to hotspots
235 probably hard; a start would be just the function in which the address is, then hit count
236 
237 
238 ==========================================
239 
240 
241 */
242 #endif
243