1 /* Copyright (C) 2010 Wildfire Games. 2 * 3 * Permission is hereby granted, free of charge, to any person obtaining 4 * a copy of this software and associated documentation files (the 5 * "Software"), to deal in the Software without restriction, including 6 * without limitation the rights to use, copy, modify, merge, publish, 7 * distribute, sublicense, and/or sell copies of the Software, and to 8 * permit persons to whom the Software is furnished to do so, subject to 9 * the following conditions: 10 * 11 * The above copyright notice and this permission notice shall be included 12 * in all copies or substantial portions of the Software. 13 * 14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 17 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 18 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 19 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 20 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 */ 22 23 #include "precompiled.h" 24 25 #if 0 26 27 // we need a means of measuring performance, since it is hard to predict and 28 // depends on many factors. to cover a wider range of configurations, this 29 // must also be possible on end-user systems lacking specialized developer 30 // tools. therefore, we must ship our own implementation; this complements 31 // Intel VTune et al. 32 // 33 // there are 3 approaches to the problem: 34 // - single-step analysis logs every executed instruction. very thorough, but 35 // intolerably slow (~1000x) and not suitable for performance measurement. 36 // - intrusive measuring tracks execution time of explicitly marked 37 // functions or 'zones'. more complex, requires adding code, and 38 // inaccurate when thread switches are frequent. 39 // - IP sampling records the current instruction pointer at regular 40 // intervals; slow sections of code will over time appear more often. 41 // not exact, but simple and low-overhead. 42 // 43 // we implement IP sampling due to its simplicity. an intrusive approach 44 // might also be added later to account for performance per-module 45 // (helps spot the culprit in case hotspots are called from multiple sites). 46 47 48 // on Windows, we retrieve the current IP with GetThreadContext. dox require 49 // this to happen from another thread, and for the target to be suspended 50 // (now enforced by XP SP2). this leads to all sorts of problems: 51 // - if the suspended thread was dispatching an exception in the kernel, 52 // register state may be a mix between the correct values and 53 // those captured from the exception. 54 // - if running on Win9x with real-mode drivers, interrupts may interfere 55 // with GetThreadContext. however, it's not supported anyway due to other 56 // deficiencies (e.g. lack of proper mmap support). 57 // - the suspended thread may be holding locks; we need to be extremely 58 // careful to avoid deadlock! many win api functions acquire locks in 59 // non-obvious ways. 60 61 static HANDLE prof_target_thread; 62 63 static pthread_t prof_thread; 64 65 // delay [ms] between samples. OS sleep timers usually provide only 66 // ms resolution. increasing interval reduces overhead and accuracy. 67 static const int PROFILE_INTERVAL_MS = 1; 68 69 70 static uintptr_t get_target_pc() 71 { 72 DWORD ret; 73 HANDLE hThread = prof_target_thread; // convenience 74 75 ret = SuspendThread(hThread); 76 if(ret == (DWORD)-1) 77 { 78 DEBUG_WARN_ERR(ERR::LOGIC); // get_target_pc: SuspendThread failed 79 return 0; 80 } 81 // note: we don't need to call more than once: this increments a DWORD 82 // 'suspend count'; target is guaranteed to be suspended unless 83 // the function failed. 84 85 ///////////////////////////////////////////// 86 87 // be VERY CAREFUL to avoid anything that may acquire a lock until 88 // after ResumeThread! this includes locks taken by the OS, 89 // e.g. malloc -> heap or GetProcAddress -> loader. 90 // reason is, if the target thread was holding a lock we try to 91 // acquire here, a classic deadlock results. 92 93 uintptr_t pc = 0; // => will return 0 if GetThreadContext fails 94 95 CONTEXT context; 96 context.ContextFlags = CONTEXT_CONTROL; 97 if(GetThreadContext(hThread, &context)) 98 pc = context.PC_; 99 100 ///////////////////////////////////////////// 101 102 ret = ResumeThread(hThread); 103 ENSURE(ret != 0); 104 // don't fail (we have a valid PC), but warn 105 106 return pc; 107 } 108 109 110 static pthread_t thread; 111 static sem_t exit_flag; 112 113 static void* prof_thread_func(void* UNUSED(data)) 114 { 115 debug_SetThreadName("eip_sampler"); 116 117 const long _1e6 = 1000000; 118 const long _1e9 = 1000000000; 119 120 for(;;) 121 { 122 // calculate absolute timeout for sem_timedwait 123 struct timespec abs_timeout; 124 clock_gettime(CLOCK_REALTIME, &abs_timeout); 125 abs_timeout.tv_nsec += PROFILE_INTERVAL_MS * _1e6; 126 // .. handle nanosecond wraparound (must not be > 1000m) 127 if(abs_timeout.tv_nsec >= _1e9) 128 { 129 abs_timeout.tv_nsec -= _1e9; 130 abs_timeout.tv_sec++; 131 } 132 133 errno = 0; 134 // if we acquire the semaphore, exit was requested. 135 if(sem_timedwait(&exit_flag, &abs_timeout) == 0) 136 break; 137 // actual error: warn 138 if(errno != ETIMEDOUT) 139 DEBUG_WARN_ERR(ERR::LOGIC); // wpcu prof_thread_func: sem_timedwait failed 140 141 uintptr_t pc = get_target_pc(); 142 UNUSED2(pc); 143 144 // ADD TO LIST 145 } 146 147 return 0; 148 } 149 150 151 152 // call from thread that is to be profiled 153 Status prof_start() 154 { 155 // we need a real HANDLE to the target thread for use with 156 // Suspend|ResumeThread and GetThreadContext. 157 // alternative: DuplicateHandle on the current thread pseudo-HANDLE. 158 // this way is a bit more obvious/simple. 159 const DWORD access = THREAD_GET_CONTEXT|THREAD_SUSPEND_RESUME; 160 HANDLE hThread = OpenThread(access, FALSE, GetCurrentThreadId()); 161 if(hThread == INVALID_HANDLE_VALUE) 162 WARN_RETURN(ERR::FAIL); 163 164 prof_target_thread = hThread; 165 166 sem_init(&exit_flag, 0, 0); 167 pthread_create(&thread, 0, prof_thread_func, 0); 168 return INFO::OK; 169 } 170 171 Status prof_shutdown() 172 { 173 WARN_IF_FALSE(CloseHandle(prof_target_thread)); 174 return INFO::OK; 175 } 176 177 178 179 /* 180 open question: how to store the EIP values returned? some background: 181 the mechanism above churns out an EIP value (may be in our process, but might 182 also be bogus); we need to store it somehow pending analysis. 183 184 when done with the current run, we'd want to resolve EIP -> function name, 185 source file etc. (rather slow, so don't do it at runtime). 186 187 so, how to store it in the meantime? 2 possibilities: 188 - simple array/vector of addresses (of course optimized to reduce allocs) 189 - fixed size array of 'bins' (range of addresses; may be as fine as 1 byte); 190 each bin has a counter which is incremented when the bin's corresponding 191 address has been hit. 192 193 it's a size tradeoff here; for simple runs of < 1 min (60,000 ms), #1 194 would use 240kb of mem. #2 requires sizeof_whole_program * bytes_per_counter 195 up front, and has problems measuring DLLs (we'd have to explicitly map 196 the DLL address range into a bin - ugh). however, if we ever want to 197 test for say an hour (improves accuracy of profiling due to larger sample size), 198 #1 would guzzle 15mb of memory. 199 200 hm, another idea would be to write out #1's list of addresses periodically. 201 to make sure the disk I/O doesn't come at a bad time, we could have the main 202 thread call into the profiler and request it write out at that time. 203 this would require extreme caution to avoid the deadlock problem, but looks 204 doable. 205 206 -------- [2] ---------- 207 208 realistic profiler runs will take up to an hour. 209 210 writing out to disk would work: could have main thread call back. 211 that and adding EIP to list would be atomic (locked). 212 BUT: large amount of data, that's bad (loading at 30mb/s => 500ms load time alone) 213 214 problem with enumerating all symbols at startup: how do we enum all DLLs? 215 216 hybrid idea: std::map of EIPs. we don't build the map at startup, 217 but add when first seen and subsequently increment counter stored there. 218 problem: uses more memory/slower access than list. 219 would have to make sure EIPs are reused. 220 to help that, could quantize down to 4 byte (or so) bins. 221 accessing debug information at runtime to determine function length is too slow. 222 223 maybe some weird data structure: one bucket controls say 256 bytes of code 224 bucket is found by stripping off lower 8 bits. then, store only 225 the hit count for that byte. where's the savings over normal count? 226 227 TODO: what if the thread is sleeping at the time we query EIP? 228 can't detect that - suspend count is only set by SuspendThread 229 do we want to report that point (it's good to know), or try to access other threads? 230 231 TODO split off target thread / get PC into sysdep; profiler thread is portable! 232 233 234 at exit: resolve list to hotspots 235 probably hard; a start would be just the function in which the address is, then hit count 236 237 238 ========================================== 239 240 241 */ 242 #endif 243