1 #include "HalideRuntime.h"
2 #include "printer.h"
3 #include "scoped_mutex_lock.h"
4
5 // Note: The profiler thread may out-live any valid user_context, or
6 // be used across many different user_contexts, so nothing it calls
7 // can depend on the user context.
8
9 extern "C" {
10 // Returns the address of the global halide_profiler state
halide_profiler_get_state()11 WEAK halide_profiler_state *halide_profiler_get_state() {
12 static halide_profiler_state s = {{{0}}, 1, 0, 0, 0, 0, NULL, NULL};
13 return &s;
14 }
15 }
16
17 namespace Halide {
18 namespace Runtime {
19 namespace Internal {
20
find_or_create_pipeline(const char * pipeline_name,int num_funcs,const uint64_t * func_names)21 WEAK halide_profiler_pipeline_stats *find_or_create_pipeline(const char *pipeline_name, int num_funcs, const uint64_t *func_names) {
22 halide_profiler_state *s = halide_profiler_get_state();
23
24 for (halide_profiler_pipeline_stats *p = s->pipelines; p;
25 p = (halide_profiler_pipeline_stats *)(p->next)) {
26 // The same pipeline will deliver the same global constant
27 // string, so they can be compared by pointer.
28 if (p->name == pipeline_name &&
29 p->num_funcs == num_funcs) {
30 return p;
31 }
32 }
33 // Create a new pipeline stats entry.
34 halide_profiler_pipeline_stats *p =
35 (halide_profiler_pipeline_stats *)malloc(sizeof(halide_profiler_pipeline_stats));
36 if (!p) return NULL;
37 p->next = s->pipelines;
38 p->name = pipeline_name;
39 p->first_func_id = s->first_free_id;
40 p->num_funcs = num_funcs;
41 p->runs = 0;
42 p->time = 0;
43 p->samples = 0;
44 p->memory_current = 0;
45 p->memory_peak = 0;
46 p->memory_total = 0;
47 p->num_allocs = 0;
48 p->active_threads_numerator = 0;
49 p->active_threads_denominator = 0;
50 p->funcs = (halide_profiler_func_stats *)malloc(num_funcs * sizeof(halide_profiler_func_stats));
51 if (!p->funcs) {
52 free(p);
53 return NULL;
54 }
55 for (int i = 0; i < num_funcs; i++) {
56 p->funcs[i].time = 0;
57 p->funcs[i].name = (const char *)(func_names[i]);
58 p->funcs[i].memory_current = 0;
59 p->funcs[i].memory_peak = 0;
60 p->funcs[i].memory_total = 0;
61 p->funcs[i].num_allocs = 0;
62 p->funcs[i].stack_peak = 0;
63 p->funcs[i].active_threads_numerator = 0;
64 p->funcs[i].active_threads_denominator = 0;
65 }
66 s->first_free_id += num_funcs;
67 s->pipelines = p;
68 return p;
69 }
70
bill_func(halide_profiler_state * s,int func_id,uint64_t time,int active_threads)71 WEAK void bill_func(halide_profiler_state *s, int func_id, uint64_t time, int active_threads) {
72 halide_profiler_pipeline_stats *p_prev = NULL;
73 for (halide_profiler_pipeline_stats *p = s->pipelines; p;
74 p = (halide_profiler_pipeline_stats *)(p->next)) {
75 if (func_id >= p->first_func_id && func_id < p->first_func_id + p->num_funcs) {
76 if (p_prev) {
77 // Bubble the pipeline to the top to speed up future queries.
78 p_prev->next = (halide_profiler_pipeline_stats *)(p->next);
79 p->next = s->pipelines;
80 s->pipelines = p;
81 }
82 halide_profiler_func_stats *f = p->funcs + func_id - p->first_func_id;
83 f->time += time;
84 f->active_threads_numerator += active_threads;
85 f->active_threads_denominator += 1;
86 p->time += time;
87 p->samples++;
88 p->active_threads_numerator += active_threads;
89 p->active_threads_denominator += 1;
90 return;
91 }
92 p_prev = p;
93 }
94 // Someone must have called reset_state while a kernel was running. Do nothing.
95 }
96
sampling_profiler_thread(void *)97 WEAK void sampling_profiler_thread(void *) {
98 halide_profiler_state *s = halide_profiler_get_state();
99
100 // grab the lock
101 halide_mutex_lock(&s->lock);
102
103 while (s->current_func != halide_profiler_please_stop) {
104
105 uint64_t t1 = halide_current_time_ns(NULL);
106 uint64_t t = t1;
107 while (1) {
108 int func, active_threads;
109 if (s->get_remote_profiler_state) {
110 // Execution has disappeared into remote code running
111 // on an accelerator (e.g. Hexagon DSP)
112 s->get_remote_profiler_state(&func, &active_threads);
113 } else {
114 func = s->current_func;
115 active_threads = s->active_threads;
116 }
117 uint64_t t_now = halide_current_time_ns(NULL);
118 if (func == halide_profiler_please_stop) {
119 break;
120 } else if (func >= 0) {
121 // Assume all time since I was last awake is due to
122 // the currently running func.
123 bill_func(s, func, t_now - t, active_threads);
124 }
125 t = t_now;
126
127 // Release the lock, sleep, reacquire.
128 int sleep_ms = s->sleep_time;
129 halide_mutex_unlock(&s->lock);
130 halide_sleep_ms(NULL, sleep_ms);
131 halide_mutex_lock(&s->lock);
132 }
133 }
134
135 halide_mutex_unlock(&s->lock);
136 }
137
138 } // namespace Internal
139 } // namespace Runtime
140 } // namespace Halide
141
namespace {

// Atomically raise *ptr to val if val is greater, using a
// compare-and-swap retry loop. Safe against concurrent writers: each
// failed CAS re-reads the current value and re-checks the comparison.
template<typename T>
void sync_compare_max_and_swap(T *ptr, T val) {
    T observed = *ptr;
    while (val > observed) {
        T expected = observed;
        observed = __sync_val_compare_and_swap(ptr, expected, val);
        if (observed == expected) {
            // Our value was installed; done.
            return;
        }
        // Another thread changed *ptr; loop and compare again.
    }
}

}  // namespace
157
158 extern "C" {
159 // Returns the address of the pipeline state associated with pipeline_name.
halide_profiler_get_pipeline_state(const char * pipeline_name)160 WEAK halide_profiler_pipeline_stats *halide_profiler_get_pipeline_state(const char *pipeline_name) {
161 halide_profiler_state *s = halide_profiler_get_state();
162
163 ScopedMutexLock lock(&s->lock);
164
165 for (halide_profiler_pipeline_stats *p = s->pipelines; p;
166 p = (halide_profiler_pipeline_stats *)(p->next)) {
167 // The same pipeline will deliver the same global constant
168 // string, so they can be compared by pointer.
169 if (p->name == pipeline_name) {
170 return p;
171 }
172 }
173 return NULL;
174 }
175
176 // Returns a token identifying this pipeline instance.
halide_profiler_pipeline_start(void * user_context,const char * pipeline_name,int num_funcs,const uint64_t * func_names)177 WEAK int halide_profiler_pipeline_start(void *user_context,
178 const char *pipeline_name,
179 int num_funcs,
180 const uint64_t *func_names) {
181 halide_profiler_state *s = halide_profiler_get_state();
182
183 ScopedMutexLock lock(&s->lock);
184
185 if (!s->sampling_thread) {
186 halide_start_clock(user_context);
187 s->sampling_thread = halide_spawn_thread(sampling_profiler_thread, NULL);
188 }
189
190 halide_profiler_pipeline_stats *p =
191 find_or_create_pipeline(pipeline_name, num_funcs, func_names);
192 if (!p) {
193 // Allocating space to track the statistics failed.
194 return halide_error_out_of_memory(user_context);
195 }
196 p->runs++;
197
198 return p->first_func_id;
199 }
200
halide_profiler_stack_peak_update(void * user_context,void * pipeline_state,uint64_t * f_values)201 WEAK void halide_profiler_stack_peak_update(void *user_context,
202 void *pipeline_state,
203 uint64_t *f_values) {
204 halide_profiler_pipeline_stats *p_stats = (halide_profiler_pipeline_stats *)pipeline_state;
205 halide_assert(user_context, p_stats != NULL);
206
207 // Note: Update to the counter is done without grabbing the state's lock to
208 // reduce lock contention. One potential issue is that other call that frees the
209 // pipeline and function stats structs may be running in parallel. However, the
210 // current desctructor (called on profiler shutdown) does not free the structs
211 // unless user specifically calls halide_profiler_reset().
212
213 // Update per-func memory stats
214 for (int i = 0; i < p_stats->num_funcs; ++i) {
215 if (f_values[i] != 0) {
216 sync_compare_max_and_swap(&(p_stats->funcs[i]).stack_peak, f_values[i]);
217 }
218 }
219 }
220
halide_profiler_memory_allocate(void * user_context,void * pipeline_state,int func_id,uint64_t incr)221 WEAK void halide_profiler_memory_allocate(void *user_context,
222 void *pipeline_state,
223 int func_id,
224 uint64_t incr) {
225 // It's possible to have 'incr' equal to zero if the allocation is not
226 // executed conditionally.
227 if (incr == 0) {
228 return;
229 }
230
231 halide_profiler_pipeline_stats *p_stats = (halide_profiler_pipeline_stats *)pipeline_state;
232 halide_assert(user_context, p_stats != NULL);
233 halide_assert(user_context, func_id >= 0);
234 halide_assert(user_context, func_id < p_stats->num_funcs);
235
236 halide_profiler_func_stats *f_stats = &p_stats->funcs[func_id];
237
238 // Note: Update to the counter is done without grabbing the state's lock to
239 // reduce lock contention. One potential issue is that other call that frees the
240 // pipeline and function stats structs may be running in parallel. However, the
241 // current desctructor (called on profiler shutdown) does not free the structs
242 // unless user specifically calls halide_profiler_reset().
243
244 // Update per-pipeline memory stats
245 __sync_add_and_fetch(&p_stats->num_allocs, 1);
246 __sync_add_and_fetch(&p_stats->memory_total, incr);
247 uint64_t p_mem_current = __sync_add_and_fetch(&p_stats->memory_current, incr);
248 sync_compare_max_and_swap(&p_stats->memory_peak, p_mem_current);
249
250 // Update per-func memory stats
251 __sync_add_and_fetch(&f_stats->num_allocs, 1);
252 __sync_add_and_fetch(&f_stats->memory_total, incr);
253 uint64_t f_mem_current = __sync_add_and_fetch(&f_stats->memory_current, incr);
254 sync_compare_max_and_swap(&f_stats->memory_peak, f_mem_current);
255 }
256
halide_profiler_memory_free(void * user_context,void * pipeline_state,int func_id,uint64_t decr)257 WEAK void halide_profiler_memory_free(void *user_context,
258 void *pipeline_state,
259 int func_id,
260 uint64_t decr) {
261 // It's possible to have 'decr' equal to zero if the allocation is not
262 // executed conditionally.
263 if (decr == 0) {
264 return;
265 }
266
267 halide_profiler_pipeline_stats *p_stats = (halide_profiler_pipeline_stats *)pipeline_state;
268 halide_assert(user_context, p_stats != NULL);
269 halide_assert(user_context, func_id >= 0);
270 halide_assert(user_context, func_id < p_stats->num_funcs);
271
272 halide_profiler_func_stats *f_stats = &p_stats->funcs[func_id];
273
274 // Note: Update to the counter is done without grabbing the state's lock to
275 // reduce lock contention. One potential issue is that other call that frees the
276 // pipeline and function stats structs may be running in parallel. However, the
277 // current destructor (called on profiler shutdown) does not free the structs
278 // unless user specifically calls halide_profiler_reset().
279
280 // Update per-pipeline memory stats
281 __sync_sub_and_fetch(&p_stats->memory_current, decr);
282
283 // Update per-func memory stats
284 __sync_sub_and_fetch(&f_stats->memory_current, decr);
285 }
286
halide_profiler_report_unlocked(void * user_context,halide_profiler_state * s)287 WEAK void halide_profiler_report_unlocked(void *user_context, halide_profiler_state *s) {
288
289 char line_buf[1024];
290 Printer<StringStreamPrinter, sizeof(line_buf)> sstr(user_context, line_buf);
291
292 for (halide_profiler_pipeline_stats *p = s->pipelines; p;
293 p = (halide_profiler_pipeline_stats *)(p->next)) {
294 float t = p->time / 1000000.0f;
295 if (!p->runs) continue;
296 sstr.clear();
297 int alloc_avg = 0;
298 if (p->num_allocs != 0) {
299 alloc_avg = p->memory_total / p->num_allocs;
300 }
301 bool serial = p->active_threads_numerator == p->active_threads_denominator;
302 float threads = p->active_threads_numerator / (p->active_threads_denominator + 1e-10);
303 sstr << p->name << "\n"
304 << " total time: " << t << " ms"
305 << " samples: " << p->samples
306 << " runs: " << p->runs
307 << " time/run: " << t / p->runs << " ms\n";
308 if (!serial) {
309 sstr << " average threads used: " << threads << "\n";
310 }
311 sstr << " heap allocations: " << p->num_allocs
312 << " peak heap usage: " << p->memory_peak << " bytes\n";
313 halide_print(user_context, sstr.str());
314
315 bool print_f_states = p->time || p->memory_total;
316 if (!print_f_states) {
317 for (int i = 0; i < p->num_funcs; i++) {
318 halide_profiler_func_stats *fs = p->funcs + i;
319 if (fs->stack_peak) {
320 print_f_states = true;
321 break;
322 }
323 }
324 }
325
326 if (print_f_states) {
327 for (int i = 0; i < p->num_funcs; i++) {
328 size_t cursor = 0;
329 sstr.clear();
330 halide_profiler_func_stats *fs = p->funcs + i;
331
332 // The first func is always a catch-all overhead
333 // slot. Only report overhead time if it's non-zero
334 if (i == 0 && fs->time == 0) continue;
335
336 sstr << " " << fs->name << ": ";
337 cursor += 25;
338 while (sstr.size() < cursor) {
339 sstr << " ";
340 }
341
342 float ft = fs->time / (p->runs * 1000000.0f);
343 sstr << ft;
344 // We don't need 6 sig. figs.
345 sstr.erase(3);
346 sstr << "ms";
347 cursor += 10;
348 while (sstr.size() < cursor) {
349 sstr << " ";
350 }
351
352 int percent = 0;
353 if (p->time != 0) {
354 percent = (100 * fs->time) / p->time;
355 }
356 sstr << "(" << percent << "%)";
357 cursor += 8;
358 while (sstr.size() < cursor) {
359 sstr << " ";
360 }
361
362 if (!serial) {
363 float threads = fs->active_threads_numerator / (fs->active_threads_denominator + 1e-10);
364 sstr << "threads: " << threads;
365 sstr.erase(3);
366 cursor += 15;
367 while (sstr.size() < cursor) {
368 sstr << " ";
369 }
370 }
371
372 int alloc_avg = 0;
373 if (fs->num_allocs != 0) {
374 alloc_avg = fs->memory_total / fs->num_allocs;
375 }
376
377 if (fs->memory_peak) {
378 cursor += 15;
379 sstr << " peak: " << fs->memory_peak;
380 while (sstr.size() < cursor) {
381 sstr << " ";
382 }
383 sstr << " num: " << fs->num_allocs;
384 cursor += 15;
385 while (sstr.size() < cursor) {
386 sstr << " ";
387 }
388 sstr << " avg: " << alloc_avg;
389 }
390 if (fs->stack_peak > 0) {
391 sstr << " stack: " << fs->stack_peak;
392 }
393 sstr << "\n";
394
395 halide_print(user_context, sstr.str());
396 }
397 }
398 }
399 }
400
halide_profiler_report(void * user_context)401 WEAK void halide_profiler_report(void *user_context) {
402 halide_profiler_state *s = halide_profiler_get_state();
403 ScopedMutexLock lock(&s->lock);
404 halide_profiler_report_unlocked(user_context, s);
405 }
406
halide_profiler_reset_unlocked(halide_profiler_state * s)407 WEAK void halide_profiler_reset_unlocked(halide_profiler_state *s) {
408 while (s->pipelines) {
409 halide_profiler_pipeline_stats *p = s->pipelines;
410 s->pipelines = (halide_profiler_pipeline_stats *)(p->next);
411 free(p->funcs);
412 free(p);
413 }
414 s->first_free_id = 0;
415 }
416
halide_profiler_reset()417 WEAK void halide_profiler_reset() {
418 // WARNING: Do not call this method while any other halide
419 // pipeline is running; halide_profiler_memory_allocate/free and
420 // halide_profiler_stack_peak_update update the profiler pipeline's
421 // state without grabbing the global profiler state's lock.
422 halide_profiler_state *s = halide_profiler_get_state();
423 ScopedMutexLock lock(&s->lock);
424 halide_profiler_reset_unlocked(s);
425 }
426
427 #ifndef WINDOWS
428 __attribute__((destructor))
429 #endif
430 WEAK void
halide_profiler_shutdown()431 halide_profiler_shutdown() {
432 halide_profiler_state *s = halide_profiler_get_state();
433 if (!s->sampling_thread) {
434 return;
435 }
436
437 s->current_func = halide_profiler_please_stop;
438 halide_join_thread(s->sampling_thread);
439 s->sampling_thread = NULL;
440 s->current_func = halide_profiler_outside_of_halide;
441
442 // Print results. No need to lock anything because we just shut
443 // down the thread.
444 halide_profiler_report_unlocked(NULL, s);
445
446 halide_profiler_reset_unlocked(s);
447 }
448
namespace {
#ifdef WINDOWS
// Windows-only shutdown path: report but do not tear anything down.
WEAK void halide_windows_profiler_shutdown() {
    halide_profiler_state *s = halide_profiler_get_state();
    if (!s->sampling_thread) {
        // Profiler never started; nothing to report.
        return;
    }

    // On Windows it is unsafe to do anything with threads or critical
    // sections in a static destructor, as it may run after the OS has
    // already killed the process's threads. Furthermore, many calls --
    // even EnterCriticalSection -- may be set to kill the process if
    // made during process shutdown. Hence this routine does not attempt
    // to clean up state the way the destructor does on other platforms.

    // Print results. Avoid locking as it will cause problems and
    // nothing should be running.
    halide_profiler_report_unlocked(NULL, s);
}
#endif
}  // namespace
470
halide_profiler_pipeline_end(void * user_context,void * state)471 WEAK void halide_profiler_pipeline_end(void *user_context, void *state) {
472 ((halide_profiler_state *)state)->current_func = halide_profiler_outside_of_halide;
473 }
474
475 } // extern "C"
476