#include "HalideRuntime.h"
#include "printer.h"
#include "scoped_mutex_lock.h"

// Note: The profiler thread may out-live any valid user_context, or
// be used across many different user_contexts, so nothing it calls
// can depend on the user context.

extern "C" {
// Returns the address of the global halide_profiler state
WEAK halide_profiler_state *halide_profiler_get_state() {
    static halide_profiler_state s = {{{0}}, 1, 0, 0, 0, 0, NULL, NULL};
    return &s;
}
}

namespace Halide {
namespace Runtime {
namespace Internal {

WEAK halide_profiler_pipeline_stats *find_or_create_pipeline(const char *pipeline_name, int num_funcs, const uint64_t *func_names) {
    halide_profiler_state *s = halide_profiler_get_state();

    for (halide_profiler_pipeline_stats *p = s->pipelines; p;
         p = (halide_profiler_pipeline_stats *)(p->next)) {
        // The same pipeline will deliver the same global constant
        // string, so they can be compared by pointer.
        if (p->name == pipeline_name &&
            p->num_funcs == num_funcs) {
            return p;
        }
    }
    // Create a new pipeline stats entry.
    halide_profiler_pipeline_stats *p =
        (halide_profiler_pipeline_stats *)malloc(sizeof(halide_profiler_pipeline_stats));
    if (!p) return NULL;
    p->next = s->pipelines;
    p->name = pipeline_name;
    p->first_func_id = s->first_free_id;
    p->num_funcs = num_funcs;
    p->runs = 0;
    p->time = 0;
    p->samples = 0;
    p->memory_current = 0;
    p->memory_peak = 0;
    p->memory_total = 0;
    p->num_allocs = 0;
    p->active_threads_numerator = 0;
    p->active_threads_denominator = 0;
    p->funcs = (halide_profiler_func_stats *)malloc(num_funcs * sizeof(halide_profiler_func_stats));
    if (!p->funcs) {
        free(p);
        return NULL;
    }
    for (int i = 0; i < num_funcs; i++) {
        p->funcs[i].time = 0;
        p->funcs[i].name = (const char *)(func_names[i]);
        p->funcs[i].memory_current = 0;
        p->funcs[i].memory_peak = 0;
        p->funcs[i].memory_total = 0;
        p->funcs[i].num_allocs = 0;
        p->funcs[i].stack_peak = 0;
        p->funcs[i].active_threads_numerator = 0;
        p->funcs[i].active_threads_denominator = 0;
    }
    s->first_free_id += num_funcs;
    s->pipelines = p;
    return p;
}
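
// Illustrative sketch (not part of the runtime): compiled pipelines register
// themselves with their func names packed into a uint64_t array, one global
// constant char* per slot. Assuming 64-bit pointers, a hypothetical pipeline
// with two funcs might be registered like this:
//
//   static const char *names[2] = {"overhead", "blur_y"};
//   halide_profiler_pipeline_stats *p =
//       find_or_create_pipeline("blur", 2, (const uint64_t *)names);
//   // p->first_func_id .. p->first_func_id + 1 now identify these funcs.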

WEAK void bill_func(halide_profiler_state *s, int func_id, uint64_t time, int active_threads) {
    halide_profiler_pipeline_stats *p_prev = NULL;
    for (halide_profiler_pipeline_stats *p = s->pipelines; p;
         p = (halide_profiler_pipeline_stats *)(p->next)) {
        if (func_id >= p->first_func_id && func_id < p->first_func_id + p->num_funcs) {
            if (p_prev) {
                // Bubble the pipeline to the top to speed up future queries.
                p_prev->next = (halide_profiler_pipeline_stats *)(p->next);
                p->next = s->pipelines;
                s->pipelines = p;
            }
            halide_profiler_func_stats *f = p->funcs + func_id - p->first_func_id;
            f->time += time;
            f->active_threads_numerator += active_threads;
            f->active_threads_denominator += 1;
            p->time += time;
            p->samples++;
            p->active_threads_numerator += active_threads;
            p->active_threads_denominator += 1;
            return;
        }
        p_prev = p;
    }
    // Someone must have called reset_state while a kernel was running. Do nothing.
}

WEAK void sampling_profiler_thread(void *) {
    halide_profiler_state *s = halide_profiler_get_state();

    // Grab the lock.
    halide_mutex_lock(&s->lock);

    while (s->current_func != halide_profiler_please_stop) {

        uint64_t t = halide_current_time_ns(NULL);
        while (1) {
            int func, active_threads;
            if (s->get_remote_profiler_state) {
                // Execution has disappeared into remote code running
                // on an accelerator (e.g. Hexagon DSP)
                s->get_remote_profiler_state(&func, &active_threads);
            } else {
                func = s->current_func;
                active_threads = s->active_threads;
            }
            uint64_t t_now = halide_current_time_ns(NULL);
            if (func == halide_profiler_please_stop) {
                break;
            } else if (func >= 0) {
                // Assume all time since I was last awake is due to
                // the currently running func.
                bill_func(s, func, t_now - t, active_threads);
            }
            t = t_now;

            // Release the lock, sleep, reacquire.
            int sleep_ms = s->sleep_time;
            halide_mutex_unlock(&s->lock);
            halide_sleep_ms(NULL, sleep_ms);
            halide_mutex_lock(&s->lock);
        }
    }

    halide_mutex_unlock(&s->lock);
}
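
// Worked example of the sampling arithmetic above (illustrative numbers): if
// three consecutive samples observe 4, 4, and 2 active threads while the same
// func is running, that func accumulates active_threads_numerator = 10 and
// active_threads_denominator = 3, so the report later prints an average of
// 10 / 3 = 3.33 threads for it.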

}  // namespace Internal
}  // namespace Runtime
}  // namespace Halide

namespace {

template<typename T>
void sync_compare_max_and_swap(T *ptr, T val) {
    T old_val = *ptr;
    while (val > old_val) {
        T temp = old_val;
        old_val = __sync_val_compare_and_swap(ptr, old_val, val);
        if (temp == old_val) {
            return;
        }
    }
}
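
// A minimal sketch of what the helper above guarantees (illustrative only):
// concurrent callers recording candidate peaks converge on the maximum, no
// matter how their compare-and-swap loops interleave.
//
//   uint64_t peak = 0;
//   // Thread A: sync_compare_max_and_swap(&peak, 7);
//   // Thread B: sync_compare_max_and_swap(&peak, 5);
//   // Afterwards peak == 7: a lost CAS retries with the fresh value, and a
//   // caller exits early once *ptr is already >= val.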

}  // namespace

extern "C" {
// Returns the address of the pipeline state associated with pipeline_name.
WEAK halide_profiler_pipeline_stats *halide_profiler_get_pipeline_state(const char *pipeline_name) {
    halide_profiler_state *s = halide_profiler_get_state();

    ScopedMutexLock lock(&s->lock);

    for (halide_profiler_pipeline_stats *p = s->pipelines; p;
         p = (halide_profiler_pipeline_stats *)(p->next)) {
        // The same pipeline will deliver the same global constant
        // string, so they can be compared by pointer.
        if (p->name == pipeline_name) {
            return p;
        }
    }
    return NULL;
}
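
// Illustrative use (hypothetical pipeline name; note that the lookup above
// compares name pointers, so the argument must resolve to the same constant
// string the pipeline registered with):
//
//   halide_profiler_pipeline_stats *st =
//       halide_profiler_get_pipeline_state("blur");
//   if (st) {
//       // st->time is sampled time in nanoseconds accumulated over st->runs runs.
//       double ms_per_run = st->time / (st->runs * 1e6);
//   }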

// Returns a token identifying this pipeline instance.
WEAK int halide_profiler_pipeline_start(void *user_context,
                                        const char *pipeline_name,
                                        int num_funcs,
                                        const uint64_t *func_names) {
    halide_profiler_state *s = halide_profiler_get_state();

    ScopedMutexLock lock(&s->lock);

    if (!s->sampling_thread) {
        halide_start_clock(user_context);
        s->sampling_thread = halide_spawn_thread(sampling_profiler_thread, NULL);
    }

    halide_profiler_pipeline_stats *p =
        find_or_create_pipeline(pipeline_name, num_funcs, func_names);
    if (!p) {
        // Allocating space to track the statistics failed.
        return halide_error_out_of_memory(user_context);
    }
    p->runs++;

    return p->first_func_id;
}
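
// A simplified sketch of the calling protocol (the real calls are emitted by
// the compiler when profiling is enabled; names here are illustrative):
//
//   int token = halide_profiler_pipeline_start(ctx, "blur", 2, names);
//   if (token < 0) return token;  // out-of-memory error code
//   halide_profiler_state *s = halide_profiler_get_state();
//   s->current_func = token + 1;  // now executing the func with index 1
//   ...                           // pipeline body runs; the sampler bills time
//   halide_profiler_pipeline_end(ctx, s);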

WEAK void halide_profiler_stack_peak_update(void *user_context,
                                            void *pipeline_state,
                                            uint64_t *f_values) {
    halide_profiler_pipeline_stats *p_stats = (halide_profiler_pipeline_stats *)pipeline_state;
    halide_assert(user_context, p_stats != NULL);

    // Note: The counter update is done without grabbing the state's lock to
    // reduce lock contention. One potential issue is that another call that
    // frees the pipeline and function stats structs may be running in
    // parallel. However, the current destructor (called on profiler shutdown)
    // does not free the structs unless the user specifically calls
    // halide_profiler_reset().

    // Update per-func memory stats
    for (int i = 0; i < p_stats->num_funcs; ++i) {
        if (f_values[i] != 0) {
            sync_compare_max_and_swap(&(p_stats->funcs[i]).stack_peak, f_values[i]);
        }
    }
}

WEAK void halide_profiler_memory_allocate(void *user_context,
                                          void *pipeline_state,
                                          int func_id,
                                          uint64_t incr) {
    // It's possible for 'incr' to be zero if the allocation is conditional
    // and the condition was false.
    if (incr == 0) {
        return;
    }

    halide_profiler_pipeline_stats *p_stats = (halide_profiler_pipeline_stats *)pipeline_state;
    halide_assert(user_context, p_stats != NULL);
    halide_assert(user_context, func_id >= 0);
    halide_assert(user_context, func_id < p_stats->num_funcs);

    halide_profiler_func_stats *f_stats = &p_stats->funcs[func_id];

    // Note: The counter update is done without grabbing the state's lock to
    // reduce lock contention. One potential issue is that another call that
    // frees the pipeline and function stats structs may be running in
    // parallel. However, the current destructor (called on profiler shutdown)
    // does not free the structs unless the user specifically calls
    // halide_profiler_reset().

    // Update per-pipeline memory stats
    __sync_add_and_fetch(&p_stats->num_allocs, 1);
    __sync_add_and_fetch(&p_stats->memory_total, incr);
    uint64_t p_mem_current = __sync_add_and_fetch(&p_stats->memory_current, incr);
    sync_compare_max_and_swap(&p_stats->memory_peak, p_mem_current);

    // Update per-func memory stats
    __sync_add_and_fetch(&f_stats->num_allocs, 1);
    __sync_add_and_fetch(&f_stats->memory_total, incr);
    uint64_t f_mem_current = __sync_add_and_fetch(&f_stats->memory_current, incr);
    sync_compare_max_and_swap(&f_stats->memory_peak, f_mem_current);
}

WEAK void halide_profiler_memory_free(void *user_context,
                                      void *pipeline_state,
                                      int func_id,
                                      uint64_t decr) {
    // It's possible for 'decr' to be zero if the allocation is conditional
    // and the condition was false.
    if (decr == 0) {
        return;
    }

    halide_profiler_pipeline_stats *p_stats = (halide_profiler_pipeline_stats *)pipeline_state;
    halide_assert(user_context, p_stats != NULL);
    halide_assert(user_context, func_id >= 0);
    halide_assert(user_context, func_id < p_stats->num_funcs);

    halide_profiler_func_stats *f_stats = &p_stats->funcs[func_id];

    // Note: The counter update is done without grabbing the state's lock to
    // reduce lock contention. One potential issue is that another call that
    // frees the pipeline and function stats structs may be running in
    // parallel. However, the current destructor (called on profiler shutdown)
    // does not free the structs unless the user specifically calls
    // halide_profiler_reset().

    // Update per-pipeline memory stats
    __sync_sub_and_fetch(&p_stats->memory_current, decr);

    // Update per-func memory stats
    __sync_sub_and_fetch(&f_stats->memory_current, decr);
}
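
// Worked example of the peak tracking above (illustrative numbers): allocate
// 100 bytes, then 50 (memory_current = 150, memory_peak = 150), then free the
// first 100 (memory_current = 50, memory_peak stays 150). memory_total only
// ever grows, so it ends at 150 across num_allocs = 2 allocations.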

WEAK void halide_profiler_report_unlocked(void *user_context, halide_profiler_state *s) {

    char line_buf[1024];
    Printer<StringStreamPrinter, sizeof(line_buf)> sstr(user_context, line_buf);

    for (halide_profiler_pipeline_stats *p = s->pipelines; p;
         p = (halide_profiler_pipeline_stats *)(p->next)) {
        float t = p->time / 1000000.0f;
        if (!p->runs) continue;
        sstr.clear();
        int alloc_avg = 0;
        if (p->num_allocs != 0) {
            alloc_avg = p->memory_total / p->num_allocs;
        }
        bool serial = p->active_threads_numerator == p->active_threads_denominator;
        float threads = p->active_threads_numerator / (p->active_threads_denominator + 1e-10);
        sstr << p->name << "\n"
             << " total time: " << t << " ms"
             << "  samples: " << p->samples
             << "  runs: " << p->runs
             << "  time/run: " << t / p->runs << " ms\n";
        if (!serial) {
            sstr << " average threads used: " << threads << "\n";
        }
        sstr << " heap allocations: " << p->num_allocs
             << "  peak heap usage: " << p->memory_peak << " bytes\n";
        halide_print(user_context, sstr.str());

        bool print_f_states = p->time || p->memory_total;
        if (!print_f_states) {
            for (int i = 0; i < p->num_funcs; i++) {
                halide_profiler_func_stats *fs = p->funcs + i;
                if (fs->stack_peak) {
                    print_f_states = true;
                    break;
                }
            }
        }

        if (print_f_states) {
            for (int i = 0; i < p->num_funcs; i++) {
                size_t cursor = 0;
                sstr.clear();
                halide_profiler_func_stats *fs = p->funcs + i;

                // The first func is always a catch-all overhead
                // slot. Only report overhead time if it's non-zero.
                if (i == 0 && fs->time == 0) continue;

                sstr << "  " << fs->name << ": ";
                cursor += 25;
                while (sstr.size() < cursor) {
                    sstr << " ";
                }

                float ft = fs->time / (p->runs * 1000000.0f);
                sstr << ft;
                // We don't need 6 sig. figs.
                sstr.erase(3);
                sstr << "ms";
                cursor += 10;
                while (sstr.size() < cursor) {
                    sstr << " ";
                }

                int percent = 0;
                if (p->time != 0) {
                    percent = (100 * fs->time) / p->time;
                }
                sstr << "(" << percent << "%)";
                cursor += 8;
                while (sstr.size() < cursor) {
                    sstr << " ";
                }

                if (!serial) {
                    float threads = fs->active_threads_numerator / (fs->active_threads_denominator + 1e-10);
                    sstr << "threads: " << threads;
                    sstr.erase(3);
                    cursor += 15;
                    while (sstr.size() < cursor) {
                        sstr << " ";
                    }
                }

                int alloc_avg = 0;
                if (fs->num_allocs != 0) {
                    alloc_avg = fs->memory_total / fs->num_allocs;
                }

                if (fs->memory_peak) {
                    cursor += 15;
                    sstr << " peak: " << fs->memory_peak;
                    while (sstr.size() < cursor) {
                        sstr << " ";
                    }
                    sstr << " num: " << fs->num_allocs;
                    cursor += 15;
                    while (sstr.size() < cursor) {
                        sstr << " ";
                    }
                    sstr << " avg: " << alloc_avg;
                }
                if (fs->stack_peak > 0) {
                    sstr << " stack: " << fs->stack_peak;
                }
                sstr << "\n";

                halide_print(user_context, sstr.str());
            }
        }
    }
}
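
// The report printed above looks roughly like this (illustrative values; the
// per-func columns are padded out to the cursor positions computed above):
//
//   blur
//    total time: 12.3 ms  samples: 120  runs: 10  time/run: 1.23 ms
//    heap allocations: 2  peak heap usage: 1048576 bytes
//     blur_x:              0.615ms   (50%)    peak: 1048576  num: 2         avg: 524288
//     blur_y:              0.610ms   (49%)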

WEAK void halide_profiler_report(void *user_context) {
    halide_profiler_state *s = halide_profiler_get_state();
    ScopedMutexLock lock(&s->lock);
    halide_profiler_report_unlocked(user_context, s);
}

WEAK void halide_profiler_reset_unlocked(halide_profiler_state *s) {
    while (s->pipelines) {
        halide_profiler_pipeline_stats *p = s->pipelines;
        s->pipelines = (halide_profiler_pipeline_stats *)(p->next);
        free(p->funcs);
        free(p);
    }
    s->first_free_id = 0;
}

WEAK void halide_profiler_reset() {
    // WARNING: Do not call this method while any other halide
    // pipeline is running; halide_profiler_memory_allocate/free and
    // halide_profiler_stack_peak_update update the profiler pipeline's
    // state without grabbing the global profiler state's lock.
    halide_profiler_state *s = halide_profiler_get_state();
    ScopedMutexLock lock(&s->lock);
    halide_profiler_reset_unlocked(s);
}

#ifndef WINDOWS
__attribute__((destructor))
#endif
WEAK void
halide_profiler_shutdown() {
    halide_profiler_state *s = halide_profiler_get_state();
    if (!s->sampling_thread) {
        return;
    }

    s->current_func = halide_profiler_please_stop;
    halide_join_thread(s->sampling_thread);
    s->sampling_thread = NULL;
    s->current_func = halide_profiler_outside_of_halide;

    // Print results. No need to lock anything because we just shut
    // down the thread.
    halide_profiler_report_unlocked(NULL, s);

    halide_profiler_reset_unlocked(s);
}

namespace {
#ifdef WINDOWS
WEAK void halide_windows_profiler_shutdown() {
    halide_profiler_state *s = halide_profiler_get_state();
    if (!s->sampling_thread) {
        return;
    }

    // On Windows it is unsafe to do anything with threads or critical
    // sections in a static destructor, as it may run after threads
    // have been killed by the OS. Furthermore, many calls, even things
    // like EnterCriticalSection, may be set to kill the process if
    // called during process shutdown. Hence this routine doesn't attempt
    // to clean up state as the destructor does on other platforms.

    // Print results. Avoid locking as it will cause problems and
    // nothing should be running.
    halide_profiler_report_unlocked(NULL, s);
}
#endif
}  // namespace

WEAK void halide_profiler_pipeline_end(void *user_context, void *state) {
    ((halide_profiler_state *)state)->current_func = halide_profiler_outside_of_halide;
}

}  // extern "C"