#include "HalideRuntime.h"
#include "printer.h"
#include "scoped_spin_lock.h"

extern "C" {

typedef int32_t (*trace_fn)(void *, const halide_trace_event_t *);
}

namespace Halide {
namespace Runtime {
namespace Internal {

// A spinlock that allows for shared and exclusive access. It's
// equivalent to a reader-writer lock, but in this case the "readers"
// will actually be writing simultaneously to the trace buffer, so
// that's a bad name. We use the __sync primitives used elsewhere in
// the runtime for atomic work. They are well supported by clang.
class SharedExclusiveSpinLock {
    volatile uint32_t lock;

    // Covers a single bit indicating one owner has exclusive
    // access. The waiting bit can be set while the exclusive bit is
    // set, but the bits masked by shared_mask must be zero while this
    // bit is set.
    const static uint32_t exclusive_held_mask = 0x80000000;

    // Set to indicate a thread needs to acquire exclusive
    // access. Other fields of the lock may be set, but no shared
    // access request will proceed while this bit is set.
    const static uint32_t exclusive_waiting_mask = 0x40000000;

    // Count of threads currently holding shared access. Must be zero
    // if the exclusive bit is set. Cannot increase if the waiting bit
    // is set.
    const static uint32_t shared_mask = 0x3fffffff;

public:
    ALWAYS_INLINE void acquire_shared() {
        while (1) {
            uint32_t x = lock & shared_mask;
            if (__sync_bool_compare_and_swap(&lock, x, x + 1)) {
                return;
            }
        }
    }

    ALWAYS_INLINE void release_shared() {
        __sync_fetch_and_sub(&lock, 1);
    }

    ALWAYS_INLINE void acquire_exclusive() {
        while (1) {
            // If multiple threads are trying to acquire exclusive
            // ownership, we may need to re-request exclusive waiting
            // while we spin, as it gets unset whenever a thread
            // acquires exclusive ownership.
            __sync_fetch_and_or(&lock, exclusive_waiting_mask);
            if (__sync_bool_compare_and_swap(&lock, exclusive_waiting_mask, exclusive_held_mask)) {
                return;
            }
        }
    }

    ALWAYS_INLINE void release_exclusive() {
        __sync_fetch_and_and(&lock, ~exclusive_held_mask);
    }

    ALWAYS_INLINE void init() {
        lock = 0;
    }

    SharedExclusiveSpinLock()
        : lock(0) {
    }
};
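
// A minimal usage sketch of the lock above (illustrative only, not part of
// the runtime): many packet writers may hold shared access at once, while a
// flush requires exclusive access.
//
//     SharedExclusiveSpinLock lk;
//     lk.acquire_shared();     // writer: may be held by many threads at once
//     /* ... write into a region of the shared buffer ... */
//     lk.release_shared();
//
//     lk.acquire_exclusive();  // flusher: waits for all shared holders to exit
//     /* ... drain the buffer ... */
//     lk.release_exclusive();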

const static int buffer_size = 1024 * 1024;

class TraceBuffer {
    SharedExclusiveSpinLock lock;
    uint32_t cursor, overage;
    uint8_t buf[buffer_size];

    // Attempt to atomically acquire space in the buffer to write a
    // packet. Returns NULL if the buffer was full.
    ALWAYS_INLINE halide_trace_packet_t *try_acquire_packet(void *user_context, uint32_t size) {
        lock.acquire_shared();
        halide_assert(user_context, size <= buffer_size);
        uint32_t my_cursor = __sync_fetch_and_add(&cursor, size);
        if (my_cursor + size > sizeof(buf)) {
            // Don't try to back it out: instead, just allow this request to fail
            // (along with all subsequent requests) and record the 'overage'
            // that was added and should be ignored; then, in the next flush,
            // remove the overage.
            __sync_fetch_and_add(&overage, size);
            lock.release_shared();
            return NULL;
        } else {
            return (halide_trace_packet_t *)(buf + my_cursor);
        }
    }

public:
    // Wait for all writers to finish with their packets, stall any
    // new writers, and flush the buffer to the fd.
    ALWAYS_INLINE void flush(void *user_context, int fd) {
        lock.acquire_exclusive();
        bool success = true;
        if (cursor) {
            cursor -= overage;
            success = (cursor == (uint32_t)write(fd, buf, cursor));
            cursor = 0;
            overage = 0;
        }
        lock.release_exclusive();
        halide_assert(user_context, success && "Could not write to trace file");
    }

    // Acquire and return a packet's worth of space in the trace
    // buffer, flushing the trace buffer to the given fd to make space
    // if necessary. The region acquired is protected from other
    // threads writing or reading to it, so it must be released before
    // a flush can occur.
    ALWAYS_INLINE halide_trace_packet_t *acquire_packet(void *user_context, int fd, uint32_t size) {
        halide_trace_packet_t *packet = NULL;
        while (!(packet = try_acquire_packet(user_context, size))) {
            // Couldn't acquire space to write a packet. Flush and try again.
            flush(user_context, fd);
        }
        return packet;
    }

    // Release a packet, allowing it to be written out with flush.
    ALWAYS_INLINE void release_packet(halide_trace_packet_t *) {
        // Need a memory barrier to guarantee all the writes are done.
        __sync_synchronize();
        lock.release_shared();
    }

    ALWAYS_INLINE void init() {
        cursor = 0;
        overage = 0;
        lock.init();
    }

    TraceBuffer()
        : cursor(0), overage(0) {
    }
};
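
// A minimal sketch of the intended calling pattern (illustrative only; the
// real caller is halide_default_trace below). Assumes `tb` points at an
// initialized TraceBuffer and `fd` is a writable file descriptor:
//
//     halide_trace_packet_t *p = tb->acquire_packet(user_context, fd, size);
//     /* ... fill in *p ... */
//     tb->release_packet(p);        // publish; allows a later flush
//     tb->flush(user_context, fd);  // e.g. at the end of the pipeline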

WEAK TraceBuffer *halide_trace_buffer = NULL;
WEAK int halide_trace_file = -1;  // -1 indicates uninitialized
WEAK ScopedSpinLock::AtomicFlag halide_trace_file_lock = 0;
WEAK bool halide_trace_file_initialized = false;
WEAK void *halide_trace_file_internally_opened = NULL;

} // namespace Internal
} // namespace Runtime
} // namespace Halide

extern "C" {

WEAK int32_t halide_default_trace(void *user_context, const halide_trace_event_t *e) {
    static int32_t ids = 1;

    int32_t my_id = __sync_fetch_and_add(&ids, 1);

    // If we're dumping to a file, use a binary format
    int fd = halide_get_trace_file(user_context);
    if (fd > 0) {
        // Compute the total packet size
        uint32_t value_bytes = (uint32_t)(e->type.lanes * e->type.bytes());
        uint32_t header_bytes = (uint32_t)sizeof(halide_trace_packet_t);
        uint32_t coords_bytes = e->dimensions * (uint32_t)sizeof(int32_t);
        uint32_t name_bytes = strlen(e->func) + 1;
        uint32_t trace_tag_bytes = e->trace_tag ? (strlen(e->trace_tag) + 1) : 1;
        uint32_t total_size_without_padding = header_bytes + value_bytes + coords_bytes + name_bytes + trace_tag_bytes;
        uint32_t total_size = (total_size_without_padding + 3) & ~3;
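        // The mask above rounds the packet size up to a multiple of four
        // bytes. For example, an unpadded size of 13 rounds up to
        // (13 + 3) & ~3 == 16, while a size that is already a multiple of
        // four is unchanged: (16 + 3) & ~3 == 16.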

        // Claim some space to write to in the trace buffer
        halide_trace_packet_t *packet = halide_trace_buffer->acquire_packet(user_context, fd, total_size);

        // Write a packet into it
        packet->size = total_size;
        packet->id = my_id;
        packet->type = e->type;
        packet->event = e->event;
        packet->parent_id = e->parent_id;
        packet->value_index = e->value_index;
        packet->dimensions = e->dimensions;
        if (e->coordinates) {
            memcpy((void *)packet->coordinates(), e->coordinates, coords_bytes);
        }
        if (e->value) {
            memcpy((void *)packet->value(), e->value, value_bytes);
        }
        memcpy((void *)packet->func(), e->func, name_bytes);
        memcpy((void *)packet->trace_tag(), e->trace_tag ? e->trace_tag : "", trace_tag_bytes);

        // Release it
        halide_trace_buffer->release_packet(packet);

        // We should also flush the trace buffer if we hit an event
        // that might be the end of the trace.
        if (e->event == halide_trace_end_pipeline) {
            halide_trace_buffer->flush(user_context, fd);
        }

    } else {
        uint8_t buffer[4096];
        Printer<StringStreamPrinter, sizeof(buffer)> ss(user_context, (char *)buffer);

        // Round up bits to 8, 16, 32, or 64
        int print_bits = 8;
        while (print_bits < e->type.bits) {
            print_bits <<= 1;
        }
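        // For example, a 12-bit type is printed using the 16-bit path below,
        // and a 1-bit type using the 8-bit path.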
        halide_assert(user_context, print_bits <= 64 && "Tracing bad type");

        // Otherwise, use halide_print and a plain-text format
        const char *event_types[] = {"Load",
                                     "Store",
                                     "Begin realization",
                                     "End realization",
                                     "Produce",
                                     "End produce",
                                     "Consume",
                                     "End consume",
                                     "Begin pipeline",
                                     "End pipeline",
                                     "Tag"};

        // Only print out the value on stores and loads.
        bool print_value = (e->event < 2);

        ss << event_types[e->event] << " " << e->func << "." << e->value_index << "(";
        if (e->type.lanes > 1) {
            ss << "<";
        }
        for (int i = 0; i < e->dimensions; i++) {
            if (i > 0) {
                if ((e->type.lanes > 1) && (i % e->type.lanes) == 0) {
                    ss << ">, <";
                } else {
                    ss << ", ";
                }
            }
            ss << e->coordinates[i];
        }
        if (e->type.lanes > 1) {
            ss << ">)";
        } else {
            ss << ")";
        }

        if (print_value) {
            if (e->type.lanes > 1) {
                ss << " = <";
            } else {
                ss << " = ";
            }
            for (int i = 0; i < e->type.lanes; i++) {
                if (i > 0) {
                    ss << ", ";
                }
                if (e->type.code == 0) {  // halide_type_int
                    if (print_bits == 8) {
                        ss << ((int8_t *)(e->value))[i];
                    } else if (print_bits == 16) {
                        ss << ((int16_t *)(e->value))[i];
                    } else if (print_bits == 32) {
                        ss << ((int32_t *)(e->value))[i];
                    } else {
                        ss << ((int64_t *)(e->value))[i];
                    }
                } else if (e->type.code == 1) {  // halide_type_uint
                    if (print_bits == 8) {
                        ss << ((uint8_t *)(e->value))[i];
                    } else if (print_bits == 16) {
                        ss << ((uint16_t *)(e->value))[i];
                    } else if (print_bits == 32) {
                        ss << ((uint32_t *)(e->value))[i];
                    } else {
                        ss << ((uint64_t *)(e->value))[i];
                    }
                } else if (e->type.code == 2) {  // halide_type_float
                    halide_assert(user_context, print_bits >= 16 && "Tracing a bad type");
                    if (print_bits == 32) {
                        ss << ((float *)(e->value))[i];
                    } else if (print_bits == 16) {
                        ss.write_float16_from_bits(((uint16_t *)(e->value))[i]);
                    } else {
                        ss << ((double *)(e->value))[i];
                    }
                } else if (e->type.code == 3) {  // halide_type_handle
                    ss << ((void **)(e->value))[i];
                }
            }
            if (e->type.lanes > 1) {
                ss << ">";
            }
        }

        ss << "\n";
        ss.msan_annotate_is_initialized();

        {
            ScopedSpinLock lock(&halide_trace_file_lock);
            halide_print(user_context, (const char *)buffer);
        }
    }

    return my_id;
}

} // extern "C"

namespace Halide {
namespace Runtime {
namespace Internal {

WEAK trace_fn halide_custom_trace = halide_default_trace;

} // namespace Internal
} // namespace Runtime
} // namespace Halide

extern "C" {

WEAK trace_fn halide_set_custom_trace(trace_fn t) {
    trace_fn result = halide_custom_trace;
    halide_custom_trace = t;
    return result;
}
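
// Illustrative example (not part of the runtime): a custom trace callback
// that forwards to the default handler can be installed like this. The
// function name below is hypothetical.
//
//     static int32_t my_trace(void *user_context, const halide_trace_event_t *e) {
//         /* ... inspect or record *e ... */
//         return halide_default_trace(user_context, e);
//     }
//
//     trace_fn previous = halide_set_custom_trace(my_trace);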

WEAK void halide_set_trace_file(int fd) {
    halide_trace_file = fd;
}

extern int errno;

WEAK int halide_get_trace_file(void *user_context) {
    ScopedSpinLock lock(&halide_trace_file_lock);
    if (halide_trace_file < 0) {
        const char *trace_file_name = getenv("HL_TRACE_FILE");
        if (trace_file_name) {
            void *file = fopen(trace_file_name, "ab");
            halide_assert(user_context, file && "Failed to open trace file\n");
            halide_set_trace_file(fileno(file));
            halide_trace_file_internally_opened = file;
            if (!halide_trace_buffer) {
                halide_trace_buffer = (TraceBuffer *)malloc(sizeof(TraceBuffer));
                halide_trace_buffer->init();
            }
        } else {
            halide_set_trace_file(0);
        }
    }
    return halide_trace_file;
}
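
// Tracing to a file can also be enabled without calling
// halide_set_trace_file, via the environment variable read above, e.g.
// (with a hypothetical pipeline binary):
//
//     HL_TRACE_FILE=trace.bin ./my_pipeline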

WEAK int32_t halide_trace(void *user_context, const halide_trace_event_t *e) {
    return (*halide_custom_trace)(user_context, e);
}

WEAK int halide_shutdown_trace() {
    if (halide_trace_file_internally_opened) {
        int ret = fclose(halide_trace_file_internally_opened);
        halide_trace_file = 0;
        halide_trace_file_initialized = false;
        halide_trace_file_internally_opened = NULL;
        if (halide_trace_buffer) {
            free(halide_trace_buffer);
            halide_trace_buffer = NULL;
        }
        return ret;
    } else {
        return 0;
    }
}

namespace {
WEAK __attribute__((destructor)) void halide_trace_cleanup() {
    halide_shutdown_trace();
}
} // namespace

} // extern "C"