1 #include "HalideRuntime.h"
2 #include "printer.h"
3 #include "scoped_spin_lock.h"
4 
5 extern "C" {
6 
7 typedef int32_t (*trace_fn)(void *, const halide_trace_event_t *);
8 }
9 
10 namespace Halide {
11 namespace Runtime {
12 namespace Internal {
13 
14 // A spinlock that allows for shared and exclusive access. It's
15 // equivalent to a reader-writer lock, but in my case the "readers"
16 // will actually be writing simultaneously to the trace buffer, so
17 // that's a bad name. We use the __sync primitives used elsewhere in
18 // the runtime for atomic work. They are well supported by clang.
class SharedExclusiveSpinLock {
    // Packed lock word: one exclusive-held bit, one exclusive-waiting
    // bit, and a 30-bit count of shared holders (see the masks below).
    volatile uint32_t lock;

    // Covers a single bit indicating one owner has exclusive
    // access. The waiting bit can be set while the exclusive bit is
    // set, but the bits masked by shared_mask must be zero while this
    // bit is set.
    const static uint32_t exclusive_held_mask = 0x80000000;

    // Set to indicate a thread needs to acquire exclusive
    // access. Other fields of the lock may be set, but no shared
    // access request will proceed while this bit is set.
    const static uint32_t exclusive_waiting_mask = 0x40000000;

    // Count of threads currently holding shared access. Must be zero
    // if the exclusive bit is set. Cannot increase if the waiting bit
    // is set.
    const static uint32_t shared_mask = 0x3fffffff;

public:
    // Spin until shared access is acquired. The CAS expected value is
    // masked to the shared count only, so the swap fails (and we keep
    // spinning) whenever either exclusive bit is set — this is what
    // blocks new readers while a writer holds or awaits the lock.
    ALWAYS_INLINE void acquire_shared() {
        while (1) {
            uint32_t x = lock & shared_mask;
            if (__sync_bool_compare_and_swap(&lock, x, x + 1)) {
                return;
            }
        }
    }

    // Drop one shared hold. The count occupies the low bits, so a
    // plain atomic decrement suffices.
    ALWAYS_INLINE void release_shared() {
        __sync_fetch_and_sub(&lock, 1);
    }

    // Spin until exclusive access is acquired: succeeds only when the
    // lock word is exactly "waiting" (no shared holders, no other
    // exclusive owner), swapping it for "held" in one step.
    ALWAYS_INLINE void acquire_exclusive() {
        while (1) {
            // If multiple threads are trying to acquire exclusive
            // ownership, we may need to rerequest exclusive waiting
            // while we spin, as it gets unset whenever a thread
            // acquires exclusive ownership.
            __sync_fetch_and_or(&lock, exclusive_waiting_mask);
            if (__sync_bool_compare_and_swap(&lock, exclusive_waiting_mask, exclusive_held_mask)) {
                return;
            }
        }
    }

    // Release exclusive access. Only the held bit is cleared; any
    // waiting bit set by another contender is left intact.
    ALWAYS_INLINE void release_exclusive() {
        __sync_fetch_and_and(&lock, ~exclusive_held_mask);
    }

    // Reset to the unlocked state (for use on storage that skipped
    // the constructor, e.g. malloc'd TraceBuffer below).
    ALWAYS_INLINE void init() {
        lock = 0;
    }

    SharedExclusiveSpinLock()
        : lock(0) {
    }
};
77 
// Size in bytes of the in-memory staging buffer for binary trace packets.
const static int buffer_size = 1024 * 1024;
79 
// A staging buffer that many threads append trace packets to
// concurrently (each holding shared access), and that is drained to a
// file descriptor under exclusive access.
class TraceBuffer {
    SharedExclusiveSpinLock lock;
    // cursor: next free byte offset in buf. overage: bytes "claimed"
    // past the end by failed acquisitions, to be backed out at flush.
    uint32_t cursor, overage;
    uint8_t buf[buffer_size];

    // Attempt to atomically acquire space in the buffer to write a
    // packet. Returns NULL if the buffer was full. On success the
    // shared lock is left held; it is released by release_packet (or
    // here, on failure).
    ALWAYS_INLINE halide_trace_packet_t *try_acquire_packet(void *user_context, uint32_t size) {
        lock.acquire_shared();
        halide_assert(user_context, size <= buffer_size);
        uint32_t my_cursor = __sync_fetch_and_add(&cursor, size);
        if (my_cursor + size > sizeof(buf)) {
            // Don't try to back it out: instead, just allow this request to fail
            // (along with all subsequent requests) and record the 'overage'
            // that was added and should be ignored; then, in the next flush,
            // remove the overage.
            __sync_fetch_and_add(&overage, size);
            lock.release_shared();
            return NULL;
        } else {
            return (halide_trace_packet_t *)(buf + my_cursor);
        }
    }

public:
    // Wait for all writers to finish with their packets, stall any
    // new writers, and flush the buffer to the fd.
    ALWAYS_INLINE void flush(void *user_context, int fd) {
        lock.acquire_exclusive();
        bool success = true;
        if (cursor) {
            // Discard the bytes claimed by failed acquisitions before
            // writing; only [0, cursor - overage) holds real packets.
            cursor -= overage;
            success = (cursor == (uint32_t)write(fd, buf, cursor));
            cursor = 0;
            overage = 0;
        }
        lock.release_exclusive();
        halide_assert(user_context, success && "Could not write to trace file");
    }

    // Acquire and return a packet's worth of space in the trace
    // buffer, flushing the trace buffer to the given fd to make space
    // if necessary. The region acquired is protected from other
    // threads writing or reading to it, so it must be released before
    // a flush can occur.
    ALWAYS_INLINE halide_trace_packet_t *acquire_packet(void *user_context, int fd, uint32_t size) {
        halide_trace_packet_t *packet = NULL;
        while (!(packet = try_acquire_packet(user_context, size))) {
            // Couldn't acquire space to write a packet. Flush and try again.
            flush(user_context, fd);
        }
        return packet;
    }

    // Release a packet, allowing it to be written out with flush
    ALWAYS_INLINE void release_packet(halide_trace_packet_t *) {
        // Need a memory barrier to guarantee all the writes are done.
        __sync_synchronize();
        lock.release_shared();
    }

    // Reset to the empty state (for malloc'd instances that skipped
    // the constructor).
    ALWAYS_INLINE void init() {
        cursor = 0;
        overage = 0;
        lock.init();
    }

    TraceBuffer()
        : cursor(0), overage(0) {
    }
};
151 
// Lazily malloc'd staging buffer; only allocated when tracing to a file.
WEAK TraceBuffer *halide_trace_buffer = NULL;
WEAK int halide_trace_file = -1;  // -1 indicates uninitialized
// Protects lazy initialization of halide_trace_file and serializes
// plain-text trace output via halide_print.
WEAK ScopedSpinLock::AtomicFlag halide_trace_file_lock = 0;
// NOTE(review): cleared on shutdown but never read or set true in this
// file — confirm whether this flag is still needed.
WEAK bool halide_trace_file_initialized = false;
// Non-NULL iff this module (rather than the user) opened the trace
// file, and therefore owns closing it in halide_shutdown_trace.
WEAK void *halide_trace_file_internally_opened = NULL;
157 
158 }  // namespace Internal
159 }  // namespace Runtime
160 }  // namespace Halide
161 
162 extern "C" {
163 
halide_default_trace(void * user_context,const halide_trace_event_t * e)164 WEAK int32_t halide_default_trace(void *user_context, const halide_trace_event_t *e) {
165     static int32_t ids = 1;
166 
167     int32_t my_id = __sync_fetch_and_add(&ids, 1);
168 
169     // If we're dumping to a file, use a binary format
170     int fd = halide_get_trace_file(user_context);
171     if (fd > 0) {
172         // Compute the total packet size
173         uint32_t value_bytes = (uint32_t)(e->type.lanes * e->type.bytes());
174         uint32_t header_bytes = (uint32_t)sizeof(halide_trace_packet_t);
175         uint32_t coords_bytes = e->dimensions * (uint32_t)sizeof(int32_t);
176         uint32_t name_bytes = strlen(e->func) + 1;
177         uint32_t trace_tag_bytes = e->trace_tag ? (strlen(e->trace_tag) + 1) : 1;
178         uint32_t total_size_without_padding = header_bytes + value_bytes + coords_bytes + name_bytes + trace_tag_bytes;
179         uint32_t total_size = (total_size_without_padding + 3) & ~3;
180 
181         // Claim some space to write to in the trace buffer
182         halide_trace_packet_t *packet = halide_trace_buffer->acquire_packet(user_context, fd, total_size);
183 
184         if (total_size > 4096) {
185             print(NULL) << total_size << "\n";
186         }
187 
188         // Write a packet into it
189         packet->size = total_size;
190         packet->id = my_id;
191         packet->type = e->type;
192         packet->event = e->event;
193         packet->parent_id = e->parent_id;
194         packet->value_index = e->value_index;
195         packet->dimensions = e->dimensions;
196         if (e->coordinates) {
197             memcpy((void *)packet->coordinates(), e->coordinates, coords_bytes);
198         }
199         if (e->value) {
200             memcpy((void *)packet->value(), e->value, value_bytes);
201         }
202         memcpy((void *)packet->func(), e->func, name_bytes);
203         memcpy((void *)packet->trace_tag(), e->trace_tag ? e->trace_tag : "", trace_tag_bytes);
204 
205         // Release it
206         halide_trace_buffer->release_packet(packet);
207 
208         // We should also flush the trace buffer if we hit an event
209         // that might be the end of the trace.
210         if (e->event == halide_trace_end_pipeline) {
211             halide_trace_buffer->flush(user_context, fd);
212         }
213 
214     } else {
215         uint8_t buffer[4096];
216         Printer<StringStreamPrinter, sizeof(buffer)> ss(user_context, (char *)buffer);
217 
218         // Round up bits to 8, 16, 32, or 64
219         int print_bits = 8;
220         while (print_bits < e->type.bits) {
221             print_bits <<= 1;
222         }
223         halide_assert(user_context, print_bits <= 64 && "Tracing bad type");
224 
225         // Otherwise, use halide_print and a plain-text format
226         const char *event_types[] = {"Load",
227                                      "Store",
228                                      "Begin realization",
229                                      "End realization",
230                                      "Produce",
231                                      "End produce",
232                                      "Consume",
233                                      "End consume",
234                                      "Begin pipeline",
235                                      "End pipeline",
236                                      "Tag"};
237 
238         // Only print out the value on stores and loads.
239         bool print_value = (e->event < 2);
240 
241         ss << event_types[e->event] << " " << e->func << "." << e->value_index << "(";
242         if (e->type.lanes > 1) {
243             ss << "<";
244         }
245         for (int i = 0; i < e->dimensions; i++) {
246             if (i > 0) {
247                 if ((e->type.lanes > 1) && (i % e->type.lanes) == 0) {
248                     ss << ">, <";
249                 } else {
250                     ss << ", ";
251                 }
252             }
253             ss << e->coordinates[i];
254         }
255         if (e->type.lanes > 1) {
256             ss << ">)";
257         } else {
258             ss << ")";
259         }
260 
261         if (print_value) {
262             if (e->type.lanes > 1) {
263                 ss << " = <";
264             } else {
265                 ss << " = ";
266             }
267             for (int i = 0; i < e->type.lanes; i++) {
268                 if (i > 0) {
269                     ss << ", ";
270                 }
271                 if (e->type.code == 0) {
272                     if (print_bits == 8) {
273                         ss << ((int8_t *)(e->value))[i];
274                     } else if (print_bits == 16) {
275                         ss << ((int16_t *)(e->value))[i];
276                     } else if (print_bits == 32) {
277                         ss << ((int32_t *)(e->value))[i];
278                     } else {
279                         ss << ((int64_t *)(e->value))[i];
280                     }
281                 } else if (e->type.code == 1) {
282                     if (print_bits == 8) {
283                         ss << ((uint8_t *)(e->value))[i];
284                     } else if (print_bits == 16) {
285                         ss << ((uint16_t *)(e->value))[i];
286                     } else if (print_bits == 32) {
287                         ss << ((uint32_t *)(e->value))[i];
288                     } else {
289                         ss << ((uint64_t *)(e->value))[i];
290                     }
291                 } else if (e->type.code == 2) {
292                     halide_assert(user_context, print_bits >= 16 && "Tracing a bad type");
293                     if (print_bits == 32) {
294                         ss << ((float *)(e->value))[i];
295                     } else if (print_bits == 16) {
296                         ss.write_float16_from_bits(((uint16_t *)(e->value))[i]);
297                     } else {
298                         ss << ((double *)(e->value))[i];
299                     }
300                 } else if (e->type.code == 3) {
301                     ss << ((void **)(e->value))[i];
302                 }
303             }
304             if (e->type.lanes > 1) {
305                 ss << ">";
306             }
307         }
308 
309         if (e->trace_tag && *e->trace_tag) {
310             ss << " tag = \"" << e->trace_tag << "\"";
311         }
312 
313         ss << "\n";
314         ss.msan_annotate_is_initialized();
315 
316         {
317             ScopedSpinLock lock(&halide_trace_file_lock);
318             halide_print(user_context, (const char *)buffer);
319         }
320     }
321 
322     return my_id;
323 }
324 
325 }  // extern "C"
326 
327 namespace Halide {
328 namespace Runtime {
329 namespace Internal {
330 
// The currently-installed trace handler; defaults to the file/text
// implementation above and is swapped via halide_set_custom_trace.
WEAK trace_fn halide_custom_trace = halide_default_trace;
332 
333 }
334 }  // namespace Runtime
335 }  // namespace Halide
336 
337 extern "C" {
338 
halide_set_custom_trace(trace_fn t)339 WEAK trace_fn halide_set_custom_trace(trace_fn t) {
340     trace_fn result = halide_custom_trace;
341     halide_custom_trace = t;
342     return result;
343 }
344 
// Set the file descriptor that binary trace packets are written to.
// Pass 0 to disable file tracing (plain-text printing is used instead).
// NOTE(review): this write is not guarded by halide_trace_file_lock —
// presumably callers set it before tracing starts; confirm.
WEAK void halide_set_trace_file(int fd) {
    halide_trace_file = fd;
}
348 
// NOTE(review): errno is never referenced in this file; presumably
// declared for the freestanding runtime environment where <errno.h> is
// unavailable — confirm before removing.
extern int errno;
350 
// Return the trace-output file descriptor, lazily initializing it on
// first call: if HL_TRACE_FILE is set, open that file (recording that
// we own it) and allocate the packet staging buffer; otherwise settle
// on 0, which callers treat as "no file, print text instead".
WEAK int halide_get_trace_file(void *user_context) {
    ScopedSpinLock lock(&halide_trace_file_lock);
    if (halide_trace_file < 0) {
        const char *trace_file_name = getenv("HL_TRACE_FILE");
        if (trace_file_name) {
            // Append mode, so multiple pipelines can share one trace file.
            void *file = fopen(trace_file_name, "ab");
            halide_assert(user_context, file && "Failed to open trace file\n");
            halide_set_trace_file(fileno(file));
            halide_trace_file_internally_opened = file;
            if (!halide_trace_buffer) {
                halide_trace_buffer = (TraceBuffer *)malloc(sizeof(TraceBuffer));
                halide_trace_buffer->init();
            }
        } else {
            halide_set_trace_file(0);
        }
    }
    return halide_trace_file;
}
370 
halide_trace(void * user_context,const halide_trace_event_t * e)371 WEAK int32_t halide_trace(void *user_context, const halide_trace_event_t *e) {
372     return (*halide_custom_trace)(user_context, e);
373 }
374 
halide_shutdown_trace()375 WEAK int halide_shutdown_trace() {
376     if (halide_trace_file_internally_opened) {
377         int ret = fclose(halide_trace_file_internally_opened);
378         halide_trace_file = 0;
379         halide_trace_file_initialized = false;
380         halide_trace_file_internally_opened = NULL;
381         if (halide_trace_buffer) {
382             free(halide_trace_buffer);
383         }
384         return ret;
385     } else {
386         return 0;
387     }
388 }
389 
390 namespace {
halide_trace_cleanup()391 WEAK __attribute__((destructor)) void halide_trace_cleanup() {
392     halide_shutdown_trace();
393 }
394 }  // namespace
395 }
396