1 #include "RunGen.h"
2 
3 using namespace Halide::RunGen;
4 using Halide::Tools::BenchmarkConfig;
5 
6 namespace {
7 
// One entry in the singly linked list of filters that have announced
// themselves through halide_register_argv_and_metadata().
struct RegisteredFilter {
    // Next entry in the list, or nullptr for the last one.
    RegisteredFilter *next = nullptr;
    // argv-style entry point of the filter.
    int (*filter_argv_call)(void **) = nullptr;
    // Metadata describing the filter; its `name` is what --name matches against.
    const struct halide_filter_metadata_t *filter_metadata = nullptr;
};

// Head of the list of registered filters; nullptr until the first registration.
RegisteredFilter *registered_filters = nullptr;
15 
halide_register_argv_and_metadata(int (* filter_argv_call)(void **),const struct halide_filter_metadata_t * filter_metadata,const char * const * extra_key_value_pairs)16 extern "C" void halide_register_argv_and_metadata(
17     int (*filter_argv_call)(void **),
18     const struct halide_filter_metadata_t *filter_metadata,
19     const char *const *extra_key_value_pairs) {
20 
21     auto *rf = new RegisteredFilter();
22     rf->next = registered_filters;
23     rf->filter_argv_call = filter_argv_call;
24     rf->filter_metadata = filter_metadata;
25     // RunGen ignores extra_key_value_pairs
26     registered_filters = rf;
27 }
28 
// Returns a copy of `str` in which every non-overlapping occurrence of `find`
// has been replaced by `replace`, scanning from left to right.
//
// Text inserted by a replacement is never rescanned, so `replace` may itself
// contain `find`. An empty `find` matches nothing: `str` is returned unchanged.
std::string replace_all(const std::string &str,
                        const std::string &find,
                        const std::string &replace) {
    std::string result = str;
    if (find.empty()) {
        // std::string::find("") succeeds at every position, so the loop below
        // would never terminate (and would grow `result` without bound).
        return result;
    }

    size_t pos = 0;
    while ((pos = result.find(find, pos)) != std::string::npos) {
        result.replace(pos, find.length(), replace);
        // Resume the search just past the inserted text.
        pos += replace.length();
    }
    return result;
}
40 
usage(const char * argv0)41 void usage(const char *argv0) {
42     const std::string usage = R"USAGE(
43 Usage: $NAME$ argument=value [argument=value... ] [flags]
44 
45 Arguments:
46 
47     Specify the Generator's input and output values by name, in any order.
48 
49     Scalar inputs are specified in the obvious syntax, e.g.
50 
51         some_int=42 some_float=3.1415
52 
53     You can also use the text `default` or `estimate` to use the default or
54     estimate value of the given input, respectively. (You can join these by
55     commas to give default-then-estimate or estimate-then-default behaviors.)
56 
57     Buffer inputs and outputs are specified by pathname:
58 
59         some_input_buffer=/path/to/existing/file.png
60         some_output_buffer=/path/to/create/output/file.png
61 
62     We currently support JPG, PGM, PNG, PPM format. If the type or dimensions
63     of the input or output file type can't support the data (e.g., your filter
64     uses float32 input and output, and you load/save to PNG), we'll use the most
65     robust approximation within the format and issue a warning to stdout.
66 
67     (We anticipate adding other image formats in the future, in particular,
68     TIFF and TMP.)
69 
70     For inputs, there are also "pseudo-file" specifiers you can use; currently
71     supported are
72 
73         zero:[NUM,NUM,...]
74 
75         This input should be an image with the given extents, and all elements
76         set to zero of the appropriate type. (This is useful for benchmarking
77         filters that don't have performance variances with different data.)
78 
79         constant:VALUE:[NUM,NUM,...]
80 
81         Like zero, but allows an arbitrary value of the input's type.
82 
83         identity:[NUM,NUM,...]
84 
85         This input should be an image with the given extents, where diagonal
86         elements are set to one of the appropriate type, and the rest are zero.
87         Diagonal elements are those whose first two coordinates are equal.
88 
89         random:SEED:[NUM,NUM,...]
90 
91         This input should be an image with the given extents, and all elements
92         set to a random value of the appropriate type. The random values will
93         be constructed using the mt19937_64 engine, using the given seed;
94         all floating point values will be in a uniform distribution between
95         0.0 and 1.0, while integral values will be uniform across the entire
96         range of the type.
97 
98         (We anticipate adding other pseudo-file inputs in the future, e.g.
99         various random distributions, gradients, rainbows, etc.)
100 
101         In place of [NUM,NUM,...] for boundary, you may specify 'auto'; this
102         will run a bounds-query to choose a legal input size given the output
103         size constraints. (In general, this is useful only when also using
104         the --output_extents flag.)
105 
106         In place of [NUM,NUM,...] for boundary, you may specify 'estimate';
107         this will use the estimated bounds specified in the code.
108 
109 Flags:
110 
111     --describe:
112         print names and types of all arguments to stdout and exit.
113 
114     --output_extents=[NUM,NUM,...]
115         By default, we attempt to calculate a reasonable size for the output
116         buffers, based on the size of the input buffers and bounds query; if we
117         guess wrong, or you want to explicitly specify the desired output size,
118         you can specify the extent of each dimension with this flag:
119 
120         --output_extents=[1000,100]   # 2 dimensions: w=1000 h = 100
121         --output_extents=[100,200,3]  # 3 dimensions: w=100 h=200 c=3
122 
123         Note that if there are multiple outputs, all will be constrained
124         to this shape.
125 
126     --verbose:
127         emit extra diagnostic output.
128 
129     --quiet:
130         Don't log calls to halide_print() to stdout.
131 
132     --benchmarks=all:
133         Run the filter with the given arguments many times to
134         produce an estimate of average execution time; this currently
135         runs "samples" sets of "iterations" each, and chooses the fastest
136         sample set.
137 
138     --benchmark_min_time=DURATION_SECONDS [default = 0.1]:
139         Override the default minimum desired benchmarking time; ignored if
140         --benchmarks is not also specified.
141 
142     --track_memory:
143         Override Halide memory allocator to track high-water mark of memory
144         allocation during run; note that this may slow down execution, so
145         benchmarks may be inaccurate if you combine --benchmark with this.
146 
147     --default_input_buffers=VALUE:
148         Specify the value for all otherwise-unspecified buffer inputs, in the
149         same syntax in use above. If you omit =VALUE, "zero:auto" will be used.
150 
151     --default_input_scalars=VALUE:
152         Specify the value for all otherwise-unspecified scalar inputs, in the
153         same syntax in use above. If you omit =VALUE, "estimate,default"
154         will be used.
155 
156     --parsable_output:
157         Final output is emitted in an easy-to-parse output (one value per line),
158         rather than easy-for-humans.
159 
160     --estimate_all:
161         Request that all inputs and outputs are based on estimate,
162         and fill buffers with random values. This is exactly equivalent to
163         specifying
164 
165             --default_input_buffers=estimate_then_auto
166             --default_input_scalars=estimate
167             --output_extents=estimate
168 
169         and is a convenience for automated benchmarking.
170 
171 Known Issues:
172 
173     * Filters running on GPU (vs CPU) have not been tested.
174     * Filters using buffer layouts other than planar (e.g. interleaved/chunky)
175       may be buggy.
176 
177 )USAGE";
178 
179     std::string basename = split_string(replace_all(argv0, "\\", "/"), "/").back();
180     std::cout << replace_all(usage, "$NAME$", basename);
181 }
182 
183 // Utility class for installing memory-tracking machinery into the Halide runtime
184 // when --track_memory is specified.
185 class HalideMemoryTracker {
186     static HalideMemoryTracker *active;
187 
188     std::mutex tracker_mutex;
189 
190     // Total current CPU memory allocated via halide_malloc.
191     // Access controlled by tracker_mutex.
192     uint64_t memory_allocated{0};
193 
194     // High-water mark of CPU memory allocated since program start
195     // (or last call to get_cpu_memory_highwater_reset).
196     // Access controlled by tracker_mutex.
197     uint64_t memory_highwater{0};
198 
199     // Map of outstanding allocation sizes.
200     // Access controlled by tracker_mutex.
201     std::map<void *, size_t> memory_size_map;
202 
tracker_malloc_impl(void * user_context,size_t x)203     void *tracker_malloc_impl(void *user_context, size_t x) {
204         std::lock_guard<std::mutex> lock(tracker_mutex);
205 
206         void *ptr = halide_default_malloc(user_context, x);
207 
208         memory_allocated += x;
209         if (memory_highwater < memory_allocated) {
210             memory_highwater = memory_allocated;
211         }
212         if (memory_size_map.find(ptr) != memory_size_map.end()) {
213             halide_error(user_context, "Tracking error in tracker_malloc");
214         }
215         memory_size_map[ptr] = x;
216 
217         return ptr;
218     }
219 
tracker_free_impl(void * user_context,void * ptr)220     void tracker_free_impl(void *user_context, void *ptr) {
221         std::lock_guard<std::mutex> lock(tracker_mutex);
222         auto it = memory_size_map.find(ptr);
223         if (it == memory_size_map.end()) {
224             halide_error(user_context, "Tracking error in tracker_free");
225         }
226         size_t x = it->second;
227         memory_allocated -= x;
228         memory_size_map.erase(it);
229         halide_default_free(user_context, ptr);
230     }
231 
tracker_malloc(void * user_context,size_t x)232     static void *tracker_malloc(void *user_context, size_t x) {
233         return active->tracker_malloc_impl(user_context, x);
234     }
235 
tracker_free(void * user_context,void * ptr)236     static void tracker_free(void *user_context, void *ptr) {
237         return active->tracker_free_impl(user_context, ptr);
238     }
239 
240 public:
install()241     void install() {
242         assert(!active);
243         active = this;
244         halide_set_custom_malloc(tracker_malloc);
245         halide_set_custom_free(tracker_free);
246     }
247 
allocated()248     uint64_t allocated() {
249         std::lock_guard<std::mutex> lock(tracker_mutex);
250         return memory_allocated;
251     }
252 
highwater()253     uint64_t highwater() {
254         std::lock_guard<std::mutex> lock(tracker_mutex);
255         return memory_highwater;
256     }
257 
highwater_reset()258     void highwater_reset() {
259         std::lock_guard<std::mutex> lock(tracker_mutex);
260         memory_highwater = memory_allocated;
261     }
262 };
263 
264 /* static */ HalideMemoryTracker *HalideMemoryTracker::active{nullptr};
265 
// Verbosity switches consulted by the logging helpers below.
bool log_info = false;  // informational messages are off unless enabled
bool log_warn = true;   // warnings are on by default

// Writes s, unmodified, to stdout.
void do_log_cout(const std::string &s) {
    std::cout << s;
}

// Writes s, unmodified, to stderr.
void do_log_cerr(const std::string &s) {
    std::cerr << s;
}

// Writes s to stderr, but only while log_info is set.
void do_log_info(const std::string &s) {
    if (!log_info) {
        return;
    }
    do_log_cerr(s);
}

// Writes s to stderr with a "Warning: " prefix, but only while log_warn is set.
void do_log_warn(const std::string &s) {
    if (!log_warn) {
        return;
    }
    std::string message = "Warning: ";
    message += s;
    do_log_cerr(message);
}

// Writes s to stderr and then terminates the process via abort();
// this function does not return.
void do_log_fail(const std::string &s) {
    do_log_cerr(s);
    abort();
}
293 
294 }  // namespace
295 
296 namespace Halide {
297 namespace RunGen {
298 
log()299 Logger log() {
300     return {do_log_cout, do_log_info, do_log_warn, do_log_fail};
301 }
302 
303 }  // namespace RunGen
304 }  // namespace Halide
305 
main(int argc,char ** argv)306 int main(int argc, char **argv) {
307     if (argc <= 1) {
308         usage(argv[0]);
309         return 0;
310     }
311 
312     if (registered_filters == nullptr) {
313         std::cerr << "No filters registered. Compile RunGenMain.cpp along with at least one 'registration' output from a generator.\n";
314         return -1;
315     }
316 
317     // Look for --name
318     std::string filter_name;
319     for (int i = 1; i < argc; ++i) {
320         if (argv[i][0] == '-') {
321             const char *p = argv[i] + 1;  // skip -
322             if (p[0] == '-') {
323                 p++;  // allow -- as well, because why not
324             }
325             std::vector<std::string> v = split_string(p, "=");
326             std::string flag_name = v[0];
327             std::string flag_value = v.size() > 1 ? v[1] : "";
328             if (v.size() > 2) {
329                 fail() << "Invalid argument: " << argv[i];
330             }
331             if (flag_name != "name") {
332                 continue;
333             }
334             if (!filter_name.empty()) {
335                 fail() << "--name cannot be specified twice.";
336             }
337             filter_name = flag_value;
338             if (filter_name.empty()) {
339                 fail() << "--name cannot be empty.";
340             }
341         }
342     }
343 
344     auto *rf = registered_filters;
345     if (filter_name.empty()) {
346         // Just choose the first one.
347         if (rf->next != nullptr) {
348             std::ostringstream o;
349             o << "Must specify --name if multiple filters are registered; registered filters are:\n";
350             for (auto *rf = registered_filters; rf != nullptr; rf = rf->next) {
351                 o << "  " << rf->filter_metadata->name << "\n";
352             }
353             o << "\n";
354             fail() << o.str();
355         }
356     } else {
357         for (; rf != nullptr; rf = rf->next) {
358             if (filter_name == rf->filter_metadata->name) {
359                 break;
360             }
361         }
362         if (rf == nullptr) {
363             std::ostringstream o;
364             o << "Filter " << filter_name << " not found; registered filters are:\n";
365             for (auto *rf = registered_filters; rf != nullptr; rf = rf->next) {
366                 o << "  " << rf->filter_metadata->name << "\n";
367             }
368             o << "\n";
369             fail() << o.str();
370         }
371     }
372 
373     RunGen r(rf->filter_argv_call, rf->filter_metadata);
374 
375     std::string user_specified_output_shape;
376     std::set<std::string> seen_args;
377     bool benchmark = false;
378     bool track_memory = false;
379     bool describe = false;
380     double benchmark_min_time = BenchmarkConfig().min_time;
381     std::string default_input_buffers;
382     std::string default_input_scalars;
383     std::string benchmarks_flag_value;
384     for (int i = 1; i < argc; ++i) {
385         if (argv[i][0] == '-') {
386             const char *p = argv[i] + 1;  // skip -
387             if (p[0] == '-') {
388                 p++;  // allow -- as well, because why not
389             }
390             std::vector<std::string> v = split_string(p, "=");
391             std::string flag_name = v[0];
392             std::string flag_value = v.size() > 1 ? v[1] : "";
393             if (v.size() > 2) {
394                 fail() << "Invalid argument: " << argv[i];
395             }
396             if (flag_name == "name") {
397                 continue;
398             } else if (flag_name == "verbose") {
399                 if (flag_value.empty()) {
400                     flag_value = "true";
401                 }
402                 if (!parse_scalar(flag_value, &log_info)) {
403                     fail() << "Invalid value for flag: " << flag_name;
404                 }
405             } else if (flag_name == "quiet") {
406                 if (flag_value.empty()) {
407                     flag_value = "true";
408                 }
409                 bool quiet;
410                 if (!parse_scalar(flag_value, &quiet)) {
411                     fail() << "Invalid value for flag: " << flag_name;
412                 }
413                 r.set_quiet(quiet);
414             } else if (flag_name == "parsable_output") {
415                 if (flag_value.empty()) {
416                     flag_value = "true";
417                 }
418                 bool parsable_output;
419                 if (!parse_scalar(flag_value, &parsable_output)) {
420                     fail() << "Invalid value for flag: " << flag_name;
421                 }
422                 r.set_parsable_output(parsable_output);
423             } else if (flag_name == "describe") {
424                 if (flag_value.empty()) {
425                     flag_value = "true";
426                 }
427                 if (!parse_scalar(flag_value, &describe)) {
428                     fail() << "Invalid value for flag: " << flag_name;
429                 }
430             } else if (flag_name == "track_memory") {
431                 if (flag_value.empty()) {
432                     flag_value = "true";
433                 }
434                 if (!parse_scalar(flag_value, &track_memory)) {
435                     fail() << "Invalid value for flag: " << flag_name;
436                 }
437             } else if (flag_name == "benchmarks") {
438                 benchmarks_flag_value = flag_value;
439                 benchmark = true;
440             } else if (flag_name == "benchmark_min_time") {
441                 if (!parse_scalar(flag_value, &benchmark_min_time)) {
442                     fail() << "Invalid value for flag: " << flag_name;
443                 }
444             } else if (flag_name == "default_input_buffers") {
445                 default_input_buffers = flag_value;
446                 if (default_input_buffers.empty()) {
447                     default_input_buffers = "zero:auto";
448                 }
449             } else if (flag_name == "default_input_scalars") {
450                 default_input_scalars = flag_value;
451                 if (default_input_scalars.empty()) {
452                     default_input_scalars = "estimate,default";
453                 }
454             } else if (flag_name == "output_extents") {
455                 user_specified_output_shape = flag_value;
456             } else if (flag_name == "estimate_all") {
457                 // Equivalent to:
458                 // --default_input_buffers=random:0:estimate_then_auto
459                 // --default_input_scalars=estimate
460                 // --output_extents=estimate
461                 default_input_buffers = "random:0:estimate_then_auto";
462                 default_input_scalars = "estimate";
463                 user_specified_output_shape = "estimate";
464             } else {
465                 usage(argv[0]);
466                 fail() << "Unknown flag: " << flag_name;
467             }
468         } else {
469             // Assume it's a named Input or Output for the Generator,
470             // in the form name=value.
471             std::vector<std::string> v = split_string(argv[i], "=");
472             if (v.size() != 2 || v[0].empty() || v[1].empty()) {
473                 fail() << "Invalid argument: " << argv[i];
474             }
475             r.parse_one(v[0], v[1], &seen_args);
476         }
477     }
478 
479     if (describe) {
480         r.describe();
481         return 0;
482     }
483 
484     // It's OK to omit output arguments when we are benchmarking or tracking memory.
485     bool ok_to_omit_outputs = (benchmark || track_memory);
486 
487     if (benchmark && track_memory) {
488         warn() << "Using --track_memory with --benchmarks will produce inaccurate benchmark results.";
489     }
490 
491     // Check to be sure that all required arguments are specified.
492     r.validate(seen_args, default_input_buffers, default_input_scalars, ok_to_omit_outputs);
493 
494     // Parse all the input arguments, loading images as necessary.
495     // (Don't handle outputs yet.)
496     r.load_inputs(user_specified_output_shape);
497 
498     // Run a bounds query: we need to figure out how to allocate the output buffers,
499     // and the input buffers might need reshaping to satisfy constraints (e.g. a chunky/interleaved layout).
500     std::vector<Shape> constrained_shapes = r.run_bounds_query();
501 
502     r.adapt_input_buffers(constrained_shapes);
503     r.allocate_output_buffers(constrained_shapes);
504 
505     // If we're tracking memory, install the memory tracker *after* doing a bounds query.
506     HalideMemoryTracker tracker;
507     if (track_memory) {
508         tracker.install();
509     }
510 
511     // This is a single-purpose binary to benchmark this filter, so we
512     // shouldn't be eagerly returning device memory.
513     halide_reuse_device_allocations(nullptr, true);
514 
515     if (benchmark) {
516         if (benchmarks_flag_value.empty()) {
517             benchmarks_flag_value = "all";
518         }
519         if (benchmarks_flag_value != "all") {
520             fail() << "The only valid value for --benchmarks is 'all'";
521         }
522         r.run_for_benchmark(benchmark_min_time);
523     } else {
524         r.run_for_output();
525     }
526 
527     if (track_memory) {
528         // Ensure that we copy any GPU-output buffers back to host before
529         // we report on memory usage.
530         r.copy_outputs_to_host();
531         std::cout << "Maximum Halide memory: " << tracker.highwater()
532                   << " bytes for output of " << r.megapixels_out() << " mpix.\n";
533     }
534 
535     // Save the output(s), if necessary.
536     r.save_outputs();
537 
538     return 0;
539 }
540