1 #include "Module.h"
2
3 #include <array>
4 #include <fstream>
5 #include <future>
6 #include <utility>
7
8 #include "CodeGen_C.h"
9 #include "CodeGen_Internal.h"
10 #include "CodeGen_PyTorch.h"
11 #include "CompilerLogger.h"
12 #include "Debug.h"
13 #include "HexagonOffload.h"
14 #include "IROperator.h"
15 #include "LLVM_Headers.h"
16 #include "LLVM_Output.h"
17 #include "LLVM_Runtime_Linker.h"
18 #include "Pipeline.h"
19 #include "PythonExtensionGen.h"
20 #include "StmtToHtml.h"
21
22 using Halide::Internal::debug;
23
24 namespace Halide {
25 namespace Internal {
26
27 // This is the One True Source of the known output types for halide,
28 // and the appropriate file extension for each output type. If you are
29 // explicitly managing file extensions somewhere else, you are probably
30 // doing it wrong; please prefer to use this table as the source of truth.
31 //
32 // Note that we deliberately default to ".py.cpp" (rather than .py.c) here for python_extension;
33 // in theory, the Python extension file we generate can be compiled just
34 // fine as a plain-C file... but if we are building with cpp-name-mangling
35 // enabled in the target, we will include generated .h files that can't be compiled.
36 // We really don't want to vary the file extensions based on target flags,
37 // and in practice, it's extremely unlikely that anyone needs to rely on this
38 // being pure C output (vs possibly C++).
get_output_info(const Target & target)39 std::map<Output, const OutputInfo> get_output_info(const Target &target) {
40 constexpr bool IsMulti = true;
41 constexpr bool IsSingle = false;
42 const bool is_windows_coff = target.os == Target::Windows;
43 std::map<Output, const OutputInfo> ext = {
44 {Output::assembly, {"assembly", ".s", IsMulti}},
45 {Output::bitcode, {"bitcode", ".bc", IsMulti}},
46 {Output::c_header, {"c_header", ".h", IsSingle}},
47 {Output::c_source, {"c_source", ".halide_generated.cpp", IsSingle}},
48 {Output::compiler_log, {"compiler_log", ".halide_compiler_log", IsSingle}},
49 {Output::cpp_stub, {"cpp_stub", ".stub.h", IsSingle}},
50 {Output::featurization, {"featurization", ".featurization", IsMulti}},
51 {Output::llvm_assembly, {"llvm_assembly", ".ll", IsMulti}},
52 {Output::object, {"object", is_windows_coff ? ".obj" : ".o", IsMulti}},
53 {Output::python_extension, {"python_extension", ".py.cpp", IsSingle}},
54 {Output::pytorch_wrapper, {"pytorch_wrapper", ".pytorch.h", IsSingle}},
55 {Output::registration, {"registration", ".registration.cpp", IsSingle}},
56 {Output::schedule, {"schedule", ".schedule.h", IsSingle}},
57 {Output::static_library, {"static_library", is_windows_coff ? ".lib" : ".a", IsSingle}},
58 {Output::stmt, {"stmt", ".stmt", IsMulti}},
59 {Output::stmt_html, {"stmt_html", ".stmt.html", IsMulti}},
60 };
61 return ext;
62 }
63
64 namespace {
65
66 class TemporaryObjectFileDir final {
67 public:
TemporaryObjectFileDir()68 TemporaryObjectFileDir()
69 : dir_path(dir_make_temp()) {
70 }
~TemporaryObjectFileDir()71 ~TemporaryObjectFileDir() {
72 for (const auto &f : dir_files) {
73 debug(1) << "file_unlink: " << f << "\n";
74 file_unlink(f);
75 }
76 debug(1) << "dir_rmdir: " << dir_path << "\n";
77 dir_rmdir(dir_path);
78 }
add_temp_file(const std::string & base_path_name,const std::string & suffix,const Target & target,bool in_front=false)79 std::string add_temp_file(const std::string &base_path_name,
80 const std::string &suffix,
81 const Target &target,
82 bool in_front = false) {
83 size_t slash_idx = base_path_name.rfind('/');
84 size_t backslash_idx = base_path_name.rfind('\\');
85 if (slash_idx == std::string::npos) {
86 slash_idx = 0;
87 } else {
88 slash_idx++;
89 }
90 if (backslash_idx == std::string::npos) {
91 backslash_idx = 0;
92 } else {
93 backslash_idx++;
94 }
95 std::string base_name = base_path_name.substr(std::max(slash_idx, backslash_idx));
96 std::string name = dir_path + "/" + base_name + suffix;
97 debug(1) << "add_temp_object_file: " << name << "\n";
98 if (in_front) {
99 dir_files.insert(dir_files.begin(), name);
100 } else {
101 dir_files.push_back(name);
102 }
103 return name;
104 }
105
add_temp_object_file(const std::string & base_path_name,const std::string & suffix,const Target & target,bool in_front=false)106 std::string add_temp_object_file(const std::string &base_path_name,
107 const std::string &suffix,
108 const Target &target,
109 bool in_front = false) {
110 const char *ext = (target.os == Target::Windows) ? ".obj" : ".o";
111 return add_temp_file(base_path_name, suffix + ext, target, in_front);
112 }
113
files()114 const std::vector<std::string> &files() {
115 return dir_files;
116 }
117
118 private:
119 const std::string dir_path;
120 std::vector<std::string> dir_files;
121 TemporaryObjectFileDir(const TemporaryObjectFileDir &) = delete;
122 void operator=(const TemporaryObjectFileDir &) = delete;
123 };
124
// Given a pathname of the form /path/to/name.ext, insert suffix before the
// extension to produce /path/to/namesuffix.ext.
//
// The extension is taken to start at the first '.' in the final path
// component, so "name.stmt.html" becomes "namesuffix.stmt.html". If the final
// component contains no '.', the suffix is simply appended to the path.
// Both '/' and '\\' are recognized as directory separators; the final
// component begins after whichever of them occurs LAST, so dots in directory
// names are never mistaken for the start of the extension.
std::string add_suffix(const std::string &path, const std::string &suffix) {
    const size_t last_sep = path.find_last_of("/\\");
    const size_t name_start = (last_sep == std::string::npos) ? 0 : last_sep + 1;
    const size_t dot = path.find('.', name_start);
    if (dot == std::string::npos) {
        return path + suffix;
    }
    return path.substr(0, dot) + suffix + path.substr(dot);
}
138
validate_outputs(const std::map<Output,std::string> & in)139 void validate_outputs(const std::map<Output, std::string> &in) {
140 // We don't care about the extensions, so any Target will do
141 auto known = get_output_info(Target());
142 for (auto it : in) {
143 internal_assert(!it.second.empty()) << "Empty value for output: " << known.at(it.first).name;
144 }
145 }
146
contains(const std::map<Output,std::string> & in,const Output & key)147 bool contains(const std::map<Output, std::string> &in, const Output &key) {
148 return in.find(key) != in.end();
149 }
150
emit_registration(const Module & m,std::ostream & stream)151 void emit_registration(const Module &m, std::ostream &stream) {
152 /*
153 This relies on the filter library being linked in a way that doesn't
154 dead-strip "unused" initialization code; this may mean that you need to
155 explicitly link with with --whole-archive (or the equivalent) to ensure
156 that the registration code isn't omitted. Sadly, there's no portable way
157 to do this, so you may need to take care in your make/build/etc files:
158
159 Linux: -Wl,--whole-archive "/path/to/library" -Wl,-no-whole-archive
160 Darwin/OSX: -Wl,-force_load,/path/to/library
161 VS2015 R2+: /WHOLEARCHIVE:/path/to/library.lib
162 Bazel: alwayslink=1
163
164 Note also that registration files deliberately have no #includes, and
165 are specifically designed to be legal to concatenate into a single
166 source file; it should be equivalent to compile-and-link multiple
167 registration files separately, or to concatenate multiple registration
168 files into a single one which is then compiled.
169 */
170
171 const std::string registration_template = R"INLINE_CODE(
172 // MACHINE GENERATED -- DO NOT EDIT
173
174 extern "C" {
175 struct halide_filter_metadata_t;
176 void halide_register_argv_and_metadata(
177 int (*filter_argv_call)(void **),
178 const struct halide_filter_metadata_t *filter_metadata,
179 const char * const *extra_key_value_pairs
180 );
181 }
182
183 $NAMESPACEOPEN$
184 extern int $SHORTNAME$_argv(void **args);
185 extern const struct halide_filter_metadata_t *$SHORTNAME$_metadata();
186 $NAMESPACECLOSE$
187
188 #ifdef HALIDE_REGISTER_EXTRA_KEY_VALUE_PAIRS_FUNC
189 extern "C" const char * const *HALIDE_REGISTER_EXTRA_KEY_VALUE_PAIRS_FUNC();
190 #endif // HALIDE_REGISTER_EXTRA_KEY_VALUE_PAIRS_FUNC
191
192 namespace $NREGS$ {
193 namespace {
194 struct Registerer {
195 Registerer() {
196 #ifdef HALIDE_REGISTER_EXTRA_KEY_VALUE_PAIRS_FUNC
197 halide_register_argv_and_metadata(::$FULLNAME$_argv, ::$FULLNAME$_metadata(), HALIDE_REGISTER_EXTRA_KEY_VALUE_PAIRS_FUNC());
198 #else
199 halide_register_argv_and_metadata(::$FULLNAME$_argv, ::$FULLNAME$_metadata(), nullptr);
200 #endif // HALIDE_REGISTER_EXTRA_KEY_VALUE_PAIRS_FUNC
201 }
202 };
203 static Registerer registerer;
204 } // namespace
205 } // $NREGS$
206
207 )INLINE_CODE";
208
209 for (const auto &f : m.functions()) {
210 if (f.linkage == LinkageType::ExternalPlusMetadata) {
211 std::vector<std::string> namespaces;
212 std::string simple_name = extract_namespaces(f.name, namespaces);
213 std::string nsopen, nsclose;
214 for (const auto &ns : namespaces) {
215 nsopen += "namespace " + ns + " { ";
216 nsclose += "}";
217 }
218 if (!m.target().has_feature(Target::CPlusPlusMangling)) {
219 internal_assert(namespaces.empty());
220 nsopen = "extern \"C\" {";
221 nsclose = "}";
222 }
223 std::string nsreg = "halide_nsreg_" + replace_all(f.name, "::", "_");
224 std::string s = replace_all(registration_template, "$NAMESPACEOPEN$", nsopen);
225 s = replace_all(s, "$SHORTNAME$", simple_name);
226 s = replace_all(s, "$NAMESPACECLOSE$", nsclose);
227 s = replace_all(s, "$FULLNAME$", f.name);
228 s = replace_all(s, "$NREGS$", nsreg);
229 stream << s;
230 }
231 }
232 }
233
// Returns a copy of `src` in which `indent` has been inserted at the start of
// every non-empty line. Lines consisting solely of '\n' are left untouched.
std::string indent_string(const std::string &src, const std::string &indent) {
    std::string result;
    bool at_line_start = true;
    for (const char c : src) {
        if (at_line_start && c != '\n') {
            result += indent;
        }
        result += c;
        at_line_start = (c == '\n');
    }
    return result;
}
248
emit_schedule_file(const std::string & name,const std::vector<Target> & targets,const std::string & scheduler_name,const std::string & machine_params_string,const std::string & body,std::ostream & stream)249 void emit_schedule_file(const std::string &name,
250 const std::vector<Target> &targets,
251 const std::string &scheduler_name,
252 const std::string &machine_params_string,
253 const std::string &body,
254 std::ostream &stream) {
255 std::string s = R"INLINE_CODE(#ifndef $CLEANNAME$_SCHEDULE_H
256 #define $CLEANNAME$_SCHEDULE_H
257
258 // MACHINE GENERATED -- DO NOT EDIT
259 // This schedule was automatically generated by $SCHEDULER$
260 // for target=$TARGET$ // NOLINT
261 // with machine_params=$MACHINEPARAMS$
262
263 #include "Halide.h"
264
265 $NAMESPACEOPEN$
266 inline void apply_schedule_$SHORTNAME$(
267 ::Halide::Pipeline pipeline,
268 ::Halide::Target target
269 ) {
270 using ::Halide::Func;
271 using ::Halide::MemoryType;
272 using ::Halide::RVar;
273 using ::Halide::TailStrategy;
274 using ::Halide::Var;
275 $BODY$
276 }
277 $NAMESPACECLOSE$
278 #endif // $CLEANNAME$_SCHEDULE_H
279 )INLINE_CODE";
280
281 // For logging in the comment, strip out features that are almost
282 // certainly irrelevant to scheduling issues, to make for easier reading
283 const Target::Feature irrelevant_features[] = {
284 Target::CPlusPlusMangling,
285 Target::NoRuntime,
286 Target::UserContext,
287 };
288
289 std::vector<std::string> namespaces;
290 std::string simple_name = extract_namespaces(name, namespaces);
291 std::string nsopen, nsclose;
292 for (const auto &ns : namespaces) {
293 nsopen += "namespace " + ns + " {\n";
294 nsclose += "} // namespace " + ns + "\n";
295 }
296 std::string clean_name = replace_all(name, "::", "_");
297 std::string target_string;
298 for (Target t : targets) {
299 if (!target_string.empty()) target_string += ",";
300 for (auto f : irrelevant_features) {
301 t = t.without_feature(f);
302 }
303 target_string += t.to_string();
304 }
305 std::string body_text = indent_string(body, " ");
306 s = replace_all(s, "$SCHEDULER$", scheduler_name);
307 s = replace_all(s, "$NAMESPACEOPEN$", nsopen);
308 s = replace_all(s, "$SHORTNAME$", simple_name);
309 s = replace_all(s, "$CLEANNAME$", clean_name);
310 s = replace_all(s, "$NAMESPACECLOSE$", nsclose);
311 s = replace_all(s, "$TARGET$", target_string);
312 s = replace_all(s, "$BODY$", body_text);
313 s = replace_all(s, "$MACHINEPARAMS$", machine_params_string);
314 stream << s;
315 }
316
317 } // namespace
318
319 struct ModuleContents {
320 mutable RefCount ref_count;
321 std::string name;
322 Target target;
323 std::vector<Buffer<>> buffers;
324 std::vector<Internal::LoweredFunc> functions;
325 std::vector<Module> submodules;
326 std::vector<ExternalCode> external_code;
327 std::map<std::string, std::string> metadata_name_map;
328 bool any_strict_float{false};
329 std::unique_ptr<AutoSchedulerResults> auto_scheduler_results;
330 };
331
332 template<>
ref_count(const ModuleContents * t)333 RefCount &ref_count<ModuleContents>(const ModuleContents *t) noexcept {
334 return t->ref_count;
335 }
336
337 template<>
destroy(const ModuleContents * t)338 void destroy<ModuleContents>(const ModuleContents *t) {
339 delete t;
340 }
341
LoweredFunc(const std::string & name,const std::vector<LoweredArgument> & args,Stmt body,LinkageType linkage,NameMangling name_mangling)342 LoweredFunc::LoweredFunc(const std::string &name,
343 const std::vector<LoweredArgument> &args,
344 Stmt body,
345 LinkageType linkage,
346 NameMangling name_mangling)
347 : name(name), args(args), body(std::move(body)), linkage(linkage), name_mangling(name_mangling) {
348 }
349
LoweredFunc(const std::string & name,const std::vector<Argument> & args,Stmt body,LinkageType linkage,NameMangling name_mangling)350 LoweredFunc::LoweredFunc(const std::string &name,
351 const std::vector<Argument> &args,
352 Stmt body,
353 LinkageType linkage,
354 NameMangling name_mangling)
355 : name(name), body(std::move(body)), linkage(linkage), name_mangling(name_mangling) {
356 for (const Argument &i : args) {
357 this->args.emplace_back(i);
358 }
359 }
360
361 } // namespace Internal
362
363 using namespace Halide::Internal;
364
Module(const std::string & name,const Target & target)365 Module::Module(const std::string &name, const Target &target)
366 : contents(new Internal::ModuleContents) {
367 contents->name = name;
368 contents->target = target;
369 }
370
set_auto_scheduler_results(const AutoSchedulerResults & auto_scheduler_results)371 void Module::set_auto_scheduler_results(const AutoSchedulerResults &auto_scheduler_results) {
372 internal_assert(contents->auto_scheduler_results.get() == nullptr);
373 contents->auto_scheduler_results.reset(new AutoSchedulerResults(auto_scheduler_results));
374 }
375
set_any_strict_float(bool any_strict_float)376 void Module::set_any_strict_float(bool any_strict_float) {
377 contents->any_strict_float = any_strict_float;
378 }
379
target() const380 const Target &Module::target() const {
381 return contents->target;
382 }
383
name() const384 const std::string &Module::name() const {
385 return contents->name;
386 }
387
get_auto_scheduler_results() const388 const AutoSchedulerResults *Module::get_auto_scheduler_results() const {
389 return contents->auto_scheduler_results.get();
390 }
391
any_strict_float() const392 bool Module::any_strict_float() const {
393 return contents->any_strict_float;
394 }
395
buffers() const396 const std::vector<Buffer<>> &Module::buffers() const {
397 return contents->buffers;
398 }
399
functions() const400 const std::vector<Internal::LoweredFunc> &Module::functions() const {
401 return contents->functions;
402 }
403
functions()404 std::vector<Internal::LoweredFunc> &Module::functions() {
405 return contents->functions;
406 }
407
submodules() const408 const std::vector<Module> &Module::submodules() const {
409 return contents->submodules;
410 }
411
external_code() const412 const std::vector<ExternalCode> &Module::external_code() const {
413 return contents->external_code;
414 }
415
get_function_by_name(const std::string & name) const416 Internal::LoweredFunc Module::get_function_by_name(const std::string &name) const {
417 for (const auto &f : functions()) {
418 if (f.name == name) {
419 return f;
420 }
421 }
422 user_error << "get_function_by_name: function " << name << " not found.\n";
423 return Internal::LoweredFunc("", std::vector<Argument>{}, {}, LinkageType::External);
424 }
425
append(const Buffer<> & buffer)426 void Module::append(const Buffer<> &buffer) {
427 contents->buffers.push_back(buffer);
428 }
429
append(const Internal::LoweredFunc & function)430 void Module::append(const Internal::LoweredFunc &function) {
431 contents->functions.push_back(function);
432 }
433
append(const Module & module)434 void Module::append(const Module &module) {
435 contents->submodules.push_back(module);
436 }
437
append(const ExternalCode & external_code)438 void Module::append(const ExternalCode &external_code) {
439 contents->external_code.push_back(external_code);
440 }
441
link_modules(const std::string & name,const std::vector<Module> & modules)442 Module link_modules(const std::string &name, const std::vector<Module> &modules) {
443 Module output(name, modules.front().target());
444
445 for (size_t i = 0; i < modules.size(); i++) {
446 const Module &input = modules[i];
447
448 if (output.target() != input.target()) {
449 user_error << "Mismatched targets in modules to link ("
450 << output.name() << ", " << output.target().to_string()
451 << "), ("
452 << input.name() << ", " << input.target().to_string() << ")\n";
453 }
454
455 // TODO(dsharlet): Check for naming collisions, maybe rename
456 // internal linkage declarations in the case of collision.
457 for (const auto &b : input.buffers()) {
458 output.append(b);
459 }
460 for (const auto &f : input.functions()) {
461 output.append(f);
462 }
463 }
464
465 return output;
466 }
467
compile_to_buffer() const468 Buffer<uint8_t> Module::compile_to_buffer() const {
469 // TODO: This Hexagon specific code should be removed as soon as possible.
470 // This may involve adding more general support for post-processing and
471 // a way of specifying to use it.
472 if (target().arch == Target::Hexagon) {
473 return compile_module_to_hexagon_shared_object(*this);
474 }
475
476 llvm::LLVMContext context;
477 std::unique_ptr<llvm::Module> llvm_module(compile_module_to_llvm_module(*this, context));
478
479 llvm::SmallVector<char, 4096> object;
480 llvm::raw_svector_ostream object_stream(object);
481 compile_llvm_module_to_object(*llvm_module, object_stream);
482
483 if (debug::debug_level() >= 2) {
484 debug(2) << "Submodule assembly for " << name() << ": "
485 << "\n";
486 llvm::SmallString<4096> assembly;
487 llvm::raw_svector_ostream assembly_stream(assembly);
488 compile_llvm_module_to_assembly(*llvm_module, assembly_stream);
489 debug(2) << assembly.c_str() << "\n";
490 }
491
492 Buffer<uint8_t> result(object.size(), name());
493 memcpy(result.data(), reinterpret_cast<uint8_t *>(&object[0]), object.size());
494 return result;
495 }
496
resolve_submodules() const497 Module Module::resolve_submodules() const {
498 if (submodules().empty()) {
499 return *this;
500 }
501
502 Module lowered_module(name(), target());
503
504 for (const auto &f : functions()) {
505 lowered_module.append(f);
506 }
507 for (const auto &buf : buffers()) {
508 lowered_module.append(buf);
509 }
510 for (const auto &ec : external_code()) {
511 lowered_module.append(ec);
512 }
513 for (const auto &m : submodules()) {
514 Module copy(m.resolve_submodules());
515
516 // Propagate external code blocks.
517 for (const auto &ec : external_code()) {
518 // TODO(zalman): Is this the right thing to do?
519 bool already_in_list = false;
520 for (const auto &ec_sub : copy.external_code()) {
521 if (ec_sub.name() == ec.name()) {
522 already_in_list = true;
523 break;
524 }
525 }
526 if (!already_in_list) {
527 copy.append(ec);
528 }
529 }
530
531 auto buf = copy.compile_to_buffer();
532 lowered_module.append(buf);
533 }
534 // Copy the autoscheduler results back into the lowered module after resolving the submodules.
535 if (auto *r = contents->auto_scheduler_results.get()) {
536 lowered_module.set_auto_scheduler_results(*r);
537 }
538 return lowered_module;
539 }
540
remap_metadata_name(const std::string & from,const std::string & to) const541 void Module::remap_metadata_name(const std::string &from, const std::string &to) const {
542 internal_assert(contents->metadata_name_map.find(from) == contents->metadata_name_map.end());
543 internal_assert(contents->metadata_name_map.find(to) == contents->metadata_name_map.end());
544 contents->metadata_name_map[from] = to;
545 }
546
get_metadata_name_map() const547 std::map<std::string, std::string> Module::get_metadata_name_map() const {
548 return contents->metadata_name_map;
549 }
550
compile(const std::map<Output,std::string> & output_files) const551 void Module::compile(const std::map<Output, std::string> &output_files) const {
552 validate_outputs(output_files);
553
554 // output stmt and html prior to resolving submodules. We need to
555 // clear the output after writing it, otherwise the output will
556 // be overwritten by recursive calls after submodules are resolved.
557 if (contains(output_files, Output::stmt)) {
558 debug(1) << "Module.compile(): stmt " << output_files.at(Output::stmt) << "\n";
559 std::ofstream file(output_files.at(Output::stmt));
560 file << *this;
561 }
562 if (contains(output_files, Output::stmt_html)) {
563 debug(1) << "Module.compile(): stmt_html " << output_files.at(Output::stmt_html) << "\n";
564 Internal::print_to_html(output_files.at(Output::stmt_html), *this);
565 }
566
567 // If there are submodules, recursively lower submodules to
568 // buffers on a copy of the module being compiled, then compile
569 // the copied module.
570 if (!submodules().empty()) {
571 std::map<Output, std::string> output_files_copy = output_files;
572 output_files_copy.erase(Output::stmt);
573 output_files_copy.erase(Output::stmt_html);
574 resolve_submodules().compile(output_files_copy);
575 return;
576 }
577
578 auto *logger = get_compiler_logger();
579 if (contains(output_files, Output::object) || contains(output_files, Output::assembly) ||
580 contains(output_files, Output::bitcode) || contains(output_files, Output::llvm_assembly) ||
581 contains(output_files, Output::static_library)) {
582 llvm::LLVMContext context;
583 std::unique_ptr<llvm::Module> llvm_module(compile_module_to_llvm_module(*this, context));
584
585 if (contains(output_files, Output::object)) {
586 const auto &f = output_files.at(Output::object);
587 debug(1) << "Module.compile(): object " << f << "\n";
588 auto out = make_raw_fd_ostream(f);
589 compile_llvm_module_to_object(*llvm_module, *out);
590 if (logger) {
591 out->flush();
592 logger->record_object_code_size(file_stat(f).file_size);
593 }
594 }
595 if (contains(output_files, Output::static_library)) {
596 // To simplify the code, we always create a temporary object output
597 // here, even if output_files.at(Output::object) was also set: in practice,
598 // no real-world code ever sets both object and static_library
599 // at the same time, so there is no meaningful performance advantage
600 // to be had.
601 TemporaryObjectFileDir temp_dir;
602 {
603 std::string object = temp_dir.add_temp_object_file(output_files.at(Output::static_library), "", target());
604 debug(1) << "Module.compile(): temporary object " << object << "\n";
605 auto out = make_raw_fd_ostream(object);
606 compile_llvm_module_to_object(*llvm_module, *out);
607 out->flush(); // create_static_library() is happier if we do this
608 if (logger && !contains(output_files, Output::object)) {
609 // Don't double-record object-code size if we already recorded it for object
610 logger->record_object_code_size(file_stat(object).file_size);
611 }
612 }
613 debug(1) << "Module.compile(): static_library " << output_files.at(Output::static_library) << "\n";
614 Target base_target(target().os, target().arch, target().bits);
615 create_static_library(temp_dir.files(), base_target, output_files.at(Output::static_library));
616 }
617 if (contains(output_files, Output::assembly)) {
618 debug(1) << "Module.compile(): assembly " << output_files.at(Output::assembly) << "\n";
619 auto out = make_raw_fd_ostream(output_files.at(Output::assembly));
620 compile_llvm_module_to_assembly(*llvm_module, *out);
621 }
622 if (contains(output_files, Output::bitcode)) {
623 debug(1) << "Module.compile(): bitcode " << output_files.at(Output::bitcode) << "\n";
624 auto out = make_raw_fd_ostream(output_files.at(Output::bitcode));
625 compile_llvm_module_to_llvm_bitcode(*llvm_module, *out);
626 }
627 if (contains(output_files, Output::llvm_assembly)) {
628 debug(1) << "Module.compile(): llvm_assembly " << output_files.at(Output::llvm_assembly) << "\n";
629 auto out = make_raw_fd_ostream(output_files.at(Output::llvm_assembly));
630 compile_llvm_module_to_llvm_assembly(*llvm_module, *out);
631 }
632 }
633 if (contains(output_files, Output::c_header)) {
634 debug(1) << "Module.compile(): c_header " << output_files.at(Output::c_header) << "\n";
635 std::ofstream file(output_files.at(Output::c_header));
636 Internal::CodeGen_C cg(file,
637 target(),
638 target().has_feature(Target::CPlusPlusMangling) ? Internal::CodeGen_C::CPlusPlusHeader : Internal::CodeGen_C::CHeader,
639 output_files.at(Output::c_header));
640 cg.compile(*this);
641 }
642 if (contains(output_files, Output::c_source)) {
643 debug(1) << "Module.compile(): c_source " << output_files.at(Output::c_source) << "\n";
644 std::ofstream file(output_files.at(Output::c_source));
645 Internal::CodeGen_C cg(file,
646 target(),
647 target().has_feature(Target::CPlusPlusMangling) ? Internal::CodeGen_C::CPlusPlusImplementation : Internal::CodeGen_C::CImplementation);
648 cg.compile(*this);
649 }
650 if (contains(output_files, Output::python_extension)) {
651 debug(1) << "Module.compile(): python_extension " << output_files.at(Output::python_extension) << "\n";
652 std::ofstream file(output_files.at(Output::python_extension));
653 Internal::PythonExtensionGen python_extension_gen(file);
654 python_extension_gen.compile(*this);
655 }
656 if (contains(output_files, Output::schedule)) {
657 debug(1) << "Module.compile(): schedule " << output_files.at(Output::schedule) << "\n";
658 std::ofstream file(output_files.at(Output::schedule));
659 auto *r = contents->auto_scheduler_results.get();
660 std::string scheduler = r ? r->scheduler_name : "(None)";
661 std::string machine_params = r ? r->machine_params_string : "(None)";
662 std::string body = r && !r->schedule_source.empty() ? r->schedule_source : "// No autoscheduler has been run for this Generator.\n";
663 emit_schedule_file(name(), {target()}, scheduler, machine_params, body, file);
664 }
665 if (contains(output_files, Output::featurization)) {
666 debug(1) << "Module.compile(): featurization " << output_files.at(Output::featurization) << "\n";
667 // If the featurization data is empty, just write an empty file
668 std::ofstream binfile(output_files.at(Output::featurization), std::ios::binary | std::ios_base::trunc);
669 auto *r = contents->auto_scheduler_results.get();
670 if (r) {
671 binfile.write((const char *)r->featurization.data(), r->featurization.size());
672 }
673 binfile.close();
674 }
675 if (contains(output_files, Output::registration)) {
676 debug(1) << "Module.compile(): registration " << output_files.at(Output::registration) << "\n";
677 std::ofstream file(output_files.at(Output::registration));
678 emit_registration(*this, file);
679 file.close();
680 internal_assert(!file.fail());
681 }
682 if (contains(output_files, Output::pytorch_wrapper)) {
683 debug(1) << "Module.compile(): pytorch_wrapper " << output_files.at(Output::pytorch_wrapper) << "\n";
684
685 std::ofstream file(output_files.at(Output::pytorch_wrapper));
686 Internal::CodeGen_PyTorch cg(file);
687 cg.compile(*this);
688 file.close();
689 internal_assert(!file.fail());
690 }
691 if (contains(output_files, Output::compiler_log)) {
692 debug(1) << "Module.compile(): compiler_log " << output_files.at(Output::compiler_log) << "\n";
693 std::ofstream file(output_files.at(Output::compiler_log));
694 internal_assert(get_compiler_logger() != nullptr);
695 get_compiler_logger()->emit_to_stream(file);
696 file.close();
697 internal_assert(!file.fail());
698 }
699 // If HL_DEBUG_COMPILER_LOGGER is set, dump the log (if any) to stderr now, whether or it is required
700 if (get_env_variable("HL_DEBUG_COMPILER_LOGGER") == "1" && get_compiler_logger() != nullptr) {
701 get_compiler_logger()->emit_to_stream(std::cerr);
702 }
703 }
704
compile_standalone_runtime(const std::map<Output,std::string> & output_files,Target t)705 std::map<Output, std::string> compile_standalone_runtime(const std::map<Output, std::string> &output_files, Target t) {
706 validate_outputs(output_files);
707
708 Module empty("standalone_runtime", t.without_feature(Target::NoRuntime).without_feature(Target::JIT));
709 // For runtime, it only makes sense to output object files or static_library, so ignore
710 // everything else.
711 std::map<Output, std::string> actual_outputs;
712 for (auto key : {Output::object, Output::static_library}) {
713 auto it = output_files.find(key);
714 if (it != output_files.end()) {
715 actual_outputs[key] = it->second;
716 }
717 }
718 empty.compile(actual_outputs);
719 return actual_outputs;
720 }
721
compile_standalone_runtime(const std::string & object_filename,Target t)722 void compile_standalone_runtime(const std::string &object_filename, Target t) {
723 compile_standalone_runtime({{Output::object, object_filename}}, t);
724 }
725
726 namespace {
727
728 class ScopedCompilerLogger {
729 public:
ScopedCompilerLogger(const CompilerLoggerFactory & compiler_logger_factory,const std::string & fn_name,const Target & target)730 ScopedCompilerLogger(const CompilerLoggerFactory &compiler_logger_factory, const std::string &fn_name, const Target &target) {
731 internal_assert(!get_compiler_logger());
732 if (compiler_logger_factory) {
733 set_compiler_logger(compiler_logger_factory(fn_name, target));
734 } else {
735 set_compiler_logger(nullptr);
736 }
737 }
738
~ScopedCompilerLogger()739 ~ScopedCompilerLogger() {
740 set_compiler_logger(nullptr);
741 }
742 };
743
744 } // namespace
745
compile_multitarget(const std::string & fn_name,const std::map<Output,std::string> & output_files,const std::vector<Target> & targets,const std::vector<std::string> & suffixes,const ModuleFactory & module_factory,const CompilerLoggerFactory & compiler_logger_factory)746 void compile_multitarget(const std::string &fn_name,
747 const std::map<Output, std::string> &output_files,
748 const std::vector<Target> &targets,
749 const std::vector<std::string> &suffixes,
750 const ModuleFactory &module_factory,
751 const CompilerLoggerFactory &compiler_logger_factory) {
752 validate_outputs(output_files);
753
754 user_assert(!fn_name.empty()) << "Function name must be specified.\n";
755 user_assert(!targets.empty()) << "Must specify at least one target.\n";
756 user_assert(suffixes.empty() || suffixes.size() == targets.size())
757 << "The suffixes list must be empty or the same length as the targets list.\n";
758
759 // The final target in the list is considered "baseline", and is used
760 // for (e.g.) the runtime and shared code. It is often just arch-bits-os
761 // with no other features (though this is *not* a requirement).
762 const Target &base_target = targets.back();
763
764 // JIT makes no sense.
765 user_assert(!base_target.has_feature(Target::JIT)) << "JIT not allowed for compile_multitarget.\n";
766
767 const auto suffix_for_entry = [&](int i) -> std::string {
768 const std::string suffix = "-" + (suffixes.empty() ? targets[i].to_string() : suffixes[i]);
769 return suffix;
770 };
771
772 const auto add_suffixes = [&](const std::map<Output, std::string> &in, const std::string &suffix) -> std::map<Output, std::string> {
773 // is_multi doesn't vary by Target, so we can pass an empty target here safely
774 auto output_info = get_output_info(Target());
775 std::map<Output, std::string> out = in;
776 for (auto &it : out) {
777 if (output_info[it.first].is_multi) {
778 out[it.first] = add_suffix(it.second, suffix);
779 }
780 }
781 return out;
782 };
783
784 // If only one target, don't bother with the runtime feature detection wrapping.
785 const bool needs_wrapper = (targets.size() > 1);
786 if (targets.size() == 1) {
787 debug(1) << "compile_multitarget: single target is " << base_target.to_string() << "\n";
788 ScopedCompilerLogger activate(compiler_logger_factory, fn_name, base_target);
789
790 // If we want to have single-output object files use the target suffix, we'd
791 // want to do this instead:
792 //
793 // auto sub_out = add_suffixes(output_files, suffix_for_entry(0));
794 // module_factory(fn_name, base_target).compile(sub_out);
795 //
796 // This would make the filename outputs more symmetrical (ie the same for n=1 as for n>1)
797 // but at the expense of breaking existing users. So for now, we're going to continue
798 // with the legacy treatment below:
799 module_factory(fn_name, base_target).compile(output_files);
800 return;
801 }
802
803 user_assert(((int)contains(output_files, Output::object) + (int)contains(output_files, Output::static_library)) == 1)
804 << "compile_multitarget() expects exactly one of 'object' and 'static_library' to be specified when multiple targets are specified.\n";
805
806 // For safety, the runtime must be built only with features common to all
807 // of the targets; given an unusual ordering like
808 //
809 // x86-64-linux,x86-64-sse41
810 //
811 // we should still always be *correct*: this ordering would never select sse41
812 // (since x86-64-linux would be selected first due to ordering), but could
813 // crash on non-sse41 machines (if we generated a runtime with sse41 instructions
814 // included). So we'll keep track of the common features as we walk thru the targets.
815
816 // Using something like std::bitset would be arguably cleaner here, but we need an
817 // array-of-uint64 for calls to halide_can_use_target_features() anyway,
818 // so we'll just build and maintain in that form to avoid extra conversion.
819 constexpr int kFeaturesWordCount = (Target::FeatureEnd + 63) / (sizeof(uint64_t) * 8);
820 uint64_t runtime_features[kFeaturesWordCount] = {(uint64_t)-1LL};
821
822 TemporaryObjectFileDir temp_obj_dir, temp_compiler_log_dir;
823 std::vector<Expr> wrapper_args;
824 std::vector<LoweredArgument> base_target_args;
825 std::vector<AutoSchedulerResults> auto_scheduler_results;
826
827 for (size_t i = 0; i < targets.size(); ++i) {
828 const Target &target = targets[i];
829
830 // arch-bits-os must be identical across all targets.
831 if (target.os != base_target.os ||
832 target.arch != base_target.arch ||
833 target.bits != base_target.bits) {
834 user_error << "All Targets must have matching arch-bits-os for compile_multitarget.\n";
835 }
836 // Some features must match across all targets.
837 static const std::array<Target::Feature, 9> must_match_features = {{
838 Target::ASAN,
839 Target::CPlusPlusMangling,
840 Target::Debug,
841 Target::JIT,
842 Target::Matlab,
843 Target::MSAN,
844 Target::NoRuntime,
845 Target::TSAN,
846 Target::UserContext,
847 }};
848 for (auto f : must_match_features) {
849 if (target.has_feature(f) != base_target.has_feature(f)) {
850 user_error << "All Targets must have feature '" << Target::feature_to_name(f) << "'' set identically for compile_multitarget.\n";
851 break;
852 }
853 }
854
855 // Each sub-target has a function name that is the 'real' name plus a suffix
856 std::string suffix = suffix_for_entry(i);
857 std::string sub_fn_name = needs_wrapper ? (fn_name + suffix) : fn_name;
858
859 // We always produce the runtime separately, so add NoRuntime explicitly.
860 // Matlab should be added to the wrapper pipeline below, instead of each sub-pipeline.
861 Target sub_fn_target = target.with_feature(Target::NoRuntime);
862 if (needs_wrapper) {
863 sub_fn_target = sub_fn_target.without_feature(Target::Matlab);
864 }
865
866 {
867 ScopedCompilerLogger activate(compiler_logger_factory, sub_fn_name, sub_fn_target);
868 Module sub_module = module_factory(sub_fn_name, sub_fn_target);
869 // Re-assign every time -- should be the same across all targets anyway,
870 // but base_target is always the last one we encounter.
871 base_target_args = sub_module.get_function_by_name(sub_fn_name).args;
872
873 auto sub_out = add_suffixes(output_files, suffix);
874 if (contains(output_files, Output::static_library)) {
875 sub_out[Output::object] = temp_obj_dir.add_temp_object_file(output_files.at(Output::static_library), suffix, target);
876 sub_out.erase(Output::static_library);
877 }
878 sub_out.erase(Output::registration);
879 sub_out.erase(Output::schedule);
880 sub_out.erase(Output::c_header);
881 if (contains(sub_out, Output::compiler_log)) {
882 sub_out[Output::compiler_log] = temp_compiler_log_dir.add_temp_file(output_files.at(Output::compiler_log), suffix, target);
883 }
884 debug(1) << "compile_multitarget: compile_sub_target " << sub_out[Output::object] << "\n";
885 sub_module.compile(sub_out);
886 auto *r = sub_module.get_auto_scheduler_results();
887 auto_scheduler_results.push_back(r ? *r : AutoSchedulerResults());
888 }
889
890 uint64_t cur_target_features[kFeaturesWordCount] = {0};
891 for (int i = 0; i < Target::FeatureEnd; ++i) {
892 if (target.has_feature((Target::Feature)i)) {
893 cur_target_features[i >> 6] |= ((uint64_t)1) << (i & 63);
894 }
895 }
896
897 Expr can_use;
898 if (target != base_target) {
899 std::vector<Expr> features_struct_args;
900 for (int i = 0; i < kFeaturesWordCount; ++i) {
901 features_struct_args.emplace_back(UIntImm::make(UInt(64), cur_target_features[i]));
902 }
903 can_use = Call::make(Int(32), "halide_can_use_target_features",
904 {kFeaturesWordCount, Call::make(type_of<uint64_t *>(), Call::make_struct, features_struct_args, Call::Intrinsic)},
905 Call::Extern);
906 } else {
907 can_use = IntImm::make(Int(32), 1);
908 }
909
910 for (int i = 0; i < kFeaturesWordCount; ++i) {
911 runtime_features[i] &= cur_target_features[i];
912 }
913
914 wrapper_args.push_back(can_use != 0);
915 wrapper_args.emplace_back(sub_fn_name);
916 }
917
918 // If we haven't specified "no runtime", build a runtime with the base target
919 // and add that to the result.
920 if (!base_target.has_feature(Target::NoRuntime)) {
921 // Start with a bare Target, set only the features we know are common to all.
922 Target runtime_target(base_target.os, base_target.arch, base_target.bits);
923 for (int i = 0; i < Target::FeatureEnd; ++i) {
924 // We never want NoRuntime set here.
925 if (i == Target::NoRuntime) {
926 continue;
927 }
928 const int word = i >> 6;
929 const int bit = i & 63;
930 if (runtime_features[word] & (((uint64_t)1) << bit)) {
931 runtime_target.set_feature((Target::Feature)i);
932 }
933 }
934 std::string runtime_path = contains(output_files, Output::static_library) ?
935 temp_obj_dir.add_temp_object_file(output_files.at(Output::static_library), "_runtime", runtime_target) :
936 add_suffix(output_files.at(Output::object), "_runtime");
937
938 std::map<Output, std::string> runtime_out =
939 {{Output::object, runtime_path}};
940 debug(1) << "compile_multitarget: compile_standalone_runtime " << runtime_out.at(Output::object) << "\n";
941 compile_standalone_runtime(runtime_out, runtime_target);
942 }
943
944 if (needs_wrapper) {
945 Expr indirect_result = Call::make(Int(32), Call::call_cached_indirect_function, wrapper_args, Call::Intrinsic);
946 std::string private_result_name = unique_name(fn_name + "_result");
947 Expr private_result_var = Variable::make(Int(32), private_result_name);
948 Stmt wrapper_body = AssertStmt::make(private_result_var == 0, private_result_var);
949 wrapper_body = LetStmt::make(private_result_name, indirect_result, wrapper_body);
950
951 // Always build with NoRuntime: that's handled as a separate module.
952 //
953 // Always build with NoBoundsQuery: underlying code will implement that (or not).
954 //
955 // Always build *without* NoAsserts (ie, with Asserts enabled): that's the
956 // only way to propagate a nonzero result code to our caller.
957 Target wrapper_target = base_target
958 .with_feature(Target::NoRuntime)
959 .with_feature(Target::NoBoundsQuery)
960 .without_feature(Target::NoAsserts);
961
962 // If the base target specified the Matlab target, we want the Matlab target
963 // on the wrapper instead.
964 if (base_target.has_feature(Target::Matlab)) {
965 wrapper_target = wrapper_target.with_feature(Target::Matlab);
966 }
967
968 Module wrapper_module(fn_name, wrapper_target);
969 wrapper_module.append(LoweredFunc(fn_name, base_target_args, wrapper_body, LinkageType::ExternalPlusMetadata));
970
971 std::string wrapper_path = contains(output_files, Output::static_library) ?
972 temp_obj_dir.add_temp_object_file(output_files.at(Output::static_library), "_wrapper", base_target, /* in_front*/ true) :
973 add_suffix(output_files.at(Output::object), "_wrapper");
974
975 std::map<Output, std::string> wrapper_out = {{Output::object, wrapper_path}};
976 debug(1) << "compile_multitarget: wrapper " << wrapper_out.at(Output::object) << "\n";
977 wrapper_module.compile(wrapper_out);
978 }
979
980 if (contains(output_files, Output::c_header)) {
981 Module header_module(fn_name, base_target);
982 header_module.append(LoweredFunc(fn_name, base_target_args, {}, LinkageType::ExternalPlusMetadata));
983 std::map<Output, std::string> header_out = {{Output::c_header, output_files.at(Output::c_header)}};
984 debug(1) << "compile_multitarget: c_header " << header_out.at(Output::c_header) << "\n";
985 header_module.compile(header_out);
986 }
987
988 if (contains(output_files, Output::registration)) {
989 debug(1) << "compile_multitarget: registration " << output_files.at(Output::registration) << "\n";
990 Module registration_module(fn_name, base_target);
991 registration_module.append(LoweredFunc(fn_name, base_target_args, {}, LinkageType::ExternalPlusMetadata));
992 std::map<Output, std::string> registration_out = {{Output::registration, output_files.at(Output::registration)}};
993 debug(1) << "compile_multitarget: registration " << registration_out.at(Output::registration) << "\n";
994 registration_module.compile(registration_out);
995 }
996
997 if (contains(output_files, Output::schedule)) {
998 debug(1) << "compile_multitarget: schedule " << output_files.at(Output::schedule) << "\n";
999 std::string scheduler = auto_scheduler_results.front().scheduler_name;
1000 if (scheduler.empty()) {
1001 scheduler = "(None)";
1002 }
1003 std::string machine_params = auto_scheduler_results.front().machine_params_string;
1004 if (machine_params.empty()) {
1005 machine_params = "(None)";
1006 }
1007
1008 // Find the features that are unique to each stage (vs the baseline case).
1009 const auto &baseline_target = auto_scheduler_results.back().target;
1010 const auto &baseline_features = baseline_target.get_features_bitset();
1011
1012 // Autoscheduling should be all-or-none across the subtargets;
1013 // if code tries to somehow only autoschedule some subtargets,
1014 // this code may break, and that's ok.
1015 std::ostringstream body;
1016 if (baseline_target.os == Target::OSUnknown && baseline_target.arch == Target::ArchUnknown) {
1017 body << "// No autoscheduler has been run for this Generator.";
1018 } else {
1019 for (size_t i = 0; i < auto_scheduler_results.size(); i++) {
1020 const auto &a = auto_scheduler_results[i];
1021 body << "\n\n";
1022 if (i == auto_scheduler_results.size() - 1) {
1023 body << "// default schedule\n";
1024 body << "{\n";
1025 } else {
1026 auto cur_features = a.target.get_features_bitset() & ~baseline_features;
1027 user_assert(cur_features.count() > 0) << "Multitarget subtargets must be distinct";
1028 std::ostringstream condition;
1029 for (int i = 0; i < Target::FeatureEnd; ++i) {
1030 if (!cur_features[i]) continue;
1031 if (!condition.str().empty()) {
1032 condition << " &&\n ";
1033 }
1034 condition << "target.has_feature(halide_target_feature_"
1035 << Target::feature_to_name((Target::Feature)i) << ")";
1036 }
1037 body << "if (" << condition.str() << ") {\n";
1038 }
1039 body << indent_string(a.schedule_source, " ");
1040 body << " return;\n";
1041 body << "}";
1042 }
1043 }
1044
1045 std::ofstream file(output_files.at(Output::schedule));
1046 emit_schedule_file(fn_name, targets, scheduler, machine_params, body.str(), file);
1047 }
1048
1049 if (contains(output_files, Output::static_library)) {
1050 debug(1) << "compile_multitarget: static_library " << output_files.at(Output::static_library) << "\n";
1051 create_static_library(temp_obj_dir.files(), base_target, output_files.at(Output::static_library));
1052 }
1053
1054 if (contains(output_files, Output::compiler_log)) {
1055 debug(1) << "compile_multitarget: compiler_log " << output_files.at(Output::compiler_log) << "\n";
1056
1057 std::ofstream compiler_log_file(output_files.at(Output::compiler_log));
1058 compiler_log_file << "[\n";
1059 const auto &f = temp_compiler_log_dir.files();
1060 for (size_t i = 0; i < f.size(); i++) {
1061 auto d = read_entire_file(f[i]);
1062 compiler_log_file.write(d.data(), d.size());
1063 if (i < f.size() - 1) {
1064 compiler_log_file << ",\n";
1065 }
1066 }
1067 compiler_log_file << "]\n";
1068 compiler_log_file.close();
1069 internal_assert(!compiler_log_file.fail());
1070 }
1071 }
1072
1073 } // namespace Halide
1074