1 #include "Module.h"
2 
3 #include <array>
4 #include <fstream>
5 #include <future>
6 #include <utility>
7 
8 #include "CodeGen_C.h"
9 #include "CodeGen_Internal.h"
10 #include "CodeGen_PyTorch.h"
11 #include "CompilerLogger.h"
12 #include "Debug.h"
13 #include "HexagonOffload.h"
14 #include "IROperator.h"
15 #include "LLVM_Headers.h"
16 #include "LLVM_Output.h"
17 #include "LLVM_Runtime_Linker.h"
18 #include "Pipeline.h"
19 #include "PythonExtensionGen.h"
20 #include "StmtToHtml.h"
21 
22 using Halide::Internal::debug;
23 
24 namespace Halide {
25 namespace Internal {
26 
// This is the One True Source of the known output types for Halide,
28 // and the appropriate file extension for each output type. If you are
29 // explicitly managing file extensions somewhere else, you are probably
30 // doing it wrong; please prefer to use this table as the source of truth.
31 //
32 // Note that we deliberately default to ".py.cpp" (rather than .py.c) here for python_extension;
33 // in theory, the Python extension file we generate can be compiled just
34 // fine as a plain-C file... but if we are building with cpp-name-mangling
35 // enabled in the target, we will include generated .h files that can't be compiled.
36 // We really don't want to vary the file extensions based on target flags,
37 // and in practice, it's extremely unlikely that anyone needs to rely on this
38 // being pure C output (vs possibly C++).
get_output_info(const Target & target)39 std::map<Output, const OutputInfo> get_output_info(const Target &target) {
40     constexpr bool IsMulti = true;
41     constexpr bool IsSingle = false;
42     const bool is_windows_coff = target.os == Target::Windows;
43     std::map<Output, const OutputInfo> ext = {
44         {Output::assembly, {"assembly", ".s", IsMulti}},
45         {Output::bitcode, {"bitcode", ".bc", IsMulti}},
46         {Output::c_header, {"c_header", ".h", IsSingle}},
47         {Output::c_source, {"c_source", ".halide_generated.cpp", IsSingle}},
48         {Output::compiler_log, {"compiler_log", ".halide_compiler_log", IsSingle}},
49         {Output::cpp_stub, {"cpp_stub", ".stub.h", IsSingle}},
50         {Output::featurization, {"featurization", ".featurization", IsMulti}},
51         {Output::llvm_assembly, {"llvm_assembly", ".ll", IsMulti}},
52         {Output::object, {"object", is_windows_coff ? ".obj" : ".o", IsMulti}},
53         {Output::python_extension, {"python_extension", ".py.cpp", IsSingle}},
54         {Output::pytorch_wrapper, {"pytorch_wrapper", ".pytorch.h", IsSingle}},
55         {Output::registration, {"registration", ".registration.cpp", IsSingle}},
56         {Output::schedule, {"schedule", ".schedule.h", IsSingle}},
57         {Output::static_library, {"static_library", is_windows_coff ? ".lib" : ".a", IsSingle}},
58         {Output::stmt, {"stmt", ".stmt", IsMulti}},
59         {Output::stmt_html, {"stmt_html", ".stmt.html", IsMulti}},
60     };
61     return ext;
62 }
63 
64 namespace {
65 
66 class TemporaryObjectFileDir final {
67 public:
TemporaryObjectFileDir()68     TemporaryObjectFileDir()
69         : dir_path(dir_make_temp()) {
70     }
~TemporaryObjectFileDir()71     ~TemporaryObjectFileDir() {
72         for (const auto &f : dir_files) {
73             debug(1) << "file_unlink: " << f << "\n";
74             file_unlink(f);
75         }
76         debug(1) << "dir_rmdir: " << dir_path << "\n";
77         dir_rmdir(dir_path);
78     }
add_temp_file(const std::string & base_path_name,const std::string & suffix,const Target & target,bool in_front=false)79     std::string add_temp_file(const std::string &base_path_name,
80                               const std::string &suffix,
81                               const Target &target,
82                               bool in_front = false) {
83         size_t slash_idx = base_path_name.rfind('/');
84         size_t backslash_idx = base_path_name.rfind('\\');
85         if (slash_idx == std::string::npos) {
86             slash_idx = 0;
87         } else {
88             slash_idx++;
89         }
90         if (backslash_idx == std::string::npos) {
91             backslash_idx = 0;
92         } else {
93             backslash_idx++;
94         }
95         std::string base_name = base_path_name.substr(std::max(slash_idx, backslash_idx));
96         std::string name = dir_path + "/" + base_name + suffix;
97         debug(1) << "add_temp_object_file: " << name << "\n";
98         if (in_front) {
99             dir_files.insert(dir_files.begin(), name);
100         } else {
101             dir_files.push_back(name);
102         }
103         return name;
104     }
105 
add_temp_object_file(const std::string & base_path_name,const std::string & suffix,const Target & target,bool in_front=false)106     std::string add_temp_object_file(const std::string &base_path_name,
107                                      const std::string &suffix,
108                                      const Target &target,
109                                      bool in_front = false) {
110         const char *ext = (target.os == Target::Windows) ? ".obj" : ".o";
111         return add_temp_file(base_path_name, suffix + ext, target, in_front);
112     }
113 
files()114     const std::vector<std::string> &files() {
115         return dir_files;
116     }
117 
118 private:
119     const std::string dir_path;
120     std::vector<std::string> dir_files;
121     TemporaryObjectFileDir(const TemporaryObjectFileDir &) = delete;
122     void operator=(const TemporaryObjectFileDir &) = delete;
123 };
124 
// Given a pathname of the form /path/to/name.ext, insert suffix before ext to
// produce /path/to/namesuffix.ext.
//
// Only the final path component is examined: the search for '.' starts after
// the LAST '/' or '\\' in the path, so dots inside directory names are never
// mistaken for the extension (including when both separator styles appear in
// the same path). The suffix goes before the first '.' of that component, so
// "name.stub.h" becomes "namesuffix.stub.h". If the final component contains
// no '.', the suffix is simply appended to the whole path.
std::string add_suffix(const std::string &path, const std::string &suffix) {
    const size_t last_separator = path.find_last_of("/\\");
    const size_t name_start = (last_separator == std::string::npos) ? 0 : last_separator + 1;
    const size_t dot = path.find('.', name_start);
    if (dot == std::string::npos) {
        return path + suffix;
    }
    return path.substr(0, dot) + suffix + path.substr(dot);
}
138 
validate_outputs(const std::map<Output,std::string> & in)139 void validate_outputs(const std::map<Output, std::string> &in) {
140     // We don't care about the extensions, so any Target will do
141     auto known = get_output_info(Target());
142     for (auto it : in) {
143         internal_assert(!it.second.empty()) << "Empty value for output: " << known.at(it.first).name;
144     }
145 }
146 
contains(const std::map<Output,std::string> & in,const Output & key)147 bool contains(const std::map<Output, std::string> &in, const Output &key) {
148     return in.find(key) != in.end();
149 }
150 
emit_registration(const Module & m,std::ostream & stream)151 void emit_registration(const Module &m, std::ostream &stream) {
152     /*
153         This relies on the filter library being linked in a way that doesn't
154         dead-strip "unused" initialization code; this may mean that you need to
155         explicitly link with with --whole-archive (or the equivalent) to ensure
156         that the registration code isn't omitted. Sadly, there's no portable way
157         to do this, so you may need to take care in your make/build/etc files:
158 
159         Linux:      -Wl,--whole-archive "/path/to/library" -Wl,-no-whole-archive
160         Darwin/OSX: -Wl,-force_load,/path/to/library
161         VS2015 R2+: /WHOLEARCHIVE:/path/to/library.lib
162         Bazel:      alwayslink=1
163 
164         Note also that registration files deliberately have no #includes, and
165         are specifically designed to be legal to concatenate into a single
166         source file; it should be equivalent to compile-and-link multiple
167         registration files separately, or to concatenate multiple registration
168         files into a single one which is then compiled.
169     */
170 
171     const std::string registration_template = R"INLINE_CODE(
172 // MACHINE GENERATED -- DO NOT EDIT
173 
174 extern "C" {
175 struct halide_filter_metadata_t;
176 void halide_register_argv_and_metadata(
177     int (*filter_argv_call)(void **),
178     const struct halide_filter_metadata_t *filter_metadata,
179     const char * const *extra_key_value_pairs
180 );
181 }
182 
183 $NAMESPACEOPEN$
184 extern int $SHORTNAME$_argv(void **args);
185 extern const struct halide_filter_metadata_t *$SHORTNAME$_metadata();
186 $NAMESPACECLOSE$
187 
188 #ifdef HALIDE_REGISTER_EXTRA_KEY_VALUE_PAIRS_FUNC
189 extern "C" const char * const *HALIDE_REGISTER_EXTRA_KEY_VALUE_PAIRS_FUNC();
190 #endif  // HALIDE_REGISTER_EXTRA_KEY_VALUE_PAIRS_FUNC
191 
192 namespace $NREGS$ {
193 namespace {
194 struct Registerer {
195     Registerer() {
196 #ifdef HALIDE_REGISTER_EXTRA_KEY_VALUE_PAIRS_FUNC
197         halide_register_argv_and_metadata(::$FULLNAME$_argv, ::$FULLNAME$_metadata(), HALIDE_REGISTER_EXTRA_KEY_VALUE_PAIRS_FUNC());
198 #else
199         halide_register_argv_and_metadata(::$FULLNAME$_argv, ::$FULLNAME$_metadata(), nullptr);
200 #endif  // HALIDE_REGISTER_EXTRA_KEY_VALUE_PAIRS_FUNC
201     }
202 };
203 static Registerer registerer;
204 }  // namespace
205 }  // $NREGS$
206 
207 )INLINE_CODE";
208 
209     for (const auto &f : m.functions()) {
210         if (f.linkage == LinkageType::ExternalPlusMetadata) {
211             std::vector<std::string> namespaces;
212             std::string simple_name = extract_namespaces(f.name, namespaces);
213             std::string nsopen, nsclose;
214             for (const auto &ns : namespaces) {
215                 nsopen += "namespace " + ns + " { ";
216                 nsclose += "}";
217             }
218             if (!m.target().has_feature(Target::CPlusPlusMangling)) {
219                 internal_assert(namespaces.empty());
220                 nsopen = "extern \"C\" {";
221                 nsclose = "}";
222             }
223             std::string nsreg = "halide_nsreg_" + replace_all(f.name, "::", "_");
224             std::string s = replace_all(registration_template, "$NAMESPACEOPEN$", nsopen);
225             s = replace_all(s, "$SHORTNAME$", simple_name);
226             s = replace_all(s, "$NAMESPACECLOSE$", nsclose);
227             s = replace_all(s, "$FULLNAME$", f.name);
228             s = replace_all(s, "$NREGS$", nsreg);
229             stream << s;
230         }
231     }
232 }
233 
// Returns a copy of `src` in which `indent` is inserted at the start of every
// non-empty line. Lines consisting only of '\n' are left untouched, so no
// trailing whitespace is introduced on blank lines.
std::string indent_string(const std::string &src, const std::string &indent) {
    std::string out;
    bool at_line_start = true;
    for (const char c : src) {
        if (c == '\n') {
            // The next character (if any) begins a new line.
            at_line_start = true;
        } else if (at_line_start) {
            // First non-newline character of a line: indent it.
            out += indent;
            at_line_start = false;
        }
        out += c;
    }
    return out;
}
248 
emit_schedule_file(const std::string & name,const std::vector<Target> & targets,const std::string & scheduler_name,const std::string & machine_params_string,const std::string & body,std::ostream & stream)249 void emit_schedule_file(const std::string &name,
250                         const std::vector<Target> &targets,
251                         const std::string &scheduler_name,
252                         const std::string &machine_params_string,
253                         const std::string &body,
254                         std::ostream &stream) {
255     std::string s = R"INLINE_CODE(#ifndef $CLEANNAME$_SCHEDULE_H
256 #define $CLEANNAME$_SCHEDULE_H
257 
258 // MACHINE GENERATED -- DO NOT EDIT
259 // This schedule was automatically generated by $SCHEDULER$
260 // for target=$TARGET$  // NOLINT
261 // with machine_params=$MACHINEPARAMS$
262 
263 #include "Halide.h"
264 
265 $NAMESPACEOPEN$
266 inline void apply_schedule_$SHORTNAME$(
267     ::Halide::Pipeline pipeline,
268     ::Halide::Target target
269 ) {
270     using ::Halide::Func;
271     using ::Halide::MemoryType;
272     using ::Halide::RVar;
273     using ::Halide::TailStrategy;
274     using ::Halide::Var;
275 $BODY$
276 }
277 $NAMESPACECLOSE$
278 #endif  // $CLEANNAME$_SCHEDULE_H
279 )INLINE_CODE";
280 
281     // For logging in the comment, strip out features that are almost
282     // certainly irrelevant to scheduling issues, to make for easier reading
283     const Target::Feature irrelevant_features[] = {
284         Target::CPlusPlusMangling,
285         Target::NoRuntime,
286         Target::UserContext,
287     };
288 
289     std::vector<std::string> namespaces;
290     std::string simple_name = extract_namespaces(name, namespaces);
291     std::string nsopen, nsclose;
292     for (const auto &ns : namespaces) {
293         nsopen += "namespace " + ns + " {\n";
294         nsclose += "}  // namespace " + ns + "\n";
295     }
296     std::string clean_name = replace_all(name, "::", "_");
297     std::string target_string;
298     for (Target t : targets) {
299         if (!target_string.empty()) target_string += ",";
300         for (auto f : irrelevant_features) {
301             t = t.without_feature(f);
302         }
303         target_string += t.to_string();
304     }
305     std::string body_text = indent_string(body, "    ");
306     s = replace_all(s, "$SCHEDULER$", scheduler_name);
307     s = replace_all(s, "$NAMESPACEOPEN$", nsopen);
308     s = replace_all(s, "$SHORTNAME$", simple_name);
309     s = replace_all(s, "$CLEANNAME$", clean_name);
310     s = replace_all(s, "$NAMESPACECLOSE$", nsclose);
311     s = replace_all(s, "$TARGET$", target_string);
312     s = replace_all(s, "$BODY$", body_text);
313     s = replace_all(s, "$MACHINEPARAMS$", machine_params_string);
314     stream << s;
315 }
316 
317 }  // namespace
318 
// The state held behind a Module's `contents` pointer. Every Module accessor
// and mutator in this file reads or writes one of these fields.
struct ModuleContents {
    // Reference count returned by the ref_count<ModuleContents>()
    // specialization; mutable so it is reachable through a const pointer.
    mutable RefCount ref_count;
    // Name and target given to the Module constructor.
    std::string name;
    Target target;
    // Items added through the Module::append() overloads.
    std::vector<Buffer<>> buffers;
    std::vector<Internal::LoweredFunc> functions;
    std::vector<Module> submodules;
    std::vector<ExternalCode> external_code;
    // Entries recorded by Module::remap_metadata_name() (from -> to).
    std::map<std::string, std::string> metadata_name_map;
    // Flag written by Module::set_any_strict_float(); defaults to false.
    bool any_strict_float{false};
    // Null until Module::set_auto_scheduler_results() stores a copy here.
    std::unique_ptr<AutoSchedulerResults> auto_scheduler_results;
};
331 
// Specialization of ref_count<> for ModuleContents: returns the RefCount
// member embedded in the object.
template<>
RefCount &ref_count<ModuleContents>(const ModuleContents *t) noexcept {
    return t->ref_count;
}
336 
// Specialization of destroy<> for ModuleContents: deletes the object.
template<>
void destroy<ModuleContents>(const ModuleContents *t) {
    delete t;
}
341 
// Constructs a LoweredFunc from an argument list that is already expressed
// as LoweredArgument values. `name` and `args` are copied; `body` is taken by
// value and moved into the member.
LoweredFunc::LoweredFunc(const std::string &name,
                         const std::vector<LoweredArgument> &args,
                         Stmt body,
                         LinkageType linkage,
                         NameMangling name_mangling)
    : name(name), args(args), body(std::move(body)), linkage(linkage), name_mangling(name_mangling) {
}
349 
LoweredFunc(const std::string & name,const std::vector<Argument> & args,Stmt body,LinkageType linkage,NameMangling name_mangling)350 LoweredFunc::LoweredFunc(const std::string &name,
351                          const std::vector<Argument> &args,
352                          Stmt body,
353                          LinkageType linkage,
354                          NameMangling name_mangling)
355     : name(name), body(std::move(body)), linkage(linkage), name_mangling(name_mangling) {
356     for (const Argument &i : args) {
357         this->args.emplace_back(i);
358     }
359 }
360 
361 }  // namespace Internal
362 
363 using namespace Halide::Internal;
364 
// Creates a module backed by a newly allocated ModuleContents and stores the
// given name and target in it. All other contents keep their defaults.
Module::Module(const std::string &name, const Target &target)
    : contents(new Internal::ModuleContents) {
    contents->name = name;
    contents->target = target;
}
370 
// Stores a heap-allocated copy of `auto_scheduler_results` in the module.
// Asserts that no results have been stored before, so this may be called at
// most once per module.
void Module::set_auto_scheduler_results(const AutoSchedulerResults &auto_scheduler_results) {
    internal_assert(contents->auto_scheduler_results.get() == nullptr);
    contents->auto_scheduler_results.reset(new AutoSchedulerResults(auto_scheduler_results));
}
375 
// Overwrites the module's any_strict_float flag with the given value.
void Module::set_any_strict_float(bool any_strict_float) {
    contents->any_strict_float = any_strict_float;
}
379 
// Returns a reference to the target stored in this module's contents.
const Target &Module::target() const {
    return contents->target;
}
383 
// Returns a reference to the name stored in this module's contents.
const std::string &Module::name() const {
    return contents->name;
}
387 
// Returns a non-owning pointer to the stored autoscheduler results, or
// nullptr if set_auto_scheduler_results() has not stored any.
const AutoSchedulerResults *Module::get_auto_scheduler_results() const {
    return contents->auto_scheduler_results.get();
}
391 
// Returns the module's any_strict_float flag (false unless it has been set).
bool Module::any_strict_float() const {
    return contents->any_strict_float;
}
395 
// Read-only access to the buffers appended to this module.
const std::vector<Buffer<>> &Module::buffers() const {
    return contents->buffers;
}
399 
// Read-only access to the functions appended to this module.
const std::vector<Internal::LoweredFunc> &Module::functions() const {
    return contents->functions;
}
403 
// Mutable access to the module's function list; changes made through the
// returned reference modify the module's contents directly.
std::vector<Internal::LoweredFunc> &Module::functions() {
    return contents->functions;
}
407 
// Read-only access to the submodules appended to this module.
const std::vector<Module> &Module::submodules() const {
    return contents->submodules;
}
411 
// Read-only access to the external code blocks appended to this module.
const std::vector<ExternalCode> &Module::external_code() const {
    return contents->external_code;
}
415 
get_function_by_name(const std::string & name) const416 Internal::LoweredFunc Module::get_function_by_name(const std::string &name) const {
417     for (const auto &f : functions()) {
418         if (f.name == name) {
419             return f;
420         }
421     }
422     user_error << "get_function_by_name: function " << name << " not found.\n";
423     return Internal::LoweredFunc("", std::vector<Argument>{}, {}, LinkageType::External);
424 }
425 
// Adds `buffer` to the end of the module's buffer list.
void Module::append(const Buffer<> &buffer) {
    contents->buffers.push_back(buffer);
}
429 
// Adds a copy of `function` to the end of the module's function list.
void Module::append(const Internal::LoweredFunc &function) {
    contents->functions.push_back(function);
}
433 
// Adds `module` to the end of this module's submodule list.
void Module::append(const Module &module) {
    contents->submodules.push_back(module);
}
437 
// Adds `external_code` to the end of the module's external code list.
void Module::append(const ExternalCode &external_code) {
    contents->external_code.push_back(external_code);
}
441 
link_modules(const std::string & name,const std::vector<Module> & modules)442 Module link_modules(const std::string &name, const std::vector<Module> &modules) {
443     Module output(name, modules.front().target());
444 
445     for (size_t i = 0; i < modules.size(); i++) {
446         const Module &input = modules[i];
447 
448         if (output.target() != input.target()) {
449             user_error << "Mismatched targets in modules to link ("
450                        << output.name() << ", " << output.target().to_string()
451                        << "), ("
452                        << input.name() << ", " << input.target().to_string() << ")\n";
453         }
454 
455         // TODO(dsharlet): Check for naming collisions, maybe rename
456         // internal linkage declarations in the case of collision.
457         for (const auto &b : input.buffers()) {
458             output.append(b);
459         }
460         for (const auto &f : input.functions()) {
461             output.append(f);
462         }
463     }
464 
465     return output;
466 }
467 
// Compiles this module and returns the result as a one-dimensional byte
// buffer named after the module. Hexagon targets are delegated to
// compile_module_to_hexagon_shared_object(); all other targets are compiled
// to an LLVM module and then to object code held in memory.
Buffer<uint8_t> Module::compile_to_buffer() const {
    // TODO: This Hexagon specific code should be removed as soon as possible.
    // This may involve adding more general support for post-processing and
    // a way of specifying to use it.
    if (target().arch == Target::Hexagon) {
        return compile_module_to_hexagon_shared_object(*this);
    }

    llvm::LLVMContext context;
    std::unique_ptr<llvm::Module> llvm_module(compile_module_to_llvm_module(*this, context));

    // The object code is written into `object` through the stream wrapper.
    llvm::SmallVector<char, 4096> object;
    llvm::raw_svector_ostream object_stream(object);
    compile_llvm_module_to_object(*llvm_module, object_stream);

    // The assembly is only generated when it will actually be printed.
    if (debug::debug_level() >= 2) {
        debug(2) << "Submodule assembly for " << name() << ": "
                 << "\n";
        llvm::SmallString<4096> assembly;
        llvm::raw_svector_ostream assembly_stream(assembly);
        compile_llvm_module_to_assembly(*llvm_module, assembly_stream);
        debug(2) << assembly.c_str() << "\n";
    }

    // Copy the object bytes into a buffer of exactly the same size.
    Buffer<uint8_t> result(object.size(), name());
    memcpy(result.data(), reinterpret_cast<uint8_t *>(&object[0]), object.size());
    return result;
}
496 
resolve_submodules() const497 Module Module::resolve_submodules() const {
498     if (submodules().empty()) {
499         return *this;
500     }
501 
502     Module lowered_module(name(), target());
503 
504     for (const auto &f : functions()) {
505         lowered_module.append(f);
506     }
507     for (const auto &buf : buffers()) {
508         lowered_module.append(buf);
509     }
510     for (const auto &ec : external_code()) {
511         lowered_module.append(ec);
512     }
513     for (const auto &m : submodules()) {
514         Module copy(m.resolve_submodules());
515 
516         // Propagate external code blocks.
517         for (const auto &ec : external_code()) {
518             // TODO(zalman): Is this the right thing to do?
519             bool already_in_list = false;
520             for (const auto &ec_sub : copy.external_code()) {
521                 if (ec_sub.name() == ec.name()) {
522                     already_in_list = true;
523                     break;
524                 }
525             }
526             if (!already_in_list) {
527                 copy.append(ec);
528             }
529         }
530 
531         auto buf = copy.compile_to_buffer();
532         lowered_module.append(buf);
533     }
534     // Copy the autoscheduler results back into the lowered module after resolving the submodules.
535     if (auto *r = contents->auto_scheduler_results.get()) {
536         lowered_module.set_auto_scheduler_results(*r);
537     }
538     return lowered_module;
539 }
540 
// Records that metadata name `from` maps to `to`. Asserts that neither
// `from` nor `to` is already a key in the map. Although declared const, this
// modifies the contents the module points to.
void Module::remap_metadata_name(const std::string &from, const std::string &to) const {
    internal_assert(contents->metadata_name_map.find(from) == contents->metadata_name_map.end());
    internal_assert(contents->metadata_name_map.find(to) == contents->metadata_name_map.end());
    contents->metadata_name_map[from] = to;
}
546 
// Returns a copy of the metadata name map built by remap_metadata_name().
std::map<std::string, std::string> Module::get_metadata_name_map() const {
    return contents->metadata_name_map;
}
550 
compile(const std::map<Output,std::string> & output_files) const551 void Module::compile(const std::map<Output, std::string> &output_files) const {
552     validate_outputs(output_files);
553 
554     // output stmt and html prior to resolving submodules. We need to
555     // clear the output after writing it, otherwise the output will
556     // be overwritten by recursive calls after submodules are resolved.
557     if (contains(output_files, Output::stmt)) {
558         debug(1) << "Module.compile(): stmt " << output_files.at(Output::stmt) << "\n";
559         std::ofstream file(output_files.at(Output::stmt));
560         file << *this;
561     }
562     if (contains(output_files, Output::stmt_html)) {
563         debug(1) << "Module.compile(): stmt_html " << output_files.at(Output::stmt_html) << "\n";
564         Internal::print_to_html(output_files.at(Output::stmt_html), *this);
565     }
566 
567     // If there are submodules, recursively lower submodules to
568     // buffers on a copy of the module being compiled, then compile
569     // the copied module.
570     if (!submodules().empty()) {
571         std::map<Output, std::string> output_files_copy = output_files;
572         output_files_copy.erase(Output::stmt);
573         output_files_copy.erase(Output::stmt_html);
574         resolve_submodules().compile(output_files_copy);
575         return;
576     }
577 
578     auto *logger = get_compiler_logger();
579     if (contains(output_files, Output::object) || contains(output_files, Output::assembly) ||
580         contains(output_files, Output::bitcode) || contains(output_files, Output::llvm_assembly) ||
581         contains(output_files, Output::static_library)) {
582         llvm::LLVMContext context;
583         std::unique_ptr<llvm::Module> llvm_module(compile_module_to_llvm_module(*this, context));
584 
585         if (contains(output_files, Output::object)) {
586             const auto &f = output_files.at(Output::object);
587             debug(1) << "Module.compile(): object " << f << "\n";
588             auto out = make_raw_fd_ostream(f);
589             compile_llvm_module_to_object(*llvm_module, *out);
590             if (logger) {
591                 out->flush();
592                 logger->record_object_code_size(file_stat(f).file_size);
593             }
594         }
595         if (contains(output_files, Output::static_library)) {
596             // To simplify the code, we always create a temporary object output
597             // here, even if output_files.at(Output::object) was also set: in practice,
598             // no real-world code ever sets both object and static_library
599             // at the same time, so there is no meaningful performance advantage
600             // to be had.
601             TemporaryObjectFileDir temp_dir;
602             {
603                 std::string object = temp_dir.add_temp_object_file(output_files.at(Output::static_library), "", target());
604                 debug(1) << "Module.compile(): temporary object " << object << "\n";
605                 auto out = make_raw_fd_ostream(object);
606                 compile_llvm_module_to_object(*llvm_module, *out);
607                 out->flush();  // create_static_library() is happier if we do this
608                 if (logger && !contains(output_files, Output::object)) {
609                     // Don't double-record object-code size if we already recorded it for object
610                     logger->record_object_code_size(file_stat(object).file_size);
611                 }
612             }
613             debug(1) << "Module.compile(): static_library " << output_files.at(Output::static_library) << "\n";
614             Target base_target(target().os, target().arch, target().bits);
615             create_static_library(temp_dir.files(), base_target, output_files.at(Output::static_library));
616         }
617         if (contains(output_files, Output::assembly)) {
618             debug(1) << "Module.compile(): assembly " << output_files.at(Output::assembly) << "\n";
619             auto out = make_raw_fd_ostream(output_files.at(Output::assembly));
620             compile_llvm_module_to_assembly(*llvm_module, *out);
621         }
622         if (contains(output_files, Output::bitcode)) {
623             debug(1) << "Module.compile(): bitcode " << output_files.at(Output::bitcode) << "\n";
624             auto out = make_raw_fd_ostream(output_files.at(Output::bitcode));
625             compile_llvm_module_to_llvm_bitcode(*llvm_module, *out);
626         }
627         if (contains(output_files, Output::llvm_assembly)) {
628             debug(1) << "Module.compile(): llvm_assembly " << output_files.at(Output::llvm_assembly) << "\n";
629             auto out = make_raw_fd_ostream(output_files.at(Output::llvm_assembly));
630             compile_llvm_module_to_llvm_assembly(*llvm_module, *out);
631         }
632     }
633     if (contains(output_files, Output::c_header)) {
634         debug(1) << "Module.compile(): c_header " << output_files.at(Output::c_header) << "\n";
635         std::ofstream file(output_files.at(Output::c_header));
636         Internal::CodeGen_C cg(file,
637                                target(),
638                                target().has_feature(Target::CPlusPlusMangling) ? Internal::CodeGen_C::CPlusPlusHeader : Internal::CodeGen_C::CHeader,
639                                output_files.at(Output::c_header));
640         cg.compile(*this);
641     }
642     if (contains(output_files, Output::c_source)) {
643         debug(1) << "Module.compile(): c_source " << output_files.at(Output::c_source) << "\n";
644         std::ofstream file(output_files.at(Output::c_source));
645         Internal::CodeGen_C cg(file,
646                                target(),
647                                target().has_feature(Target::CPlusPlusMangling) ? Internal::CodeGen_C::CPlusPlusImplementation : Internal::CodeGen_C::CImplementation);
648         cg.compile(*this);
649     }
650     if (contains(output_files, Output::python_extension)) {
651         debug(1) << "Module.compile(): python_extension " << output_files.at(Output::python_extension) << "\n";
652         std::ofstream file(output_files.at(Output::python_extension));
653         Internal::PythonExtensionGen python_extension_gen(file);
654         python_extension_gen.compile(*this);
655     }
656     if (contains(output_files, Output::schedule)) {
657         debug(1) << "Module.compile(): schedule " << output_files.at(Output::schedule) << "\n";
658         std::ofstream file(output_files.at(Output::schedule));
659         auto *r = contents->auto_scheduler_results.get();
660         std::string scheduler = r ? r->scheduler_name : "(None)";
661         std::string machine_params = r ? r->machine_params_string : "(None)";
662         std::string body = r && !r->schedule_source.empty() ? r->schedule_source : "// No autoscheduler has been run for this Generator.\n";
663         emit_schedule_file(name(), {target()}, scheduler, machine_params, body, file);
664     }
665     if (contains(output_files, Output::featurization)) {
666         debug(1) << "Module.compile(): featurization " << output_files.at(Output::featurization) << "\n";
667         // If the featurization data is empty, just write an empty file
668         std::ofstream binfile(output_files.at(Output::featurization), std::ios::binary | std::ios_base::trunc);
669         auto *r = contents->auto_scheduler_results.get();
670         if (r) {
671             binfile.write((const char *)r->featurization.data(), r->featurization.size());
672         }
673         binfile.close();
674     }
675     if (contains(output_files, Output::registration)) {
676         debug(1) << "Module.compile(): registration " << output_files.at(Output::registration) << "\n";
677         std::ofstream file(output_files.at(Output::registration));
678         emit_registration(*this, file);
679         file.close();
680         internal_assert(!file.fail());
681     }
682     if (contains(output_files, Output::pytorch_wrapper)) {
683         debug(1) << "Module.compile(): pytorch_wrapper " << output_files.at(Output::pytorch_wrapper) << "\n";
684 
685         std::ofstream file(output_files.at(Output::pytorch_wrapper));
686         Internal::CodeGen_PyTorch cg(file);
687         cg.compile(*this);
688         file.close();
689         internal_assert(!file.fail());
690     }
691     if (contains(output_files, Output::compiler_log)) {
692         debug(1) << "Module.compile(): compiler_log " << output_files.at(Output::compiler_log) << "\n";
693         std::ofstream file(output_files.at(Output::compiler_log));
694         internal_assert(get_compiler_logger() != nullptr);
695         get_compiler_logger()->emit_to_stream(file);
696         file.close();
697         internal_assert(!file.fail());
698     }
    // If HL_DEBUG_COMPILER_LOGGER is set, dump the log (if any) to stderr now, whether or not it is required
700     if (get_env_variable("HL_DEBUG_COMPILER_LOGGER") == "1" && get_compiler_logger() != nullptr) {
701         get_compiler_logger()->emit_to_stream(std::cerr);
702     }
703 }
704 
compile_standalone_runtime(const std::map<Output,std::string> & output_files,Target t)705 std::map<Output, std::string> compile_standalone_runtime(const std::map<Output, std::string> &output_files, Target t) {
706     validate_outputs(output_files);
707 
708     Module empty("standalone_runtime", t.without_feature(Target::NoRuntime).without_feature(Target::JIT));
709     // For runtime, it only makes sense to output object files or static_library, so ignore
710     // everything else.
711     std::map<Output, std::string> actual_outputs;
712     for (auto key : {Output::object, Output::static_library}) {
713         auto it = output_files.find(key);
714         if (it != output_files.end()) {
715             actual_outputs[key] = it->second;
716         }
717     }
718     empty.compile(actual_outputs);
719     return actual_outputs;
720 }
721 
compile_standalone_runtime(const std::string & object_filename,Target t)722 void compile_standalone_runtime(const std::string &object_filename, Target t) {
723     compile_standalone_runtime({{Output::object, object_filename}}, t);
724 }
725 
726 namespace {
727 
728 class ScopedCompilerLogger {
729 public:
ScopedCompilerLogger(const CompilerLoggerFactory & compiler_logger_factory,const std::string & fn_name,const Target & target)730     ScopedCompilerLogger(const CompilerLoggerFactory &compiler_logger_factory, const std::string &fn_name, const Target &target) {
731         internal_assert(!get_compiler_logger());
732         if (compiler_logger_factory) {
733             set_compiler_logger(compiler_logger_factory(fn_name, target));
734         } else {
735             set_compiler_logger(nullptr);
736         }
737     }
738 
~ScopedCompilerLogger()739     ~ScopedCompilerLogger() {
740         set_compiler_logger(nullptr);
741     }
742 };
743 
744 }  // namespace
745 
compile_multitarget(const std::string & fn_name,const std::map<Output,std::string> & output_files,const std::vector<Target> & targets,const std::vector<std::string> & suffixes,const ModuleFactory & module_factory,const CompilerLoggerFactory & compiler_logger_factory)746 void compile_multitarget(const std::string &fn_name,
747                          const std::map<Output, std::string> &output_files,
748                          const std::vector<Target> &targets,
749                          const std::vector<std::string> &suffixes,
750                          const ModuleFactory &module_factory,
751                          const CompilerLoggerFactory &compiler_logger_factory) {
752     validate_outputs(output_files);
753 
754     user_assert(!fn_name.empty()) << "Function name must be specified.\n";
755     user_assert(!targets.empty()) << "Must specify at least one target.\n";
756     user_assert(suffixes.empty() || suffixes.size() == targets.size())
757         << "The suffixes list must be empty or the same length as the targets list.\n";
758 
759     // The final target in the list is considered "baseline", and is used
760     // for (e.g.) the runtime and shared code. It is often just arch-bits-os
761     // with no other features (though this is *not* a requirement).
762     const Target &base_target = targets.back();
763 
764     // JIT makes no sense.
765     user_assert(!base_target.has_feature(Target::JIT)) << "JIT not allowed for compile_multitarget.\n";
766 
767     const auto suffix_for_entry = [&](int i) -> std::string {
768         const std::string suffix = "-" + (suffixes.empty() ? targets[i].to_string() : suffixes[i]);
769         return suffix;
770     };
771 
772     const auto add_suffixes = [&](const std::map<Output, std::string> &in, const std::string &suffix) -> std::map<Output, std::string> {
773         // is_multi doesn't vary by Target, so we can pass an empty target here safely
774         auto output_info = get_output_info(Target());
775         std::map<Output, std::string> out = in;
776         for (auto &it : out) {
777             if (output_info[it.first].is_multi) {
778                 out[it.first] = add_suffix(it.second, suffix);
779             }
780         }
781         return out;
782     };
783 
784     // If only one target, don't bother with the runtime feature detection wrapping.
785     const bool needs_wrapper = (targets.size() > 1);
786     if (targets.size() == 1) {
787         debug(1) << "compile_multitarget: single target is " << base_target.to_string() << "\n";
788         ScopedCompilerLogger activate(compiler_logger_factory, fn_name, base_target);
789 
790         // If we want to have single-output object files use the target suffix, we'd
791         // want to do this instead:
792         //
793         //     auto sub_out = add_suffixes(output_files, suffix_for_entry(0));
794         //     module_factory(fn_name, base_target).compile(sub_out);
795         //
796         // This would make the filename outputs more symmetrical (ie the same for n=1 as for n>1)
797         // but at the expense of breaking existing users. So for now, we're going to continue
798         // with the legacy treatment below:
799         module_factory(fn_name, base_target).compile(output_files);
800         return;
801     }
802 
803     user_assert(((int)contains(output_files, Output::object) + (int)contains(output_files, Output::static_library)) == 1)
804         << "compile_multitarget() expects exactly one of 'object' and 'static_library' to be specified when multiple targets are specified.\n";
805 
806     // For safety, the runtime must be built only with features common to all
807     // of the targets; given an unusual ordering like
808     //
809     //     x86-64-linux,x86-64-sse41
810     //
811     // we should still always be *correct*: this ordering would never select sse41
812     // (since x86-64-linux would be selected first due to ordering), but could
813     // crash on non-sse41 machines (if we generated a runtime with sse41 instructions
814     // included). So we'll keep track of the common features as we walk thru the targets.
815 
816     // Using something like std::bitset would be arguably cleaner here, but we need an
817     // array-of-uint64 for calls to halide_can_use_target_features() anyway,
818     // so we'll just build and maintain in that form to avoid extra conversion.
819     constexpr int kFeaturesWordCount = (Target::FeatureEnd + 63) / (sizeof(uint64_t) * 8);
820     uint64_t runtime_features[kFeaturesWordCount] = {(uint64_t)-1LL};
821 
822     TemporaryObjectFileDir temp_obj_dir, temp_compiler_log_dir;
823     std::vector<Expr> wrapper_args;
824     std::vector<LoweredArgument> base_target_args;
825     std::vector<AutoSchedulerResults> auto_scheduler_results;
826 
827     for (size_t i = 0; i < targets.size(); ++i) {
828         const Target &target = targets[i];
829 
830         // arch-bits-os must be identical across all targets.
831         if (target.os != base_target.os ||
832             target.arch != base_target.arch ||
833             target.bits != base_target.bits) {
834             user_error << "All Targets must have matching arch-bits-os for compile_multitarget.\n";
835         }
836         // Some features must match across all targets.
837         static const std::array<Target::Feature, 9> must_match_features = {{
838             Target::ASAN,
839             Target::CPlusPlusMangling,
840             Target::Debug,
841             Target::JIT,
842             Target::Matlab,
843             Target::MSAN,
844             Target::NoRuntime,
845             Target::TSAN,
846             Target::UserContext,
847         }};
848         for (auto f : must_match_features) {
849             if (target.has_feature(f) != base_target.has_feature(f)) {
850                 user_error << "All Targets must have feature '" << Target::feature_to_name(f) << "'' set identically for compile_multitarget.\n";
851                 break;
852             }
853         }
854 
855         // Each sub-target has a function name that is the 'real' name plus a suffix
856         std::string suffix = suffix_for_entry(i);
857         std::string sub_fn_name = needs_wrapper ? (fn_name + suffix) : fn_name;
858 
859         // We always produce the runtime separately, so add NoRuntime explicitly.
860         // Matlab should be added to the wrapper pipeline below, instead of each sub-pipeline.
861         Target sub_fn_target = target.with_feature(Target::NoRuntime);
862         if (needs_wrapper) {
863             sub_fn_target = sub_fn_target.without_feature(Target::Matlab);
864         }
865 
866         {
867             ScopedCompilerLogger activate(compiler_logger_factory, sub_fn_name, sub_fn_target);
868             Module sub_module = module_factory(sub_fn_name, sub_fn_target);
869             // Re-assign every time -- should be the same across all targets anyway,
870             // but base_target is always the last one we encounter.
871             base_target_args = sub_module.get_function_by_name(sub_fn_name).args;
872 
873             auto sub_out = add_suffixes(output_files, suffix);
874             if (contains(output_files, Output::static_library)) {
875                 sub_out[Output::object] = temp_obj_dir.add_temp_object_file(output_files.at(Output::static_library), suffix, target);
876                 sub_out.erase(Output::static_library);
877             }
878             sub_out.erase(Output::registration);
879             sub_out.erase(Output::schedule);
880             sub_out.erase(Output::c_header);
881             if (contains(sub_out, Output::compiler_log)) {
882                 sub_out[Output::compiler_log] = temp_compiler_log_dir.add_temp_file(output_files.at(Output::compiler_log), suffix, target);
883             }
884             debug(1) << "compile_multitarget: compile_sub_target " << sub_out[Output::object] << "\n";
885             sub_module.compile(sub_out);
886             auto *r = sub_module.get_auto_scheduler_results();
887             auto_scheduler_results.push_back(r ? *r : AutoSchedulerResults());
888         }
889 
890         uint64_t cur_target_features[kFeaturesWordCount] = {0};
891         for (int i = 0; i < Target::FeatureEnd; ++i) {
892             if (target.has_feature((Target::Feature)i)) {
893                 cur_target_features[i >> 6] |= ((uint64_t)1) << (i & 63);
894             }
895         }
896 
897         Expr can_use;
898         if (target != base_target) {
899             std::vector<Expr> features_struct_args;
900             for (int i = 0; i < kFeaturesWordCount; ++i) {
901                 features_struct_args.emplace_back(UIntImm::make(UInt(64), cur_target_features[i]));
902             }
903             can_use = Call::make(Int(32), "halide_can_use_target_features",
904                                  {kFeaturesWordCount, Call::make(type_of<uint64_t *>(), Call::make_struct, features_struct_args, Call::Intrinsic)},
905                                  Call::Extern);
906         } else {
907             can_use = IntImm::make(Int(32), 1);
908         }
909 
910         for (int i = 0; i < kFeaturesWordCount; ++i) {
911             runtime_features[i] &= cur_target_features[i];
912         }
913 
914         wrapper_args.push_back(can_use != 0);
915         wrapper_args.emplace_back(sub_fn_name);
916     }
917 
918     // If we haven't specified "no runtime", build a runtime with the base target
919     // and add that to the result.
920     if (!base_target.has_feature(Target::NoRuntime)) {
921         // Start with a bare Target, set only the features we know are common to all.
922         Target runtime_target(base_target.os, base_target.arch, base_target.bits);
923         for (int i = 0; i < Target::FeatureEnd; ++i) {
924             // We never want NoRuntime set here.
925             if (i == Target::NoRuntime) {
926                 continue;
927             }
928             const int word = i >> 6;
929             const int bit = i & 63;
930             if (runtime_features[word] & (((uint64_t)1) << bit)) {
931                 runtime_target.set_feature((Target::Feature)i);
932             }
933         }
934         std::string runtime_path = contains(output_files, Output::static_library) ?
935                                        temp_obj_dir.add_temp_object_file(output_files.at(Output::static_library), "_runtime", runtime_target) :
936                                        add_suffix(output_files.at(Output::object), "_runtime");
937 
938         std::map<Output, std::string> runtime_out =
939             {{Output::object, runtime_path}};
940         debug(1) << "compile_multitarget: compile_standalone_runtime " << runtime_out.at(Output::object) << "\n";
941         compile_standalone_runtime(runtime_out, runtime_target);
942     }
943 
944     if (needs_wrapper) {
945         Expr indirect_result = Call::make(Int(32), Call::call_cached_indirect_function, wrapper_args, Call::Intrinsic);
946         std::string private_result_name = unique_name(fn_name + "_result");
947         Expr private_result_var = Variable::make(Int(32), private_result_name);
948         Stmt wrapper_body = AssertStmt::make(private_result_var == 0, private_result_var);
949         wrapper_body = LetStmt::make(private_result_name, indirect_result, wrapper_body);
950 
951         // Always build with NoRuntime: that's handled as a separate module.
952         //
953         // Always build with NoBoundsQuery: underlying code will implement that (or not).
954         //
955         // Always build *without* NoAsserts (ie, with Asserts enabled): that's the
956         // only way to propagate a nonzero result code to our caller.
957         Target wrapper_target = base_target
958                                     .with_feature(Target::NoRuntime)
959                                     .with_feature(Target::NoBoundsQuery)
960                                     .without_feature(Target::NoAsserts);
961 
962         // If the base target specified the Matlab target, we want the Matlab target
963         // on the wrapper instead.
964         if (base_target.has_feature(Target::Matlab)) {
965             wrapper_target = wrapper_target.with_feature(Target::Matlab);
966         }
967 
968         Module wrapper_module(fn_name, wrapper_target);
969         wrapper_module.append(LoweredFunc(fn_name, base_target_args, wrapper_body, LinkageType::ExternalPlusMetadata));
970 
971         std::string wrapper_path = contains(output_files, Output::static_library) ?
972                                        temp_obj_dir.add_temp_object_file(output_files.at(Output::static_library), "_wrapper", base_target, /* in_front*/ true) :
973                                        add_suffix(output_files.at(Output::object), "_wrapper");
974 
975         std::map<Output, std::string> wrapper_out = {{Output::object, wrapper_path}};
976         debug(1) << "compile_multitarget: wrapper " << wrapper_out.at(Output::object) << "\n";
977         wrapper_module.compile(wrapper_out);
978     }
979 
980     if (contains(output_files, Output::c_header)) {
981         Module header_module(fn_name, base_target);
982         header_module.append(LoweredFunc(fn_name, base_target_args, {}, LinkageType::ExternalPlusMetadata));
983         std::map<Output, std::string> header_out = {{Output::c_header, output_files.at(Output::c_header)}};
984         debug(1) << "compile_multitarget: c_header " << header_out.at(Output::c_header) << "\n";
985         header_module.compile(header_out);
986     }
987 
988     if (contains(output_files, Output::registration)) {
989         debug(1) << "compile_multitarget: registration " << output_files.at(Output::registration) << "\n";
990         Module registration_module(fn_name, base_target);
991         registration_module.append(LoweredFunc(fn_name, base_target_args, {}, LinkageType::ExternalPlusMetadata));
992         std::map<Output, std::string> registration_out = {{Output::registration, output_files.at(Output::registration)}};
993         debug(1) << "compile_multitarget: registration " << registration_out.at(Output::registration) << "\n";
994         registration_module.compile(registration_out);
995     }
996 
997     if (contains(output_files, Output::schedule)) {
998         debug(1) << "compile_multitarget: schedule " << output_files.at(Output::schedule) << "\n";
999         std::string scheduler = auto_scheduler_results.front().scheduler_name;
1000         if (scheduler.empty()) {
1001             scheduler = "(None)";
1002         }
1003         std::string machine_params = auto_scheduler_results.front().machine_params_string;
1004         if (machine_params.empty()) {
1005             machine_params = "(None)";
1006         }
1007 
1008         // Find the features that are unique to each stage (vs the baseline case).
1009         const auto &baseline_target = auto_scheduler_results.back().target;
1010         const auto &baseline_features = baseline_target.get_features_bitset();
1011 
1012         // Autoscheduling should be all-or-none across the subtargets;
1013         // if code tries to somehow only autoschedule some subtargets,
1014         // this code may break, and that's ok.
1015         std::ostringstream body;
1016         if (baseline_target.os == Target::OSUnknown && baseline_target.arch == Target::ArchUnknown) {
1017             body << "// No autoscheduler has been run for this Generator.";
1018         } else {
1019             for (size_t i = 0; i < auto_scheduler_results.size(); i++) {
1020                 const auto &a = auto_scheduler_results[i];
1021                 body << "\n\n";
1022                 if (i == auto_scheduler_results.size() - 1) {
1023                     body << "// default schedule\n";
1024                     body << "{\n";
1025                 } else {
1026                     auto cur_features = a.target.get_features_bitset() & ~baseline_features;
1027                     user_assert(cur_features.count() > 0) << "Multitarget subtargets must be distinct";
1028                     std::ostringstream condition;
1029                     for (int i = 0; i < Target::FeatureEnd; ++i) {
1030                         if (!cur_features[i]) continue;
1031                         if (!condition.str().empty()) {
1032                             condition << " &&\n    ";
1033                         }
1034                         condition << "target.has_feature(halide_target_feature_"
1035                                   << Target::feature_to_name((Target::Feature)i) << ")";
1036                     }
1037                     body << "if (" << condition.str() << ") {\n";
1038                 }
1039                 body << indent_string(a.schedule_source, "    ");
1040                 body << "    return;\n";
1041                 body << "}";
1042             }
1043         }
1044 
1045         std::ofstream file(output_files.at(Output::schedule));
1046         emit_schedule_file(fn_name, targets, scheduler, machine_params, body.str(), file);
1047     }
1048 
1049     if (contains(output_files, Output::static_library)) {
1050         debug(1) << "compile_multitarget: static_library " << output_files.at(Output::static_library) << "\n";
1051         create_static_library(temp_obj_dir.files(), base_target, output_files.at(Output::static_library));
1052     }
1053 
1054     if (contains(output_files, Output::compiler_log)) {
1055         debug(1) << "compile_multitarget: compiler_log " << output_files.at(Output::compiler_log) << "\n";
1056 
1057         std::ofstream compiler_log_file(output_files.at(Output::compiler_log));
1058         compiler_log_file << "[\n";
1059         const auto &f = temp_compiler_log_dir.files();
1060         for (size_t i = 0; i < f.size(); i++) {
1061             auto d = read_entire_file(f[i]);
1062             compiler_log_file.write(d.data(), d.size());
1063             if (i < f.size() - 1) {
1064                 compiler_log_file << ",\n";
1065             }
1066         }
1067         compiler_log_file << "]\n";
1068         compiler_log_file.close();
1069         internal_assert(!compiler_log_file.fail());
1070     }
1071 }
1072 
1073 }  // namespace Halide
1074