#ifndef HALIDE_CODEGEN_LLVM_H
#define HALIDE_CODEGEN_LLVM_H

/** \file
 *
 * Defines the base-class for all architecture-specific code
 * generators that use llvm.
 */

namespace llvm {
class Value;
class Module;
class Function;
class FunctionType;
class IRBuilderDefaultInserter;
class ConstantFolder;
template<typename, typename>
class IRBuilder;
class LLVMContext;
class Type;
class StructType;
class Instruction;
class CallInst;
class ExecutionEngine;
class AllocaInst;
class Constant;
class Triple;
class MDNode;
class NamedMDNode;
class DataLayout;
class BasicBlock;
class GlobalVariable;
}  // namespace llvm

#include <map>
#include <memory>
#include <set>
#include <string>
#include <vector>

#include "IRVisitor.h"
#include "Module.h"
#include "Scope.h"
#include "Target.h"

namespace Halide {
struct ExternSignature;
}  // namespace Halide

namespace Halide {
namespace Internal {

/** A code generator abstract base class. Actual code generators
 * (e.g. CodeGen_X86) inherit from this. This class is responsible
 * for taking a Halide Stmt and producing llvm bitcode, machine
 * code in an object file, or machine code accessible through a
 * function pointer.
 */
class CodeGen_LLVM : public IRVisitor {
public:
    /** Create an instance of CodeGen_LLVM suitable for the target. */
    static CodeGen_LLVM *new_for_target(const Target &target,
                                        llvm::LLVMContext &context);

    ~CodeGen_LLVM() override;

    /** Takes a Halide Module and compiles it to an llvm Module. */
    virtual std::unique_ptr<llvm::Module> compile(const Module &module);

    /** The target we're generating code for */
    const Target &get_target() const {
        return target;
    }

    /** Tell the code generator which LLVM context to use. */
    void set_context(llvm::LLVMContext &context);

    /** Initialize internal llvm state for the enabled targets. */
    static void initialize_llvm();

    static std::unique_ptr<llvm::Module> compile_trampolines(
        const Target &target,
        llvm::LLVMContext &context,
        const std::string &suffix,
        const std::vector<std::pair<std::string, ExternSignature>> &externs);

    size_t get_requested_alloca_total() const {
        return requested_alloca_total;
    }

protected:
    CodeGen_LLVM(Target t);

    /** Compile a specific Halide declaration into the llvm Module. */
    // @{
    virtual void compile_func(const LoweredFunc &func, const std::string &simple_name, const std::string &extern_name);
    virtual void compile_buffer(const Buffer<> &buffer);
    // @}

    /** Helper functions for compiling Halide functions to llvm
     * functions. begin_func performs all the work necessary to begin
     * generating code for a function with a given argument list,
     * using the IRBuilder. A call to begin_func should be followed by
     * a call to end_func with the same arguments, to generate the
     * appropriate cleanup code. */
    // @{
    virtual void begin_func(LinkageType linkage, const std::string &simple_name,
                            const std::string &extern_name, const std::vector<LoweredArgument> &args);
    virtual void end_func(const std::vector<LoweredArgument> &args);
    // @}
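
    // A minimal usage sketch (roughly what compile_func does for a
    // LoweredFunc `func`):
    //
    //     begin_func(func.linkage, simple_name, extern_name, func.args);
    //     codegen(func.body);   // emit the body via the IRBuilder
    //     end_func(func.args);  // emit cleanup code and the return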

    /** What should be passed as -mcpu, -mattrs, and related for
     * compilation. The architecture-specific code generator should
     * define these. */
    // @{
    virtual std::string mcpu() const = 0;
    virtual std::string mattrs() const = 0;
    virtual bool use_soft_float_abi() const = 0;
    virtual bool use_pic() const;
    // @}

    /** Should indexing math be promoted to 64-bit on platforms with
     * 64-bit pointers? */
    virtual bool promote_indices() const {
        return true;
    }

    /** What's the natural vector bit-width to use for loads, stores, etc. */
    virtual int native_vector_bits() const = 0;

    /** Return the type in which arithmetic should be done for the
     * given storage type. */
    virtual Type upgrade_type_for_arithmetic(const Type &) const;

    /** Return the type that a given Halide type should be
     * stored/loaded from memory as. */
    virtual Type upgrade_type_for_storage(const Type &) const;

    /** Return the type that a Halide type should be passed in and out
     * of functions as. */
    virtual Type upgrade_type_for_argument_passing(const Type &) const;

    /** State needed by llvm for code generation, including the
     * current module, function, context, builder, and most recently
     * generated llvm value. */
    //@{
    static bool llvm_X86_enabled;
    static bool llvm_ARM_enabled;
    static bool llvm_Hexagon_enabled;
    static bool llvm_AArch64_enabled;
    static bool llvm_NVPTX_enabled;
    static bool llvm_Mips_enabled;
    static bool llvm_PowerPC_enabled;
    static bool llvm_AMDGPU_enabled;
    static bool llvm_WebAssembly_enabled;
    static bool llvm_RISCV_enabled;

    std::unique_ptr<llvm::Module> module;
    llvm::Function *function;
    llvm::LLVMContext *context;
    llvm::IRBuilder<llvm::ConstantFolder, llvm::IRBuilderDefaultInserter> *builder;
    llvm::Value *value;
    llvm::MDNode *very_likely_branch;
    llvm::MDNode *default_fp_math_md;
    llvm::MDNode *strict_fp_math_md;
    std::vector<LoweredArgument> current_function_args;
    //@}

    /** The target we're generating code for */
    Halide::Target target;

    /** Grab all the context-specific internal state. */
    virtual void init_context();
    /** Initialize the CodeGen_LLVM internal state to compile a fresh
     * module. This allows reuse of one CodeGen_LLVM object to compile
     * multiple related modules (e.g. multiple device kernels). */
    virtual void init_module();

    /** Add external_code entries to the llvm module. */
    void add_external_code(const Module &halide_module);

    /** Run all of llvm's optimization passes on the module. */
    void optimize_module();

    /** Add an entry to the symbol table, hiding previous entries with
     * the same name. Call this when new values come into scope. */
    void sym_push(const std::string &name, llvm::Value *value);

    /** Remove an entry from the symbol table, revealing any previous
     * entries with the same name. Call this when values go out of
     * scope. */
    void sym_pop(const std::string &name);

    /** Fetch an entry from the symbol table. If the symbol is not
     * found, it either errors out (if must_succeed is true), or
     * returns nullptr. */
    llvm::Value *sym_get(const std::string &name,
                         bool must_succeed = true) const;

    /** Test if an item exists in the symbol table. */
    bool sym_exists(const std::string &name) const;
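
    // A minimal usage sketch (roughly how a Let or LetStmt binding is
    // scoped during codegen):
    //
    //     sym_push(op->name, codegen(op->value));  // bind the name
    //     codegen(op->body);                       // body sees the binding
    //     sym_pop(op->name);                       // reveal any shadowed entry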

    /** Given a Halide ExternSignature, return the equivalent llvm::FunctionType. */
    llvm::FunctionType *signature_to_type(const ExternSignature &signature);

    /** Some useful llvm types */
    // @{
    llvm::Type *void_t, *i1_t, *i8_t, *i16_t, *i32_t, *i64_t, *f16_t, *f32_t, *f64_t;
    llvm::StructType *halide_buffer_t_type,
        *type_t_type,
        *dimension_t_type,
        *metadata_t_type,
        *argument_t_type,
        *scalar_value_t_type,
        *device_interface_t_type,
        *pseudostack_slot_t_type,
        *semaphore_t_type,
        *semaphore_acquire_t_type,
        *parallel_task_t_type;

    // @}

    /** Some useful llvm types for subclasses */
    // @{
    llvm::Type *i8x8, *i8x16, *i8x32;
    llvm::Type *i16x4, *i16x8, *i16x16;
    llvm::Type *i32x2, *i32x4, *i32x8;
    llvm::Type *i64x2, *i64x4;
    llvm::Type *f32x2, *f32x4, *f32x8;
    llvm::Type *f64x2, *f64x4;
    // @}

    /** Some wildcard variables used for peephole optimizations in
     * subclasses */
    // @{
    Expr wild_i8x8, wild_i16x4, wild_i32x2;                // 64-bit signed ints
    Expr wild_u8x8, wild_u16x4, wild_u32x2;                // 64-bit unsigned ints
    Expr wild_i8x16, wild_i16x8, wild_i32x4, wild_i64x2;   // 128-bit signed ints
    Expr wild_u8x16, wild_u16x8, wild_u32x4, wild_u64x2;   // 128-bit unsigned ints
    Expr wild_i8x32, wild_i16x16, wild_i32x8, wild_i64x4;  // 256-bit signed ints
    Expr wild_u8x32, wild_u16x16, wild_u32x8, wild_u64x4;  // 256-bit unsigned ints

    Expr wild_f32x2;              // 64-bit floats
    Expr wild_f32x4, wild_f64x2;  // 128-bit floats
    Expr wild_f32x8, wild_f64x4;  // 256-bit floats

    // Wildcards for a varying number of lanes.
    Expr wild_u1x_, wild_i8x_, wild_u8x_, wild_i16x_, wild_u16x_;
    Expr wild_i32x_, wild_u32x_, wild_i64x_, wild_u64x_;
    Expr wild_f32x_, wild_f64x_;
    Expr min_i8, max_i8, max_u8;
    Expr min_i16, max_i16, max_u16;
    Expr min_i32, max_i32, max_u32;
    Expr min_i64, max_i64, max_u64;
    Expr min_f32, max_f32, min_f64, max_f64;
    // @}
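
    // A minimal pattern-matching sketch (hypothetical subclass peephole;
    // the intrinsic name and 16-lane width are placeholders):
    //
    //     std::vector<Expr> matches;
    //     if (expr_match(wild_i16x_ + wild_i16x_, op, matches)) {
    //         value = call_intrin(op->type, 16, "my.target.add", matches);
    //     }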

    /** Emit code that evaluates an expression, and return the llvm
     * representation of the result of the expression. */
    llvm::Value *codegen(const Expr &);

    /** Emit code that runs a statement. */
    void codegen(const Stmt &);
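
    // A minimal sketch of how visit() methods typically use these (the
    // result of each node is left in the `value` member declared above):
    //
    //     void visit(const Add *op) override {
    //         llvm::Value *a = codegen(op->a);
    //         llvm::Value *b = codegen(op->b);
    //         value = builder->CreateAdd(a, b);
    //     }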

    /** Codegen a vector Expr by codegenning each lane and combining. */
    void scalarize(const Expr &);

    /** Some destructors should always be called. Others should only
     * be called if the pipeline is exiting with an error code. */
    enum DestructorType { Always,
                          OnError,
                          OnSuccess };

    /** Call this at the location of object creation to register how an
     * object should be destroyed. This does three things:
     * 1) Emits code here that puts the object in a unique
     * null-initialized stack slot.
     * 2) Adds an instruction to the destructor block that calls the
     * destructor on that stack slot if it's not null.
     * 3) Returns that stack slot, so you can neuter the destructor
     * (by storing null to the stack slot) or destroy the object early
     * (by calling trigger_destructor).
     */
    llvm::Value *register_destructor(llvm::Function *destructor_fn, llvm::Value *obj, DestructorType when);

    /** Call a destructor early. Pass in the value returned by register_destructor. */
    void trigger_destructor(llvm::Function *destructor_fn, llvm::Value *stack_slot);
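
    // A minimal usage sketch (hypothetical; `cleanup_fn` names an
    // llvm::Function for a runtime cleanup routine, `ptr_type` its slot type):
    //
    //     llvm::Value *slot = register_destructor(cleanup_fn, obj, OnError);
    //     // ...later, either neuter it once ownership is transferred:
    //     builder->CreateStore(llvm::ConstantPointerNull::get(ptr_type), slot);
    //     // ...or destroy the object early:
    //     trigger_destructor(cleanup_fn, slot);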

    /** Retrieves the block containing the error handling
     * code. Creates it if it doesn't already exist for this
     * function. */
    llvm::BasicBlock *get_destructor_block();

    /** Codegen an assertion. If the condition is false, returns the
     * error code (if not null), or evaluates and returns the message,
     * which must be an Int(32) expression. */
    // @{
    void create_assertion(llvm::Value *condition, const Expr &message, llvm::Value *error_code = nullptr);
    // @}
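
    // A minimal usage sketch (hypothetical condition; `msg` would be an
    // Int(32) Expr, typically a call to a halide_error_* runtime function):
    //
    //     llvm::Value *ok = builder->CreateICmpEQ(actual, expected);
    //     create_assertion(ok, msg);  // on failure, runs destructors and
    //                                 // returns the error code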

    /** Codegen a block of asserts with pure conditions. */
    void codegen_asserts(const std::vector<const AssertStmt *> &asserts);

    /** Codegen a call to do_parallel_tasks */
    struct ParallelTask {
        Stmt body;
        struct SemAcquire {
            Expr semaphore;
            Expr count;
        };
        std::vector<SemAcquire> semaphores;
        std::string loop_var;
        Expr min, extent;
        Expr serial;
        std::string name;
    };
    int task_depth;
    void get_parallel_tasks(const Stmt &s, std::vector<ParallelTask> &tasks, std::pair<std::string, int> prefix);
    void do_parallel_tasks(const std::vector<ParallelTask> &tasks);
    void do_as_parallel_task(const Stmt &s);

    /** Return from the pipeline with the given error code. Will run
     * the destructor block. */
    void return_with_error_code(llvm::Value *error_code);

    /** Put a string constant in the module as a global variable and return a pointer to it. */
    llvm::Constant *create_string_constant(const std::string &str);

    /** Put a binary blob in the module as a global variable and return a pointer to it. */
    llvm::Constant *create_binary_blob(const std::vector<char> &data, const std::string &name, bool constant = true);

    /** Widen an llvm scalar into an llvm vector with the given number of lanes. */
    llvm::Value *create_broadcast(llvm::Value *, int lanes);

    /** Generate a pointer into a named buffer at a given index, of a
     * given type. The index counts according to the scalar type of
     * the type passed in. */
    // @{
    llvm::Value *codegen_buffer_pointer(const std::string &buffer, Type type, llvm::Value *index);
    llvm::Value *codegen_buffer_pointer(const std::string &buffer, Type type, Expr index);
    llvm::Value *codegen_buffer_pointer(llvm::Value *base_address, Type type, Expr index);
    llvm::Value *codegen_buffer_pointer(llvm::Value *base_address, Type type, llvm::Value *index);
    // @}
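
    // A minimal usage sketch (roughly what a dense scalar load does;
    // llvm_type_of and add_tbaa_metadata are declared below):
    //
    //     llvm::Value *ptr = codegen_buffer_pointer(op->name, op->type, op->index);
    //     llvm::LoadInst *load = builder->CreateLoad(llvm_type_of(op->type), ptr);
    //     add_tbaa_metadata(load, op->name, op->index);
    //     value = load;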

    /** Turn a Halide Type into an llvm::Value representing a constant halide_type_t */
    llvm::Value *make_halide_type_t(const Type &);

    /** Mark a load or store with type-based-alias-analysis metadata
     * so that llvm knows it can reorder loads and stores across
     * different buffers. */
    void add_tbaa_metadata(llvm::Instruction *inst, std::string buffer, const Expr &index);

    /** Get a unique name for the actual block of memory that an
     * allocate node uses. Used so that alias analysis understands
     * when multiple Allocate nodes share the same memory. */
    virtual std::string get_allocation_name(const std::string &n) {
        return n;
    }

    using IRVisitor::visit;

    /** Generate code for various IR nodes. These can be overridden by
     * architecture-specific code to perform peephole
     * optimizations. The result of each is stored in \ref value */
    // @{
    void visit(const IntImm *) override;
    void visit(const UIntImm *) override;
    void visit(const FloatImm *) override;
    void visit(const StringImm *) override;
    void visit(const Cast *) override;
    void visit(const Variable *) override;
    void visit(const Add *) override;
    void visit(const Sub *) override;
    void visit(const Mul *) override;
    void visit(const Div *) override;
    void visit(const Mod *) override;
    void visit(const Min *) override;
    void visit(const Max *) override;
    void visit(const EQ *) override;
    void visit(const NE *) override;
    void visit(const LT *) override;
    void visit(const LE *) override;
    void visit(const GT *) override;
    void visit(const GE *) override;
    void visit(const And *) override;
    void visit(const Or *) override;
    void visit(const Not *) override;
    void visit(const Select *) override;
    void visit(const Load *) override;
    void visit(const Ramp *) override;
    void visit(const Broadcast *) override;
    void visit(const Call *) override;
    void visit(const Let *) override;
    void visit(const LetStmt *) override;
    void visit(const AssertStmt *) override;
    void visit(const ProducerConsumer *) override;
    void visit(const For *) override;
    void visit(const Acquire *) override;
    void visit(const Store *) override;
    void visit(const Block *) override;
    void visit(const Fork *) override;
    void visit(const IfThenElse *) override;
    void visit(const Evaluate *) override;
    void visit(const Shuffle *) override;
    void visit(const VectorReduce *) override;
    void visit(const Prefetch *) override;
    void visit(const Atomic *) override;
    // @}

    /** Generate code for an allocate node. It has no default
     * implementation; it must be handled in an architecture-specific
     * way. */
    void visit(const Allocate *) override = 0;

    /** Generate code for a free node. It has no default
     * implementation and must be handled in an architecture-specific
     * way. */
    void visit(const Free *) override = 0;

    /** These IR nodes should have been removed during
     * lowering. CodeGen_LLVM will error out if they are present. */
    // @{
    void visit(const Provide *) override;
    void visit(const Realize *) override;
    // @}

    /** If we have to bail out of a pipeline midway, this should
     * inject the appropriate target-specific cleanup code. */
    virtual void prepare_for_early_exit() {
    }

    /** Get the llvm type equivalent to the given Halide type in the
     * current context. */
    virtual llvm::Type *llvm_type_of(const Type &) const;

    /** Perform an alloca at the function entrypoint. Will be cleaned
     * up on function exit. */
    llvm::Value *create_alloca_at_entry(llvm::Type *type, int n,
                                        bool zero_initialize = false,
                                        const std::string &name = "");

    /** A (very) conservative guess at the size of all alloca() storage requested
     * (including alignment padding). It's currently meant only to be used as
     * a very coarse way to ensure there is enough stack space when testing
     * on the WebAssembly backend.
     *
     * It is *not* meant to be a useful proxy for "stack space needed", for a
     * number of reasons:
     * - allocas with non-overlapping lifetimes will share space
     * - on some backends, LLVM may promote register-sized allocas into registers
     * - while this accounts for alloca() calls we know about, it doesn't attempt
     *   to account for stack spills, function call overhead, etc.
     */
    size_t requested_alloca_total = 0;

    /** Which buffers came in from the outside world (and so we can't
     * guarantee their alignment) */
    std::set<std::string> external_buffer;

    /** The user_context argument. May be a constant null if the
     * function is being compiled without a user context. */
    llvm::Value *get_user_context() const;

    /** Implementation of the intrinsic call to
     * interleave_vectors. This implementation allows for interleaving
     * an arbitrary number of vectors. */
    virtual llvm::Value *interleave_vectors(const std::vector<llvm::Value *> &);

    /** Generate a call to a vector intrinsic or runtime inlined
     * function. The arguments are sliced up into vectors of the width
     * given by 'intrin_lanes', the intrinsic is called on each
     * piece, then the results (if any) are concatenated back together
     * into the original type 't'. For the version that takes an
     * llvm::Type *, the type may be void, so the vector width of the
     * arguments must be specified explicitly as
     * 'intrin_lanes'. */
    // @{
    llvm::Value *call_intrin(const Type &t, int intrin_lanes,
                             const std::string &name, std::vector<Expr>);
    llvm::Value *call_intrin(llvm::Type *t, int intrin_lanes,
                             const std::string &name, std::vector<llvm::Value *>);
    // @}
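
    // A minimal usage sketch (hypothetical subclass peephole; the intrinsic
    // name and the 8-lane native width are placeholders):
    //
    //     // Evaluate op as 8-lane slices of the named intrinsic, then
    //     // concatenate the slices back into a value of op->type:
    //     value = call_intrin(op->type, 8, "llvm.some.saturating.add",
    //                         {op->a, op->b});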

    /** Take a slice of lanes out of an llvm vector. Pads with undefs
     * if you ask for more lanes than the vector has. */
    virtual llvm::Value *slice_vector(llvm::Value *vec, int start, int extent);

    /** Concatenate a bunch of llvm vectors. Must be of the same type. */
    virtual llvm::Value *concat_vectors(const std::vector<llvm::Value *> &);

    /** Create an LLVM shuffle vectors instruction. */
    virtual llvm::Value *shuffle_vectors(llvm::Value *a, llvm::Value *b,
                                         const std::vector<int> &indices);
    /** Shorthand for shuffling a vector with an undef vector. */
    llvm::Value *shuffle_vectors(llvm::Value *v, const std::vector<int> &indices);
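
    // A minimal usage sketch (hypothetical 4-lane vectors `a` and `b`):
    //
    //     llvm::Value *lo  = slice_vector(a, 0, 2);             // lanes 0..1 of a
    //     llvm::Value *ab  = concat_vectors({a, b});            // 8-lane vector
    //     llvm::Value *rev = shuffle_vectors(a, {3, 2, 1, 0});  // reverse a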

    /** Go looking for a vector version of a runtime function. Will
     * return the best match. Matches in the following order:
     *
     * 1) The requested vector width.
     *
     * 2) The width which is the smallest power of two
     * greater than or equal to the vector width.
     *
     * 3) All the factors of 2) greater than one, in decreasing order.
     *
     * 4) The smallest power of two not yet tried.
     *
     * So for a 5-wide vector, it tries: 5, 8, 4, 2, 16.
     *
     * If there's no match, returns (nullptr, 0).
     */
    std::pair<llvm::Function *, int> find_vector_runtime_function(const std::string &name, int lanes);

    virtual bool supports_atomic_add(const Type &t) const;

    /** Compile a horizontal reduction that starts with an explicit
     * initial value. There are lots of complex ways to peephole
     * optimize this pattern, especially with the proliferation of
     * dot-product instructions, and they can usefully share logic
     * across backends. */
    virtual void codegen_vector_reduce(const VectorReduce *op, const Expr &init);

    /** Are we inside an atomic node that uses mutex locks?
     * This is used for detecting deadlocks from nested atomics & illegal vectorization. */
    bool inside_atomic_mutex_node;

    /** Emit atomic store instructions? */
    bool emit_atomic_stores;

private:
    /** All the values in scope at the current code location during
     * codegen. Use sym_push and sym_pop to access. */
    Scope<llvm::Value *> symbol_table;

    /** String constants already emitted to the module. Tracked to
     * prevent emitting the same string many times. */
    std::map<std::string, llvm::Constant *> string_constants;

    /** A basic block to branch to on error that triggers all
     * destructors. As destructors are registered, code gets added
     * to this block. */
    llvm::BasicBlock *destructor_block;

    /** Turn off all unsafe math flags in scopes while this is set. */
    bool strict_float;

    /** Embed an instance of halide_filter_metadata_t in the code, using
     * the given name (by convention, this should be ${FUNCTIONNAME}_metadata)
     * with extern "C" linkage. Note that the return value is a function-returning-
     * pointer-to-constant-data.
     */
    llvm::Function *embed_metadata_getter(const std::string &metadata_getter_name,
                                          const std::string &function_name, const std::vector<LoweredArgument> &args,
                                          const std::map<std::string, std::string> &metadata_name_map);

    /** Embed a constant expression as a global variable. */
    llvm::Constant *embed_constant_expr(Expr e, llvm::Type *t);
    llvm::Constant *embed_constant_scalar_value_t(const Expr &e);

    llvm::Function *add_argv_wrapper(llvm::Function *fn, const std::string &name, bool result_in_argv = false);

    llvm::Value *codegen_dense_vector_load(const Load *load, llvm::Value *vpred = nullptr);

    virtual void codegen_predicated_vector_load(const Load *op);
    virtual void codegen_predicated_vector_store(const Store *op);

    void codegen_atomic_store(const Store *op);

    void init_codegen(const std::string &name, bool any_strict_float = false);
    std::unique_ptr<llvm::Module> finish_codegen();

    /** A helper routine for generating folded vector reductions. */
    template<typename Op>
    bool try_to_fold_vector_reduce(const Op *op);
};

}  // namespace Internal

/** Given a Halide module, generate an llvm::Module. */
std::unique_ptr<llvm::Module> codegen_llvm(const Module &module,
                                           llvm::LLVMContext &context);

}  // namespace Halide

#endif