#ifndef HALIDE_CODEGEN_LLVM_H
#define HALIDE_CODEGEN_LLVM_H

/** \file
 *
 * Defines the base-class for all architecture-specific code
 * generators that use llvm.
 */

namespace llvm {
class Value;
class Module;
class Function;
class FunctionType;
class IRBuilderDefaultInserter;
class ConstantFolder;
template<typename, typename>
class IRBuilder;
class LLVMContext;
class Type;
class StructType;
class Instruction;
class CallInst;
class ExecutionEngine;
class AllocaInst;
class Constant;
class Triple;
class MDNode;
class NamedMDNode;
class DataLayout;
class BasicBlock;
class GlobalVariable;
}  // namespace llvm

#include <map>
#include <memory>
#include <set>
#include <string>
#include <utility>
#include <vector>

#include "IRVisitor.h"
#include "Module.h"
#include "Scope.h"
#include "Target.h"

namespace Halide {
struct ExternSignature;
}  // namespace Halide

namespace Halide {
namespace Internal {

/** A code generator abstract base class. Actual code generators
 * (e.g. CodeGen_X86) inherit from this. This class is responsible
 * for taking a Halide Stmt and producing llvm bitcode, machine
 * code in an object file, or machine code accessible through a
 * function pointer.
 */
class CodeGen_LLVM : public IRVisitor {
public:
    /** Create an instance of CodeGen_LLVM suitable for the target. */
    static CodeGen_LLVM *new_for_target(const Target &target,
                                        llvm::LLVMContext &context);

    ~CodeGen_LLVM() override;

    /** Takes a halide Module and compiles it to an llvm Module. */
    virtual std::unique_ptr<llvm::Module> compile(const Module &module);

    /** The target we're generating code for. */
    const Target &get_target() const {
        return target;
    }

    /** Tell the code generator which LLVM context to use. */
    void set_context(llvm::LLVMContext &context);

    /** Initialize internal llvm state for the enabled targets. */
    static void initialize_llvm();

    static std::unique_ptr<llvm::Module> compile_trampolines(
        const Target &target,
        llvm::LLVMContext &context,
        const std::string &suffix,
        const std::vector<std::pair<std::string, ExternSignature>> &externs);

    size_t get_requested_alloca_total() const {
        return requested_alloca_total;
    }
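
    /* A minimal usage sketch of the public interface: initialize llvm
     * once per process, create a code generator for a target, and
     * compile a lowered Halide module with it. `my_module` is a
     * hypothetical placeholder for a Halide::Module you already have.
     *
     *     llvm::LLVMContext context;
     *     CodeGen_LLVM::initialize_llvm();
     *     std::unique_ptr<CodeGen_LLVM> cg(
     *         CodeGen_LLVM::new_for_target(my_module.target(), context));
     *     std::unique_ptr<llvm::Module> out = cg->compile(my_module);
     */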

protected:
    CodeGen_LLVM(Target t);

    /** Compile a specific halide declaration into the llvm Module. */
    // @{
    virtual void compile_func(const LoweredFunc &func, const std::string &simple_name, const std::string &extern_name);
    virtual void compile_buffer(const Buffer<> &buffer);
    // @}

    /** Helper functions for compiling Halide functions to llvm
     * functions. begin_func performs all the work necessary to begin
     * generating code for a function with a given argument list with
     * the IRBuilder. A call to begin_func should be followed by a
     * call to end_func with the same arguments, to generate the
     * appropriate cleanup code. */
    // @{
    virtual void begin_func(LinkageType linkage, const std::string &simple_name,
                            const std::string &extern_name, const std::vector<LoweredArgument> &args);
    virtual void end_func(const std::vector<LoweredArgument> &args);
    // @}

    /** What should be passed as -mcpu, -mattrs, and related for
     * compilation. The architecture-specific code generator should
     * define these. */
    // @{
    virtual std::string mcpu() const = 0;
    virtual std::string mattrs() const = 0;
    virtual bool use_soft_float_abi() const = 0;
    virtual bool use_pic() const;
    // @}

    /** Should indexing math be promoted to 64-bit on platforms with
     * 64-bit pointers? */
    virtual bool promote_indices() const {
        return true;
    }

    /** What's the natural vector bit-width to use for loads, stores, etc. */
    virtual int native_vector_bits() const = 0;

    /** Return the type in which arithmetic should be done for the
     * given storage type. */
    virtual Type upgrade_type_for_arithmetic(const Type &) const;

    /** Return the type that a given Halide type should be
     * stored/loaded from memory as. */
    virtual Type upgrade_type_for_storage(const Type &) const;

    /** Return the type that a Halide type should be passed in and out
     * of functions as. */
    virtual Type upgrade_type_for_argument_passing(const Type &) const;

    /** State needed by llvm for code generation, including the
     * current module, function, context, builder, and most recently
     * generated llvm value. */
    // @{
    static bool llvm_X86_enabled;
    static bool llvm_ARM_enabled;
    static bool llvm_Hexagon_enabled;
    static bool llvm_AArch64_enabled;
    static bool llvm_NVPTX_enabled;
    static bool llvm_Mips_enabled;
    static bool llvm_PowerPC_enabled;
    static bool llvm_AMDGPU_enabled;
    static bool llvm_WebAssembly_enabled;
    static bool llvm_RISCV_enabled;

    std::unique_ptr<llvm::Module> module;
    llvm::Function *function;
    llvm::LLVMContext *context;
    llvm::IRBuilder<llvm::ConstantFolder, llvm::IRBuilderDefaultInserter> *builder;
    llvm::Value *value;
    llvm::MDNode *very_likely_branch;
    llvm::MDNode *default_fp_math_md;
    llvm::MDNode *strict_fp_math_md;
    std::vector<LoweredArgument> current_function_args;
    // @}

    /** The target we're generating code for */
    Halide::Target target;

    /** Grab all the context specific internal state. */
    virtual void init_context();

    /** Initialize the CodeGen_LLVM internal state to compile a fresh
     * module. This allows reuse of one CodeGen_LLVM object to compile
     * multiple related modules (e.g. multiple device kernels). */
    virtual void init_module();

    /** Add external_code entries to llvm module. */
    void add_external_code(const Module &halide_module);

    /** Run all of llvm's optimization passes on the module. */
    void optimize_module();

    /** Add an entry to the symbol table, hiding previous entries with
     * the same name. Call this when new values come into scope. */
    void sym_push(const std::string &name, llvm::Value *value);

    /** Remove an entry from the symbol table, revealing any previous
     * entries with the same name. Call this when values go out of
     * scope. */
    void sym_pop(const std::string &name);

    /** Fetch an entry from the symbol table. If the symbol is not
     * found, it either errors out (if the second arg is true), or
     * returns nullptr. */
    llvm::Value *sym_get(const std::string &name,
                         bool must_succeed = true) const;

    /** Test if an item exists in the symbol table. */
    bool sym_exists(const std::string &name) const;
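
    /* A minimal sketch of the intended sym_push/sym_pop pairing,
     * loosely following how a Let node is lowered: the bound value is
     * visible in the symbol table only while the body is codegenned.
     *
     *     void visit(const Let *op) override {
     *         sym_push(op->name, codegen(op->value));
     *         value = codegen(op->body);
     *         sym_pop(op->name);
     *     }
     */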

    /** Given a Halide ExternSignature, return the equivalent llvm::FunctionType. */
    llvm::FunctionType *signature_to_type(const ExternSignature &signature);

    /** Some useful llvm types */
    // @{
    llvm::Type *void_t, *i1_t, *i8_t, *i16_t, *i32_t, *i64_t, *f16_t, *f32_t, *f64_t;
    llvm::StructType *halide_buffer_t_type,
        *type_t_type,
        *dimension_t_type,
        *metadata_t_type,
        *argument_t_type,
        *scalar_value_t_type,
        *device_interface_t_type,
        *pseudostack_slot_t_type,
        *semaphore_t_type,
        *semaphore_acquire_t_type,
        *parallel_task_t_type;
    // @}

    /** Some useful llvm types for subclasses */
    // @{
    llvm::Type *i8x8, *i8x16, *i8x32;
    llvm::Type *i16x4, *i16x8, *i16x16;
    llvm::Type *i32x2, *i32x4, *i32x8;
    llvm::Type *i64x2, *i64x4;
    llvm::Type *f32x2, *f32x4, *f32x8;
    llvm::Type *f64x2, *f64x4;
    // @}

    /** Some wildcard variables used for peephole optimizations in
     * subclasses */
    // @{
    Expr wild_i8x8, wild_i16x4, wild_i32x2;                // 64-bit signed ints
    Expr wild_u8x8, wild_u16x4, wild_u32x2;                // 64-bit unsigned ints
    Expr wild_i8x16, wild_i16x8, wild_i32x4, wild_i64x2;   // 128-bit signed ints
    Expr wild_u8x16, wild_u16x8, wild_u32x4, wild_u64x2;   // 128-bit unsigned ints
    Expr wild_i8x32, wild_i16x16, wild_i32x8, wild_i64x4;  // 256-bit signed ints
    Expr wild_u8x32, wild_u16x16, wild_u32x8, wild_u64x4;  // 256-bit unsigned ints

    Expr wild_f32x2;              // 64-bit floats
    Expr wild_f32x4, wild_f64x2;  // 128-bit floats
    Expr wild_f32x8, wild_f64x4;  // 256-bit floats

    // Wildcards for a varying number of lanes.
    Expr wild_u1x_, wild_i8x_, wild_u8x_, wild_i16x_, wild_u16x_;
    Expr wild_i32x_, wild_u32x_, wild_i64x_, wild_u64x_;
    Expr wild_f32x_, wild_f64x_;
    Expr min_i8, max_i8, max_u8;
    Expr min_i16, max_i16, max_u16;
    Expr min_i32, max_i32, max_u32;
    Expr min_i64, max_i64, max_u64;
    Expr min_f32, max_f32, min_f64, max_f64;
    // @}

    /** Emit code that evaluates an expression, and return the llvm
     * representation of the result of the expression. */
    llvm::Value *codegen(const Expr &);

    /** Emit code that runs a statement. */
    void codegen(const Stmt &);

    /** Codegen a vector Expr by codegenning each lane and combining. */
    void scalarize(const Expr &);

    /** Some destructors should always be called. Others should only
     * be called if the pipeline is exiting with an error code. */
    enum DestructorType { Always,
                          OnError,
                          OnSuccess };

    /** Call this at the location of object creation to register how an
     * object should be destroyed. This does three things:
     * 1) Emits code here that puts the object in a unique
     * null-initialized stack slot
     * 2) Adds an instruction to the destructor block that calls the
     * destructor on that stack slot if it's not null.
     * 3) Returns that stack slot, so you can neuter the destructor
     * (by storing null to the stack slot) or destroy the object early
     * (by calling trigger_destructor).
     */
    llvm::Value *register_destructor(llvm::Function *destructor_fn, llvm::Value *obj, DestructorType when);

    /** Call a destructor early. Pass in the value returned by
     * register_destructor. */
    void trigger_destructor(llvm::Function *destructor_fn, llvm::Value *stack_slot);

    /** Retrieves the block containing the error handling
     * code. Creates it if it doesn't already exist for this
     * function. */
    llvm::BasicBlock *get_destructor_block();
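
    /* A minimal sketch of the destructor-registration pattern.
     * `free_fn` and `obj` are hypothetical placeholders for a runtime
     * cleanup function and a value it should clean up.
     *
     *     // Arrange for obj to be freed if the pipeline errors out:
     *     llvm::Value *slot = register_destructor(free_fn, obj, OnError);
     *     // ...to destroy the object early instead:
     *     trigger_destructor(free_fn, slot);
     *     // ...or to hand off ownership, neuter the destructor by
     *     // storing null to the returned stack slot.
     */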

    /** Codegen an assertion. If the condition is false, returns the
     * error code (if not null), or evaluates and returns the message,
     * which must be an Int(32) expression. */
    // @{
    void create_assertion(llvm::Value *condition, const Expr &message, llvm::Value *error_code = nullptr);
    // @}

    /** Codegen a block of asserts with pure conditions */
    void codegen_asserts(const std::vector<const AssertStmt *> &asserts);

    /** Codegen a call to do_parallel_tasks */
    struct ParallelTask {
        Stmt body;
        struct SemAcquire {
            Expr semaphore;
            Expr count;
        };
        std::vector<SemAcquire> semaphores;
        std::string loop_var;
        Expr min, extent;
        Expr serial;
        std::string name;
    };
    int task_depth;
    void get_parallel_tasks(const Stmt &s, std::vector<ParallelTask> &tasks, std::pair<std::string, int> prefix);
    void do_parallel_tasks(const std::vector<ParallelTask> &tasks);
    void do_as_parallel_task(const Stmt &s);

    /** Return from the pipeline with the given error code. Will run
     * the destructor block. */
    void return_with_error_code(llvm::Value *error_code);

    /** Put a string constant in the module as a global variable and return a pointer to it. */
    llvm::Constant *create_string_constant(const std::string &str);

    /** Put a binary blob in the module as a global variable and return a pointer to it. */
    llvm::Constant *create_binary_blob(const std::vector<char> &data, const std::string &name, bool constant = true);

    /** Widen an llvm scalar into an llvm vector with the given number of lanes. */
    llvm::Value *create_broadcast(llvm::Value *, int lanes);

    /** Generate a pointer into a named buffer at a given index, of a
     * given type. The index counts according to the scalar type of
     * the type passed in. */
    // @{
    llvm::Value *codegen_buffer_pointer(const std::string &buffer, Type type, llvm::Value *index);
    llvm::Value *codegen_buffer_pointer(const std::string &buffer, Type type, Expr index);
    llvm::Value *codegen_buffer_pointer(llvm::Value *base_address, Type type, Expr index);
    llvm::Value *codegen_buffer_pointer(llvm::Value *base_address, Type type, llvm::Value *index);
    // @}
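
    /* A minimal sketch of emitting a scalar load through a buffer
     * pointer; `op` stands in for a Load node being visited, and the
     * exact IRBuilder load call varies across LLVM versions.
     *
     *     llvm::Value *ptr =
     *         codegen_buffer_pointer(op->name, op->type, op->index);
     *     llvm::LoadInst *load =
     *         builder->CreateLoad(llvm_type_of(op->type), ptr);
     *     add_tbaa_metadata(load, op->name, op->index);
     *     value = load;
     */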

    /** Turn a Halide Type into an llvm::Value representing a constant halide_type_t */
    llvm::Value *make_halide_type_t(const Type &);

    /** Mark a load or store with type-based-alias-analysis metadata
     * so that llvm knows it can reorder loads and stores across
     * different buffers */
    void add_tbaa_metadata(llvm::Instruction *inst, std::string buffer, const Expr &index);

    /** Get a unique name for the actual block of memory that an
     * allocate node uses. Used so that alias analysis understands
     * when multiple Allocate nodes share the same memory. */
    virtual std::string get_allocation_name(const std::string &n) {
        return n;
    }

    using IRVisitor::visit;

    /** Generate code for various IR nodes. These can be overridden by
     * architecture-specific code to perform peephole
     * optimizations. The result of each is stored in \ref value */
    // @{
    void visit(const IntImm *) override;
    void visit(const UIntImm *) override;
    void visit(const FloatImm *) override;
    void visit(const StringImm *) override;
    void visit(const Cast *) override;
    void visit(const Variable *) override;
    void visit(const Add *) override;
    void visit(const Sub *) override;
    void visit(const Mul *) override;
    void visit(const Div *) override;
    void visit(const Mod *) override;
    void visit(const Min *) override;
    void visit(const Max *) override;
    void visit(const EQ *) override;
    void visit(const NE *) override;
    void visit(const LT *) override;
    void visit(const LE *) override;
    void visit(const GT *) override;
    void visit(const GE *) override;
    void visit(const And *) override;
    void visit(const Or *) override;
    void visit(const Not *) override;
    void visit(const Select *) override;
    void visit(const Load *) override;
    void visit(const Ramp *) override;
    void visit(const Broadcast *) override;
    void visit(const Call *) override;
    void visit(const Let *) override;
    void visit(const LetStmt *) override;
    void visit(const AssertStmt *) override;
    void visit(const ProducerConsumer *) override;
    void visit(const For *) override;
    void visit(const Acquire *) override;
    void visit(const Store *) override;
    void visit(const Block *) override;
    void visit(const Fork *) override;
    void visit(const IfThenElse *) override;
    void visit(const Evaluate *) override;
    void visit(const Shuffle *) override;
    void visit(const VectorReduce *) override;
    void visit(const Prefetch *) override;
    void visit(const Atomic *) override;
    // @}
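
    /* A minimal sketch of an architecture-specific peephole, loosely
     * modeled on how backends use the wildcard Exprs above together
     * with expr_match() from IRMatch.h. `CodeGen_MyArch` and the
     * intrinsic name are hypothetical.
     *
     *     void CodeGen_MyArch::visit(const Mul *op) {
     *         std::vector<Expr> matches;
     *         if (expr_match(wild_i16x8 * wild_i16x8, Expr(op), matches)) {
     *             // Call an 8-lane target-specific multiply.
     *             value = call_intrin(op->type, 8, "my_arch_mul_i16x8", matches);
     *         } else {
     *             CodeGen_LLVM::visit(op);  // default lowering
     *         }
     *     }
     */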

    /** Generate code for an allocate node. It has no default
     * implementation - it must be handled in an architecture-specific
     * way. */
    void visit(const Allocate *) override = 0;

    /** Generate code for a free node. It has no default
     * implementation and must be handled in an architecture-specific
     * way. */
    void visit(const Free *) override = 0;

    /** These IR nodes should have been removed during
     * lowering. CodeGen_LLVM will error out if they are present. */
    // @{
    void visit(const Provide *) override;
    void visit(const Realize *) override;
    // @}

    /** If we have to bail out of a pipeline midway, this should
     * inject the appropriate target-specific cleanup code. */
    virtual void prepare_for_early_exit() {
    }

    /** Get the llvm type equivalent to the given halide type in the
     * current context. */
    virtual llvm::Type *llvm_type_of(const Type &) const;

    /** Perform an alloca at the function entrypoint. Will be cleaned
     * on function exit. */
    llvm::Value *create_alloca_at_entry(llvm::Type *type, int n,
                                        bool zero_initialize = false,
                                        const std::string &name = "");

    /** A (very) conservative guess at the size of all alloca() storage requested
     * (including alignment padding). It's currently meant only to be used as
     * a very coarse way to ensure there is enough stack space when testing
     * on the WebAssembly backend.
     *
     * It is *not* meant to be a useful proxy for "stack space needed", for a
     * number of reasons:
     * - allocas with non-overlapping lifetimes will share space
     * - on some backends, LLVM may promote register-sized allocas into registers
     * - while this accounts for alloca() calls we know about, it doesn't attempt
     *   to account for stack spills, function call overhead, etc.
     */
    size_t requested_alloca_total = 0;

    /** Which buffers came in from the outside world (and so we can't
     * guarantee their alignment) */
    std::set<std::string> external_buffer;

    /** The user_context argument. May be a constant null if the
     * function is being compiled without a user context. */
    llvm::Value *get_user_context() const;

    /** Implementation of the intrinsic call to
     * interleave_vectors. This implementation allows for interleaving
     * an arbitrary number of vectors. */
    virtual llvm::Value *interleave_vectors(const std::vector<llvm::Value *> &);

    /** Generate a call to a vector intrinsic or runtime inlined
     * function. The arguments are sliced up into vectors of the width
     * given by 'intrin_lanes', the intrinsic is called on each
     * piece, then the results (if any) are concatenated back together
     * into the original type 't'. For the version that takes an
     * llvm::Type *, the type may be void, so the vector width of the
     * arguments must be specified explicitly as 'intrin_lanes'. */
    // @{
    llvm::Value *call_intrin(const Type &t, int intrin_lanes,
                             const std::string &name, std::vector<Expr>);
    llvm::Value *call_intrin(llvm::Type *t, int intrin_lanes,
                             const std::string &name, std::vector<llvm::Value *>);
    // @}
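
    /* A minimal sketch of the call_intrin contract: lowering a
     * 16-lane op through a hypothetical 8-lane target intrinsic.
     * call_intrin slices the 16-lane arguments into two 8-lane
     * pieces, calls the intrinsic on each, and concatenates the
     * results back into op->type.
     *
     *     // op->type is Int(16, 16); "my_arch_padd_i16x8" is hypothetical.
     *     value = call_intrin(op->type, 8, "my_arch_padd_i16x8",
     *                         {op->a, op->b});
     */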

    /** Take a slice of lanes out of an llvm vector. Pads with undefs
     * if you ask for more lanes than the vector has. */
    virtual llvm::Value *slice_vector(llvm::Value *vec, int start, int extent);

    /** Concatenate a bunch of llvm vectors. Must be of the same type. */
    virtual llvm::Value *concat_vectors(const std::vector<llvm::Value *> &);

    /** Create an LLVM shuffle vectors instruction. */
    virtual llvm::Value *shuffle_vectors(llvm::Value *a, llvm::Value *b,
                                         const std::vector<int> &indices);

    /** Shorthand for shuffling a vector with an undef vector. */
    llvm::Value *shuffle_vectors(llvm::Value *v, const std::vector<int> &indices);

    /** Go looking for a vector version of a runtime function. Will
     * return the best match. Matches in the following order:
     *
     * 1) The requested vector width.
     *
     * 2) The width which is the smallest power of two
     * greater than or equal to the vector width.
     *
     * 3) All the factors of 2) greater than one, in decreasing order.
     *
     * 4) The smallest power of two not yet tried.
     *
     * So for a 5-wide vector, it tries: 5, 8, 4, 2, 16.
     *
     * If there's no match, returns (nullptr, 0).
     */
    std::pair<llvm::Function *, int> find_vector_runtime_function(const std::string &name, int lanes);

    virtual bool supports_atomic_add(const Type &t) const;

    /** Compile a horizontal reduction that starts with an explicit
     * initial value. There are lots of complex ways to peephole
     * optimize this pattern, especially with the proliferation of
     * dot-product instructions, and they can usefully share logic
     * across backends. */
    virtual void codegen_vector_reduce(const VectorReduce *op, const Expr &init);

    /** Are we inside an atomic node that uses mutex locks?
     * This is used for detecting deadlocks from nested atomics & illegal vectorization. */
    bool inside_atomic_mutex_node;

    /** Emit atomic store instructions? */
    bool emit_atomic_stores;

private:
    /** All the values in scope at the current code location during
     * codegen. Use sym_push and sym_pop to access. */
    Scope<llvm::Value *> symbol_table;

    /** String constants already emitted to the module. Tracked to
     * prevent emitting the same string many times. */
    std::map<std::string, llvm::Constant *> string_constants;

    /** A basic block to branch to on error that triggers all
     * destructors. As destructors are registered, code gets added
     * to this block. */
    llvm::BasicBlock *destructor_block;

    /** Turn off all unsafe math flags in scopes while this is set. */
    bool strict_float;

    /** Embed an instance of halide_filter_metadata_t in the code, using
     * the given name (by convention, this should be ${FUNCTIONNAME}_metadata)
     * as extern "C" linkage. Note that the return value is a function-returning-
     * pointer-to-constant-data.
     */
    llvm::Function *embed_metadata_getter(const std::string &metadata_getter_name,
                                          const std::string &function_name, const std::vector<LoweredArgument> &args,
                                          const std::map<std::string, std::string> &metadata_name_map);

    /** Embed a constant expression as a global variable. */
    llvm::Constant *embed_constant_expr(Expr e, llvm::Type *t);
    llvm::Constant *embed_constant_scalar_value_t(const Expr &e);

    llvm::Function *add_argv_wrapper(llvm::Function *fn, const std::string &name, bool result_in_argv = false);

    llvm::Value *codegen_dense_vector_load(const Load *load, llvm::Value *vpred = nullptr);

    virtual void codegen_predicated_vector_load(const Load *op);
    virtual void codegen_predicated_vector_store(const Store *op);

    void codegen_atomic_store(const Store *op);

    void init_codegen(const std::string &name, bool any_strict_float = false);
    std::unique_ptr<llvm::Module> finish_codegen();

    /** A helper routine for generating folded vector reductions. */
    template<typename Op>
    bool try_to_fold_vector_reduce(const Op *op);
};

}  // namespace Internal

/** Given a Halide module, generate an llvm::Module. */
std::unique_ptr<llvm::Module> codegen_llvm(const Module &module,
                                           llvm::LLVMContext &context);

}  // namespace Halide

#endif