1 #include "CodeGen_PTX_Dev.h"
2 #include "CSE.h"
3 #include "CodeGen_Internal.h"
4 #include "Debug.h"
5 #include "ExprUsesVar.h"
6 #include "IREquality.h"
7 #include "IRMatch.h"
8 #include "IRMutator.h"
9 #include "IROperator.h"
10 #include "IRPrinter.h"
11 #include "LLVM_Headers.h"
12 #include "LLVM_Runtime_Linker.h"
13 #include "Simplify.h"
14 #include "Solve.h"
15 #include "Target.h"
16 
17 #include <fstream>
18 
// This is declared in NVPTX.h, which is not exported. Ugly, but seems better than
// hardcoding a path to the .h file.
#ifdef WITH_NVPTX
namespace llvm {
FunctionPass *createNVVMReflectPass(const StringMap<int> &Mapping);
}
#endif

namespace Halide {
namespace Internal {

using std::string;
using std::vector;

using namespace llvm;

CodeGen_PTX_Dev::CodeGen_PTX_Dev(Target host)
    : CodeGen_LLVM(host) {
#if !defined(WITH_NVPTX)
    user_error << "ptx not enabled for this build of Halide.\n";
#endif
    user_assert(llvm_NVPTX_enabled) << "llvm build not configured with nvptx target enabled.\n";

    context = new llvm::LLVMContext();
}

CodeGen_PTX_Dev::~CodeGen_PTX_Dev() {
    // This is required, as destroying the context before the module
    // results in a crash. Really, responsibility for destruction
    // should be entirely in the parent class.
    // TODO: Figure out how to better manage the context -- e.g. allow using
    // the same one as the host.
    module.reset();
    delete context;
}

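// Keep 16-bit floats as-is for storage (PTX handles 16-bit values in memory
// directly); defer everything else to the CodeGen_LLVM default.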
Type CodeGen_PTX_Dev::upgrade_type_for_storage(const Type &t) const {
    if (t.element_of() == Float(16)) return t;
    return CodeGen_LLVM::upgrade_type_for_storage(t);
}

void CodeGen_PTX_Dev::add_kernel(Stmt stmt,
                                 const std::string &name,
                                 const std::vector<DeviceArgument> &args) {
    internal_assert(module != nullptr);

    debug(2) << "In CodeGen_PTX_Dev::add_kernel\n";

    // Now deduce the types of the arguments to our function
    vector<llvm::Type *> arg_types(args.size());
    for (size_t i = 0; i < args.size(); i++) {
        if (args[i].is_buffer) {
            arg_types[i] = llvm_type_of(UInt(8))->getPointerTo();
        } else {
            arg_types[i] = llvm_type_of(args[i].type);
        }
    }

    // Make our function
    FunctionType *func_t = FunctionType::get(void_t, arg_types, false);
    function = llvm::Function::Create(func_t, llvm::Function::ExternalLinkage, name, module.get());
    set_function_attributes_for_target(function, target);

    // Mark the buffer args as no alias
    for (size_t i = 0; i < args.size(); i++) {
        if (args[i].is_buffer) {
            function->addParamAttr(i, Attribute::NoAlias);
        }
    }

    // Make the initial basic block
    entry_block = BasicBlock::Create(*context, "entry", function);
    builder->SetInsertPoint(entry_block);

    // Put the arguments in the symbol table
    vector<string> arg_sym_names;
    {
        size_t i = 0;
        for (auto &fn_arg : function->args()) {

            string arg_sym_name = args[i].name;
            sym_push(arg_sym_name, &fn_arg);
            fn_arg.setName(arg_sym_name);
            arg_sym_names.push_back(arg_sym_name);

            i++;
        }
    }

    // We won't end the entry block yet, because we'll want to add
    // some allocas to it later if there are local allocations. Start
    // a new block to put all the code.
    BasicBlock *body_block = BasicBlock::Create(*context, "body", function);
    builder->SetInsertPoint(body_block);

    debug(1) << "Generating llvm bitcode for kernel...\n";
    // Ok, we have a module, function, context, and a builder
    // pointing at a brand new basic block. We're good to go.
    stmt.accept(this);

    // Now we need to end the function
    builder->CreateRetVoid();

    // Make the entry block point to the body block
    builder->SetInsertPoint(entry_block);
    builder->CreateBr(body_block);

    // Add the nvvm annotation that it is a kernel function.
    llvm::Metadata *md_args[] = {
        llvm::ValueAsMetadata::get(function),
        MDString::get(*context, "kernel"),
        llvm::ValueAsMetadata::get(ConstantInt::get(i32_t, 1))};

    MDNode *md_node = MDNode::get(*context, md_args);

    module->getOrInsertNamedMetadata("nvvm.annotations")->addOperand(md_node);

    // Now verify the function is ok
    verifyFunction(*function);

    // Finally, verify the module is ok
    verifyModule(*module);

    debug(2) << "Done generating llvm bitcode for PTX\n";

    // Clear the symbol table
    for (size_t i = 0; i < arg_sym_names.size(); i++) {
        sym_pop(arg_sym_names[i]);
    }
}

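// Build a fresh module for the PTX device target, into which kernels are
// added via add_kernel() above.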
void CodeGen_PTX_Dev::init_module() {
    init_context();

#ifdef WITH_NVPTX
    module = get_initial_module_for_ptx_device(target, context);
#endif
}

void CodeGen_PTX_Dev::visit(const Call *op) {
    if (op->is_intrinsic(Call::gpu_thread_barrier)) {
        // Even though we always insert a __syncthreads equivalent
        // (which has both a device and shared memory fence),
        // check to make sure the intrinsic has the right number of
        // arguments.
        internal_assert(op->args.size() == 1) << "gpu_thread_barrier() intrinsic must specify memory fence type.\n";

        auto fence_type_ptr = as_const_int(op->args[0]);
        internal_assert(fence_type_ptr) << "gpu_thread_barrier() parameter is not a constant integer.\n";

        llvm::Function *barrier0 = module->getFunction("llvm.nvvm.barrier0");
        internal_assert(barrier0) << "Could not find PTX barrier intrinsic (llvm.nvvm.barrier0)\n";
        builder->CreateCall(barrier0);
        value = ConstantInt::get(i32_t, 0);
    } else {
        CodeGen_LLVM::visit(op);
    }
}

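// Map a Halide GPU loop variable name (e.g. "foo.__thread_id_x") to the NVVM
// special-register read intrinsic that returns the corresponding thread or
// block index.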
string CodeGen_PTX_Dev::simt_intrinsic(const string &name) {
    if (ends_with(name, ".__thread_id_x")) {
        return "llvm.nvvm.read.ptx.sreg.tid.x";
    } else if (ends_with(name, ".__thread_id_y")) {
        return "llvm.nvvm.read.ptx.sreg.tid.y";
    } else if (ends_with(name, ".__thread_id_z")) {
        return "llvm.nvvm.read.ptx.sreg.tid.z";
    } else if (ends_with(name, ".__thread_id_w")) {
        return "llvm.nvvm.read.ptx.sreg.tid.w";
    } else if (ends_with(name, ".__block_id_x")) {
        return "llvm.nvvm.read.ptx.sreg.ctaid.x";
    } else if (ends_with(name, ".__block_id_y")) {
        return "llvm.nvvm.read.ptx.sreg.ctaid.y";
    } else if (ends_with(name, ".__block_id_z")) {
        return "llvm.nvvm.read.ptx.sreg.ctaid.z";
    } else if (ends_with(name, ".__block_id_w")) {
        return "llvm.nvvm.read.ptx.sreg.ctaid.w";
    }
    internal_error << "simt_intrinsic called on bad variable name\n";
    return "";
}

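// GPU loops over thread/block indices do not become actual loops: the loop
// variable is bound directly to the corresponding SIMT index while the body
// is generated.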
void CodeGen_PTX_Dev::visit(const For *loop) {
    if (is_gpu_var(loop->name)) {
        Expr simt_idx = Call::make(Int(32), simt_intrinsic(loop->name), std::vector<Expr>(), Call::Extern);
        internal_assert(is_zero(loop->min));
        sym_push(loop->name, codegen(simt_idx));
        codegen(loop->body);
        sym_pop(loop->name);
    } else {
        CodeGen_LLVM::visit(loop);
    }
}

void CodeGen_PTX_Dev::visit(const Allocate *alloc) {
    user_assert(!alloc->new_expr.defined()) << "Allocate node inside PTX kernel has custom new expression.\n"
                                            << "(Memoization is not supported inside GPU kernels at present.)\n";
    if (alloc->memory_type == MemoryType::GPUShared) {
        // PTX uses zero in address space 3 as the base address for shared memory
        Value *shared_base = Constant::getNullValue(PointerType::get(i8_t, 3));
        sym_push(alloc->name, shared_base);
    } else {
        debug(2) << "Allocate " << alloc->name << " on device\n";

        string allocation_name = alloc->name;
        debug(3) << "Pushing allocation called " << allocation_name << " onto the symbol table\n";

        // Jump back to the entry and generate an alloca. Note that by
        // jumping back we're rendering any expression we carry back
        // meaningless, so we had better only be dealing with
        // constants here.
        int32_t size = alloc->constant_allocation_size();
        internal_assert(size > 0)
            << "Allocation " << alloc->name << " has a dynamic size. "
            << "This should have been moved to the heap by the "
            << "fuse_gpu_thread_loops lowering pass.\n";

        BasicBlock *here = builder->GetInsertBlock();

        builder->SetInsertPoint(entry_block);
        Value *ptr = builder->CreateAlloca(llvm_type_of(alloc->type), ConstantInt::get(i32_t, size));
        builder->SetInsertPoint(here);
        sym_push(allocation_name, ptr);
    }
    codegen(alloc->body);
}

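// Nothing to deallocate on the device; just drop the symbol-table entry made
// by the matching Allocate node.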
void CodeGen_PTX_Dev::visit(const Free *f) {
    sym_pop(f->name);
}

void CodeGen_PTX_Dev::visit(const AssertStmt *op) {
    // Discard the error message for now.
    Expr trap = Call::make(Int(32), "halide_ptx_trap", {}, Call::Extern);
    codegen(IfThenElse::make(!op->condition, Evaluate::make(trap)));
}

void CodeGen_PTX_Dev::visit(const Load *op) {

    // Do aligned 4-wide 32-bit loads as a single i128 load.
    const Ramp *r = op->index.as<Ramp>();
    // TODO: lanes >= 4, not lanes == 4
    if (is_one(op->predicate) && r && is_one(r->stride) && r->lanes == 4 && op->type.bits() == 32) {
        ModulusRemainder align = op->alignment;
        if (align.modulus % 4 == 0 && align.remainder % 4 == 0) {
            Expr index = simplify(r->base / 4);
            Expr equiv = Load::make(UInt(128), op->name, index,
                                    op->image, op->param, const_true(), align / 4);
            equiv = reinterpret(op->type, equiv);
            codegen(equiv);
            return;
        }
    }

    CodeGen_LLVM::visit(op);
}

void CodeGen_PTX_Dev::visit(const Store *op) {
    // Issue an atomic store if we are inside an Atomic node.
    if (emit_atomic_stores) {
        user_assert(is_one(op->predicate)) << "Atomic update does not support predicated store.\n";
        user_assert(op->value.type().bits() >= 32) << "CUDA: 8-bit or 16-bit atomics are not supported.\n";
#if LLVM_VERSION < 90
        user_assert(op->value.type().is_scalar())
            << "CUDA atomic update does not support vectorization with LLVM version < 9.\n";
        // Generate nvvm intrinsics for the atomics if this is a float atomicAdd.
        // Otherwise defer to the llvm codegen. For LLVM 9.0 and later, atomicrmw
        // supports floats, so we can also defer to llvm there.
        // Half atomics are supported by compute capability 7.x or higher.
        if (op->value.type().is_float() &&
            (op->value.type().bits() == 32 ||
             (op->value.type().bits() == 64 &&
              (target.get_cuda_capability_lower_bound() >= 61)))) {
            Expr val_expr = op->value;
            Expr equiv_load = Load::make(op->value.type(), op->name, op->index, Buffer<>(), op->param, op->predicate, op->alignment);
            Expr delta = simplify(common_subexpression_elimination(op->value - equiv_load));
            // For atomicAdd, we check if op->value - store[index] is independent of store.
            bool is_atomic_add = !expr_uses_var(delta, op->name);
            if (is_atomic_add) {
                Value *ptr = codegen_buffer_pointer(op->name, op->value.type(), op->index);
                Value *val = codegen(delta);
                llvm::Function *intrin = nullptr;
                if (op->value.type().bits() == 32) {
                    intrin = module->getFunction("llvm.nvvm.atomic.load.add.f32.p0f32");
                    internal_assert(intrin) << "Could not find atomic intrinsic llvm.nvvm.atomic.load.add.f32.p0f32\n";
                } else {
                    internal_assert(op->value.type().bits() == 64);
                    intrin = module->getFunction("llvm.nvvm.atomic.load.add.f64.p0f64");
                    internal_assert(intrin) << "Could not find atomic intrinsic llvm.nvvm.atomic.load.add.f64.p0f64\n";
                }
                value = builder->CreateCall(intrin, {ptr, val});
                return;
            }
        }
#endif
    }

    // Do aligned 4-wide 32-bit stores as a single i128 store.
    const Ramp *r = op->index.as<Ramp>();
    // TODO: lanes >= 4, not lanes == 4
    if (is_one(op->predicate) && r && is_one(r->stride) && r->lanes == 4 && op->value.type().bits() == 32) {
        ModulusRemainder align = op->alignment;
        if (align.modulus % 4 == 0 && align.remainder % 4 == 0) {
            Expr index = simplify(r->base / 4);
            Expr value = reinterpret(UInt(128), op->value);
            Stmt equiv = Store::make(op->name, value, index, op->param, const_true(), align / 4);
            codegen(equiv);
            return;
        }
    }

    CodeGen_LLVM::visit(op);
}

void CodeGen_PTX_Dev::visit(const Atomic *op) {
    // CUDA requires all the threads in a warp to perform the same operations,
    // which means our mutex will lead to deadlock.
    user_assert(op->mutex_name.empty())
        << "The atomic update requires a mutex lock, which is not supported in CUDA.\n";

    // Issue atomic stores.
    ScopedValue<bool> old_emit_atomic_stores(emit_atomic_stores, true);
    CodeGen_LLVM::visit(op);
}

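// CUDA's dp4a/dp2a instructions compute 8-bit (or mixed 16-bit/8-bit) dot
// products that accumulate into a 32-bit integer. The pattern match below
// rewrites a suitable VectorReduce into PureExtern calls whose names encode
// the operand signednesses (e.g. "dp4a_s32_u32"). As a rough example, a
// reduction equivalent to something like
//   out(x) += cast<int32_t>(a(r)) * cast<int32_t>(b(r))
// with 8-bit a and b, vectorized by a multiple of 4, can take this path.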
void CodeGen_PTX_Dev::codegen_vector_reduce(const VectorReduce *op, const Expr &init) {
    // Pattern match 8/16-bit dot products

    const int input_lanes = op->value.type().lanes();
    const int factor = input_lanes / op->type.lanes();
    const Mul *mul = op->value.as<Mul>();
    if (op->op == VectorReduce::Add &&
        mul &&
        (factor % 4 == 0) &&
        (op->type.element_of() == Int(32) ||
         op->type.element_of() == UInt(32))) {
        Expr i = init;
        if (!i.defined()) {
            i = cast(mul->type, 0);
        }
        // Try to narrow the multiply args to 8-bit
        Expr a = mul->a, b = mul->b;
        if (op->type.is_uint()) {
            a = lossless_cast(UInt(8, input_lanes), a);
            b = lossless_cast(UInt(8, input_lanes), b);
        } else {
            a = lossless_cast(Int(8, input_lanes), a);
            b = lossless_cast(Int(8, input_lanes), b);
            if (!a.defined()) {
                // try uint
                a = lossless_cast(UInt(8, input_lanes), mul->a);
            }
            if (!b.defined()) {
                b = lossless_cast(UInt(8, input_lanes), mul->b);
            }
        }
        // If we only managed to narrow one of them, try to narrow the
        // other to 16-bit. Swap the args so that it's always 'a'.
        Expr a_orig = mul->a;
        if (a.defined() && !b.defined()) {
            std::swap(a, b);
            a_orig = mul->b;
        }
        if (b.defined() && !a.defined()) {
            // Try 16-bit instead
            a = lossless_cast(UInt(16, input_lanes), a_orig);
            if (!a.defined() && !op->type.is_uint()) {
                a = lossless_cast(Int(16, input_lanes), a_orig);
            }
        }

        if (a.defined() && b.defined()) {
            std::ostringstream ss;
            if (a.type().bits() == 8) {
                ss << "dp4a";
            } else {
                ss << "dp2a";
            }
            if (a.type().is_int()) {
                ss << "_s32";
            } else {
                ss << "_u32";
            }
            if (b.type().is_int()) {
                ss << "_s32";
            } else {
                ss << "_u32";
            }
            const int a_32_bit_words_per_sum = (factor * a.type().bits()) / 32;
            const int b_32_bit_words_per_sum = (factor * b.type().bits()) / 32;
            // Reinterpret a and b as 32-bit values with fewer
            // lanes. If they're aligned dense loads we should just do a
            // different load.
            for (Expr *e : {&a, &b}) {
                int sub_lanes = 32 / e->type().bits();
                const Load *load = e->as<Load>();
                const Ramp *idx = load ? load->index.as<Ramp>() : nullptr;
                if (idx &&
                    is_one(idx->stride) &&
                    load->alignment.modulus % sub_lanes == 0 &&
                    load->alignment.remainder % sub_lanes == 0) {
                    Expr new_idx = simplify(idx->base / sub_lanes);
                    int load_lanes = input_lanes / sub_lanes;
                    if (input_lanes > sub_lanes) {
                        new_idx = Ramp::make(new_idx, 1, load_lanes);
                    }
                    *e = Load::make(Int(32, load_lanes),
                                    load->name,
                                    new_idx,
                                    load->image,
                                    load->param,
                                    const_true(load_lanes),
                                    load->alignment / sub_lanes);
                } else {
                    *e = reinterpret(Int(32, input_lanes / sub_lanes), *e);
                }
            }
            string name = ss.str();
            vector<Expr> result;
            for (int l = 0; l < op->type.lanes(); l++) {
                // To compute a single lane of the output, we'll
                // extract the appropriate slice of the args, which
                // have been reinterpreted as 32-bit vectors, then
                // call either dp4a or dp2a the appropriate number of
                // times, and finally sum the result.
                Expr i_slice, a_slice, b_slice;
                if (i.type().is_scalar()) {
                    i_slice = i;
                } else {
                    i_slice = Shuffle::make_extract_element(i, l);
                }
                if (a.type().is_scalar()) {
                    a_slice = a;
                } else {
                    a_slice = Shuffle::make_slice(a, l * a_32_bit_words_per_sum, 1, a_32_bit_words_per_sum);
                }
                if (b.type().is_scalar()) {
                    b_slice = b;
                } else {
                    b_slice = Shuffle::make_slice(b, l * b_32_bit_words_per_sum, 1, b_32_bit_words_per_sum);
                }
                for (int i = 0; i < b_32_bit_words_per_sum; i++) {
                    if (a_slice.type().lanes() == b_slice.type().lanes()) {
                        Expr a_lane, b_lane;
                        if (b_slice.type().is_scalar()) {
                            a_lane = a_slice;
                            b_lane = b_slice;
                        } else {
                            a_lane = Shuffle::make_extract_element(a_slice, i);
                            b_lane = Shuffle::make_extract_element(b_slice, i);
                        }
                        i_slice = Call::make(i_slice.type(), name,
                                             {a_lane, b_lane, i_slice},
                                             Call::PureExtern);
                    } else {
                        internal_assert(a_slice.type().lanes() == 2 * b_slice.type().lanes());
                        Expr a_lane_lo, a_lane_hi, b_lane;
                        if (b_slice.type().is_scalar()) {
                            b_lane = b_slice;
                        } else {
                            b_lane = Shuffle::make_extract_element(b_slice, i);
                        }
                        a_lane_lo = Shuffle::make_extract_element(a_slice, 2 * i);
                        a_lane_hi = Shuffle::make_extract_element(a_slice, 2 * i + 1);
                        i_slice = Call::make(i_slice.type(), name,
                                             {a_lane_lo, a_lane_hi, b_lane, i_slice},
                                             Call::PureExtern);
                    }
                }
                i_slice = simplify(i_slice);
                i_slice = common_subexpression_elimination(i_slice);
                result.push_back(i_slice);
            }
            // Concatenate the per-lane results to get the full vector result
            Expr equiv = Shuffle::make_concat(result);
            equiv.accept(this);
            return;
        }
    }
    CodeGen_LLVM::codegen_vector_reduce(op, init);
}

string CodeGen_PTX_Dev::march() const {
    return "nvptx64";
}

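// Pick the sm_XX architecture string for the NVPTX backend based on the
// highest CUDACapability feature present in the Target.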
string CodeGen_PTX_Dev::mcpu() const {
    if (target.has_feature(Target::CUDACapability80)) {
        return "sm_80";
    } else if (target.has_feature(Target::CUDACapability75)) {
        return "sm_75";
    } else if (target.has_feature(Target::CUDACapability70)) {
        return "sm_70";
    } else if (target.has_feature(Target::CUDACapability61)) {
        return "sm_61";
    } else if (target.has_feature(Target::CUDACapability50)) {
        return "sm_50";
    } else if (target.has_feature(Target::CUDACapability35)) {
        return "sm_35";
    } else if (target.has_feature(Target::CUDACapability32)) {
        return "sm_32";
    } else if (target.has_feature(Target::CUDACapability30)) {
        return "sm_30";
    } else {
        return "sm_20";
    }
}

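// Pick the PTX ISA version attribute required by the requested compute
// capability.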
string CodeGen_PTX_Dev::mattrs() const {
    if (target.has_feature(Target::CUDACapability80)) {
        return "+ptx70";
    } else if (target.has_feature(Target::CUDACapability70) ||
               target.has_feature(Target::CUDACapability75)) {
        return "+ptx60";
    } else if (target.has_feature(Target::CUDACapability61)) {
        return "+ptx50";
    } else if (target.features_any_of({Target::CUDACapability32,
                                       Target::CUDACapability50})) {
        // Need ptx isa 4.0.
        return "+ptx40";
    } else {
        // Use the default. For llvm 3.5 it's ptx 3.2.
        return "";
    }
}

bool CodeGen_PTX_Dev::use_soft_float_abi() const {
    return false;
}

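// Run the LLVM optimization and NVPTX backend passes over the module and
// return the resulting PTX assembly text as a null-terminated character
// buffer.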
vector<char> CodeGen_PTX_Dev::compile_to_src() {

#ifdef WITH_NVPTX

    debug(2) << "In CodeGen_PTX_Dev::compile_to_src";

    // DISABLED - hooked in here to force PrintBeforeAll option - seems to be the only way?
    /*char* argv[] = { "llc", "-print-before-all" };*/
    /*int argc = sizeof(argv)/sizeof(char*);*/
    /*cl::ParseCommandLineOptions(argc, argv, "Halide PTX internal compiler\n");*/

    llvm::Triple triple(module->getTargetTriple());

    // Allocate target machine

    std::string err_str;
    const llvm::Target *llvm_target = TargetRegistry::lookupTarget(triple.str(), err_str);
    internal_assert(llvm_target) << err_str << "\n";

    TargetOptions options;
#if LLVM_VERSION < 120
    options.PrintMachineCode = false;
#endif
    options.AllowFPOpFusion = FPOpFusion::Fast;
    options.UnsafeFPMath = true;
    options.NoInfsFPMath = true;
    options.NoNaNsFPMath = true;
    options.HonorSignDependentRoundingFPMathOption = false;
    options.NoZerosInBSS = false;
    options.GuaranteedTailCallOpt = false;
    options.StackAlignmentOverride = 0;

    std::unique_ptr<TargetMachine>
        target_machine(llvm_target->createTargetMachine(triple.str(),
                                                        mcpu(), mattrs(), options,
                                                        llvm::Reloc::PIC_,
                                                        llvm::CodeModel::Small,
                                                        CodeGenOpt::Aggressive));

    internal_assert(target_machine.get()) << "Could not allocate target machine!";

    module->setDataLayout(target_machine->createDataLayout());

    // Set up passes
    llvm::SmallString<8> outstr;
    raw_svector_ostream ostream(outstr);
    ostream.SetUnbuffered();

    legacy::FunctionPassManager function_pass_manager(module.get());
    legacy::PassManager module_pass_manager;

    module_pass_manager.add(createTargetTransformInfoWrapperPass(target_machine->getTargetIRAnalysis()));
    function_pass_manager.add(createTargetTransformInfoWrapperPass(target_machine->getTargetIRAnalysis()));

    // NVidia's libdevice library uses __nvvm_reflect to choose
    // how to handle denormalized numbers. (The pass replaces calls
    // to __nvvm_reflect with a constant via a map lookup. The inliner
    // pass then resolves these situations to fast code, often a single
    // instruction per decision point.)
    //
    // The default is (more) IEEE-like handling. FTZ mode flushes them
    // to zero. (This may only apply to single-precision.)
    //
    // The libdevice documentation covers other options for math accuracy
    // such as replacing division with multiply by the reciprocal and
    // use of fused-multiply-add, but they do not seem to be controlled
    // by this __nvvm_reflect mechanism and may be flags to earlier compiler
    // passes.
    const int kFTZDenorms = 1;

    // Insert a module flag for the FTZ handling.
    module->addModuleFlag(llvm::Module::Override, "nvvm-reflect-ftz",
                          kFTZDenorms);

    if (kFTZDenorms) {
        for (llvm::Function &fn : *module) {
            fn.addFnAttr("nvptx-f32ftz", "true");
        }
    }

    // At present, we default to *enabling* LLVM loop optimization,
    // unless DisableLLVMLoopOpt is set; we're going to flip this to defaulting
    // to *not* enabling these optimizations (and removing the DisableLLVMLoopOpt feature).
    // See https://github.com/halide/Halide/issues/4113 for more info.
    // (Note that setting EnableLLVMLoopOpt always enables loop opt, regardless
    // of the setting of DisableLLVMLoopOpt.)
    const bool do_loop_opt = !target.has_feature(Target::DisableLLVMLoopOpt) ||
                             target.has_feature(Target::EnableLLVMLoopOpt);

    PassManagerBuilder b;
    b.OptLevel = 3;
    b.Inliner = createFunctionInliningPass(b.OptLevel, 0, false);
    b.LoopVectorize = do_loop_opt;
    b.SLPVectorize = true;
    b.DisableUnrollLoops = !do_loop_opt;

    target_machine->adjustPassManager(b);

    b.populateFunctionPassManager(function_pass_manager);
    b.populateModulePassManager(module_pass_manager);

    // Override default to generate verbose assembly.
    target_machine->Options.MCOptions.AsmVerbose = true;

    // Output string stream

    // Ask the target to add backend passes as necessary.
    bool fail = target_machine->addPassesToEmitFile(module_pass_manager, ostream, nullptr,
#if LLVM_VERSION >= 100
                                                    ::llvm::CGFT_AssemblyFile,
#else
                                                    TargetMachine::CGFT_AssemblyFile,
#endif
                                                    true);
    if (fail) {
        internal_error << "Failed to set up passes to emit PTX source\n";
    }

    // Run optimization passes
    function_pass_manager.doInitialization();
    for (llvm::Module::iterator i = module->begin(); i != module->end(); i++) {
        function_pass_manager.run(*i);
    }
    function_pass_manager.doFinalization();
    module_pass_manager.run(*module);

    if (debug::debug_level() >= 2) {
        dump();
    }
    debug(2) << "Done with CodeGen_PTX_Dev::compile_to_src";

    debug(1) << "PTX kernel:\n"
             << outstr.c_str() << "\n";

    vector<char> buffer(outstr.begin(), outstr.end());

    // Dump the SASS too if the cuda SDK is in the path
    if (debug::debug_level() >= 2) {
        debug(2) << "Compiling PTX to SASS. Will fail if CUDA SDK is not installed (and in the path).\n";

        TemporaryFile ptx(get_current_kernel_name(), ".ptx");
        TemporaryFile sass(get_current_kernel_name(), ".sass");

        std::ofstream f(ptx.pathname());
        f.write(buffer.data(), buffer.size());
        f.close();

        string cmd = "ptxas --gpu-name " + mcpu() + " " + ptx.pathname() + " -o " + sass.pathname();
        if (system(cmd.c_str()) == 0) {
            cmd = "nvdisasm " + sass.pathname();
            int ret = system(cmd.c_str());
            (void)ret;  // Don't care if it fails
        }

        // Note: It works to embed the contents of the .sass file in
        // the buffer instead of the ptx source, and this could help
        // with app startup times. Expose via the target?
        /*
        {
            std::ifstream f(sass.pathname());
            buffer.clear();
            f.seekg(0, std::ios_base::end);
            std::streampos sz = f.tellg();
            buffer.resize(sz);
            f.seekg(0, std::ios_base::beg);
            f.read(buffer.data(), sz);
        }
        */
    }

    // Null-terminate the ptx source
    buffer.push_back(0);
    return buffer;
#else  // WITH_NVPTX
    return vector<char>();
#endif
}

int CodeGen_PTX_Dev::native_vector_bits() const {
    // PTX doesn't really do vectorization. The widest type is a double.
    return 64;
}

string CodeGen_PTX_Dev::get_current_kernel_name() {
    return get_llvm_function_name(function);
}

void CodeGen_PTX_Dev::dump() {
    module->print(dbgs(), nullptr, false, true);
}

std::string CodeGen_PTX_Dev::print_gpu_name(const std::string &name) {
    return name;
}

bool CodeGen_PTX_Dev::supports_atomic_add(const Type &t) const {
    if (t.bits() < 32) {
        // TODO: Half atomics are supported by compute capability 7.x or higher.
        return false;
    }
    if (t.is_int_or_uint()) {
        return true;
    }
    if (t.is_float() && t.bits() == 32) {
        return true;
    }
    if (t.is_float() && t.bits() == 64) {
        // double atomics are supported since CC6.1
        return target.get_cuda_capability_lower_bound() >= 61;
    }
    return false;
}

}  // namespace Internal
}  // namespace Halide