#include <sstream>

#include "CodeGen_D3D12Compute_Dev.h"
#include "CodeGen_GPU_Host.h"
#include "CodeGen_Internal.h"
#include "CodeGen_Metal_Dev.h"
#include "CodeGen_OpenCL_Dev.h"
#include "CodeGen_OpenGLCompute_Dev.h"
#include "CodeGen_OpenGL_Dev.h"
#include "CodeGen_PTX_Dev.h"
#include "Debug.h"
#include "DeviceArgument.h"
#include "ExprUsesVar.h"
#include "IROperator.h"
#include "IRPrinter.h"
#include "LLVM_Headers.h"
#include "Simplify.h"
#include "Util.h"
#include "VaryingAttributes.h"

namespace Halide {
namespace Internal {

using std::map;
using std::pair;
using std::string;
using std::vector;

using namespace llvm;

// Sniff the contents of a kernel to extract the bounds of all the
// thread indices (so we know how many threads to launch), and the
// amount of shared memory to allocate.
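//
// As an illustration (a hypothetical schedule; the loop names are made up),
// a GPU loop nest of the form
//
//   for (f.s0.y.__block_id_y, 0, 8) {
//     for (f.s0.x.__block_id_x, 0, 16) {
//       allocate tmp[float32 * 256] in GPUShared
//       for (f.s0.y.__thread_id_y, 0, 16) {
//         for (f.s0.x.__thread_id_x, 0, 32) { ... }
//       }
//     }
//   }
//
// would produce num_blocks = {16, 8, 1, 1}, num_threads = {32, 16, 1, 1},
// and shared_mem_size = 256 * 4 = 1024 bytes.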
class ExtractBounds : public IRVisitor {
public:
    Expr num_threads[4];
    Expr num_blocks[4];
    Expr shared_mem_size;

    ExtractBounds()
        : shared_mem_size(0), found_shared(false) {
        for (int i = 0; i < 4; i++) {
            num_threads[i] = num_blocks[i] = 1;
        }
    }

private:
    bool found_shared;

    using IRVisitor::visit;

    void visit(const For *op) override {
        if (CodeGen_GPU_Dev::is_gpu_var(op->name)) {
            internal_assert(is_zero(op->min));
        }

        if (ends_with(op->name, ".__thread_id_x")) {
            num_threads[0] = op->extent;
        } else if (ends_with(op->name, ".__thread_id_y")) {
            num_threads[1] = op->extent;
        } else if (ends_with(op->name, ".__thread_id_z")) {
            num_threads[2] = op->extent;
        } else if (ends_with(op->name, ".__thread_id_w")) {
            num_threads[3] = op->extent;
        } else if (ends_with(op->name, ".__block_id_x")) {
            num_blocks[0] = op->extent;
        } else if (ends_with(op->name, ".__block_id_y")) {
            num_blocks[1] = op->extent;
        } else if (ends_with(op->name, ".__block_id_z")) {
            num_blocks[2] = op->extent;
        } else if (ends_with(op->name, ".__block_id_w")) {
            num_blocks[3] = op->extent;
        }

        op->body.accept(this);
    }

    void visit(const LetStmt *op) override {
        if (expr_uses_var(shared_mem_size, op->name)) {
            shared_mem_size = Let::make(op->name, op->value, shared_mem_size);
        }
        op->body.accept(this);
    }

    void visit(const Allocate *allocate) override {
        user_assert(!allocate->new_expr.defined()) << "Allocate node inside GPU kernel has custom new expression.\n"
                                                   << "(Memoization is not supported inside GPU kernels at present.)\n";

        if (allocate->memory_type == MemoryType::GPUShared) {
            internal_assert(allocate->extents.size() == 1);
            shared_mem_size += allocate->extents[0] * allocate->type.bytes();
            found_shared = true;
        }
        allocate->body.accept(this);
    }
};

template<typename CodeGen_CPU>
CodeGen_GPU_Host<CodeGen_CPU>::CodeGen_GPU_Host(Target target)
    : CodeGen_CPU(target) {
    // For the default GPU, the order of preferences is: Metal,
    // OpenCL, CUDA, OpenGLCompute, and OpenGL last.
    // The code is in reverse order to allow later tests to override
    // earlier ones.
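    //
    // For example, with a hypothetical target string like "host-cuda-opencl",
    // both a CodeGen_PTX_Dev and a CodeGen_OpenCL_Dev are constructed and
    // stored in cgdev; each GPU loop is later dispatched to
    // cgdev[loop->device_api] in visit(const For *) below.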
    if (target.has_feature(Target::OpenGL)) {
        debug(1) << "Constructing OpenGL device codegen\n";
        cgdev[DeviceAPI::GLSL] = new CodeGen_OpenGL_Dev(target);
    }
    if (target.has_feature(Target::OpenGLCompute)) {
        debug(1) << "Constructing OpenGL Compute device codegen\n";
        cgdev[DeviceAPI::OpenGLCompute] = new CodeGen_OpenGLCompute_Dev(target);
    }
    if (target.has_feature(Target::CUDA)) {
        debug(1) << "Constructing CUDA device codegen\n";
        cgdev[DeviceAPI::CUDA] = new CodeGen_PTX_Dev(target);
    }
    if (target.has_feature(Target::OpenCL)) {
        debug(1) << "Constructing OpenCL device codegen\n";
        cgdev[DeviceAPI::OpenCL] = new CodeGen_OpenCL_Dev(target);
    }
    if (target.has_feature(Target::Metal)) {
        debug(1) << "Constructing Metal device codegen\n";
        cgdev[DeviceAPI::Metal] = new CodeGen_Metal_Dev(target);
    }
    if (target.has_feature(Target::D3D12Compute)) {
        debug(1) << "Constructing Direct3D 12 Compute device codegen\n";
        cgdev[DeviceAPI::D3D12Compute] = new CodeGen_D3D12Compute_Dev(target);
    }

    if (cgdev.empty()) {
        internal_error << "Requested unknown GPU target: " << target.to_string() << "\n";
    }
}

template<typename CodeGen_CPU>
CodeGen_GPU_Host<CodeGen_CPU>::~CodeGen_GPU_Host() {
    for (pair<const DeviceAPI, CodeGen_GPU_Dev *> &i : cgdev) {
        delete i.second;
    }
}

template<typename CodeGen_CPU>
void CodeGen_GPU_Host<CodeGen_CPU>::compile_func(const LoweredFunc &f,
                                                 const std::string &simple_name,
                                                 const std::string &extern_name) {
    function_name = simple_name;

    // Create a new module for all of the kernels we find in this function.
    for (pair<const DeviceAPI, CodeGen_GPU_Dev *> &i : cgdev) {
        i.second->init_module();
    }

    // Call the base implementation to create the function.
    CodeGen_CPU::compile_func(f, simple_name, extern_name);

    // We need to insert code after the existing entry block, so that
    // the destructor stack slots exist before we do the assertions
    // involved in initializing gpu kernels.
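    //
    // Sketch of the control-flow rewrite performed below (block names as
    // created in this function):
    //
    //   before:  entry -> post_entry                  (after splitting)
    //   after:   entry -> init_kernels -> post_entry  (after rewiring)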

    // Split the entry block just before its end.
    BasicBlock *entry = &function->getEntryBlock();
    llvm::Instruction *terminator = entry->getTerminator();
    internal_assert(terminator);
    BasicBlock *post_entry = entry->splitBasicBlock(terminator);

    // Create some code that does the GPU initialization.
    BasicBlock *init_kernels_bb = BasicBlock::Create(*context, "init_kernels",
                                                     function, post_entry);

    // The entry block should go to the init kernels block instead of
    // the post entry block.
    entry->getTerminator()->eraseFromParent();
    builder->SetInsertPoint(entry);
    builder->CreateBr(init_kernels_bb);

    // Fill out the init kernels block
    builder->SetInsertPoint(init_kernels_bb);

    for (pair<const DeviceAPI, CodeGen_GPU_Dev *> &i : cgdev) {

        CodeGen_GPU_Dev *gpu_codegen = i.second;
        std::string api_unique_name = gpu_codegen->api_unique_name();

        // If the module state for this API/function did not get created, there were
        // no kernels using this API.
        llvm::Value *module_state = get_module_state(api_unique_name, false);
        if (!module_state) {
            continue;
        }

        debug(2) << "Generating init_kernels for " << api_unique_name << "\n";

        std::vector<char> kernel_src = gpu_codegen->compile_to_src();

        Value *kernel_src_ptr =
            CodeGen_CPU::create_binary_blob(kernel_src,
                                            "halide_" + function_name + "_" + api_unique_name + "_kernel_src");

        if (f.args[0].name == "__user_context") {
            // The user context is the first argument of the function.
            // We retrieve it here so it's available for subsequent calls of
            // get_user_context().
            sym_push("__user_context", iterator_to_pointer(function->arg_begin()));
        }

        Value *user_context = get_user_context();
        Value *kernel_size = ConstantInt::get(i32_t, kernel_src.size());
        std::string init_kernels_name = "halide_" + api_unique_name + "_initialize_kernels";
        llvm::Function *init = module->getFunction(init_kernels_name);
        internal_assert(init) << "Could not find function " + init_kernels_name + " in initial module\n";
        vector<Value *> init_kernels_args = {user_context, module_state, kernel_src_ptr, kernel_size};
        Value *result = builder->CreateCall(init, init_kernels_args);
        Value *did_succeed = builder->CreateICmpEQ(result, ConstantInt::get(i32_t, 0));
        CodeGen_CPU::create_assertion(did_succeed, Expr(), result);
    }
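    // The loop above emits, for each device API that actually has kernels,
    // roughly the following call into the Halide runtime (a sketch; the
    // failure path goes through create_assertion above):
    //
    //   int result = halide_<api>_initialize_kernels(user_context, module_state,
    //                                                kernel_src, kernel_src.size());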

    // the init kernels block should branch to the post-entry block
    builder->CreateBr(post_entry);

    function_name = "";
}

template<typename CodeGen_CPU>
void CodeGen_GPU_Host<CodeGen_CPU>::visit(const For *loop) {
    if (CodeGen_GPU_Dev::is_gpu_var(loop->name)) {
        // We're in the loop over the outermost block dimension
        debug(2) << "Kernel launch: " << loop->name << "\n";

        internal_assert(loop->device_api != DeviceAPI::Default_GPU)
            << "A concrete device API should have been selected before codegen.";

        ExtractBounds bounds;
        loop->accept(&bounds);

        debug(2) << "Kernel bounds: ("
                 << bounds.num_threads[0] << ", "
                 << bounds.num_threads[1] << ", "
                 << bounds.num_threads[2] << ", "
                 << bounds.num_threads[3] << ") threads, ("
                 << bounds.num_blocks[0] << ", "
                 << bounds.num_blocks[1] << ", "
                 << bounds.num_blocks[2] << ", "
                 << bounds.num_blocks[3] << ") blocks\n";

        // compile the kernel
        string kernel_name = unique_name("kernel_" + loop->name);
        for (size_t i = 0; i < kernel_name.size(); i++) {
            if (!isalnum(kernel_name[i])) {
                kernel_name[i] = '_';
            }
        }

        Value *null_float_ptr = ConstantPointerNull::get(CodeGen_LLVM::f32_t->getPointerTo());
        Value *zero_int32 = codegen(Expr(cast<int>(0)));

        Value *gpu_num_padded_attributes = zero_int32;
        Value *gpu_vertex_buffer = null_float_ptr;
        Value *gpu_num_coords_dim0 = zero_int32;
        Value *gpu_num_coords_dim1 = zero_int32;

        if (loop->device_api == DeviceAPI::GLSL) {

            // GL draw calls that invoke the GLSL shader are issued for pairs of
            // for-loops over spatial x and y dimensions. For each for-loop we create
            // one scalar vertex attribute for the spatial dimension corresponding to
            // that loop, plus one scalar attribute for each expression previously
            // labeled as "glsl_varying".

            // Pass variables created during setup_gpu_vertex_buffer to the
            // dev run function call.
            gpu_num_padded_attributes = codegen(Variable::make(Int(32), "glsl.num_padded_attributes"));
            gpu_num_coords_dim0 = codegen(Variable::make(Int(32), "glsl.num_coords_dim0"));
            gpu_num_coords_dim1 = codegen(Variable::make(Int(32), "glsl.num_coords_dim1"));

            // Look up the allocation for the vertex buffer and cast it to the
            // right type
            gpu_vertex_buffer = codegen(Variable::make(type_of<float *>(), "glsl.vertex_buffer"));
            gpu_vertex_buffer = builder->CreatePointerCast(gpu_vertex_buffer,
                                                           CodeGen_LLVM::f32_t->getPointerTo());
        }

        // compute a closure over the state passed into the kernel
        HostClosure c(loop->body, loop->name);

        // Determine the arguments that must be passed into the halide function
        vector<DeviceArgument> closure_args = c.arguments();

        // Sort the args by the size of the underlying type. This is
        // helpful for avoiding struct-packing ambiguities in metal,
        // which passes the scalar args as a struct.
        std::sort(closure_args.begin(), closure_args.end(),
                  [](const DeviceArgument &a, const DeviceArgument &b) {
                      if (a.is_buffer == b.is_buffer) {
                          return a.type.bits() > b.type.bits();
                      } else {
                          // Ensure that buffer arguments come first:
                          // for many OpenGL/Compute systems, the
                          // legal indices for buffer args are much
                          // more restrictive than for scalar args,
                          // and scalar args can be 'grown' by
                          // LICM. Putting buffers first makes it much
                          // more likely we won't fail on some
                          // hardware.
                          return a.is_buffer > b.is_buffer;
                      }
                  });
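
        // For illustration (a hypothetical closure): after this sort, a closure of
        //   { in.buffer, out.buffer, float alpha, int32 extent, uint8 flag }
        // ends up ordered with buffers first, then scalars by decreasing bit
        // width, e.g.
        //   { in.buffer, out.buffer, alpha (32 bits), extent (32 bits), flag (8 bits) }.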

        // Halide allows passing of scalar float and integer arguments. For
        // OpenGL, pack these into vec4 uniforms and varying attributes
        if (loop->device_api == DeviceAPI::GLSL) {

            int num_uniform_floats = 0;

            // The spatial x and y coordinates are passed in the first two
            // scalar float varying slots
            int num_varying_floats = 2;
            int num_uniform_ints = 0;

            // Pack scalar parameters into vec4
            for (size_t i = 0; i < closure_args.size(); i++) {
                if (closure_args[i].is_buffer) {
                    continue;
                } else if (ends_with(closure_args[i].name, ".varying")) {
                    closure_args[i].packed_index = num_varying_floats++;
                } else if (closure_args[i].type.is_float()) {
                    closure_args[i].packed_index = num_uniform_floats++;
                } else if (closure_args[i].type.is_int()) {
                    closure_args[i].packed_index = num_uniform_ints++;
                }
            }
        }
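
        // A sketch of the effect of the packing above: varying expressions get
        // packed_index 2, 3, ... (slots 0 and 1 hold the spatial x/y
        // coordinates), while scalar float uniforms and scalar int uniforms
        // each get their own independent 0-based packed_index sequence.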

        for (size_t i = 0; i < closure_args.size(); i++) {
            if (closure_args[i].is_buffer && allocations.contains(closure_args[i].name)) {
                closure_args[i].size = allocations.get(closure_args[i].name).constant_bytes;
            }
        }

        CodeGen_GPU_Dev *gpu_codegen = cgdev[loop->device_api];
        user_assert(gpu_codegen != nullptr)
            << "Loop is scheduled on device " << loop->device_api
            << " which does not appear in target " << target.to_string() << "\n";
        gpu_codegen->add_kernel(loop, kernel_name, closure_args);

        // get the actual name of the generated kernel for this loop
        kernel_name = gpu_codegen->get_current_kernel_name();
        debug(2) << "Compiled launch to kernel \"" << kernel_name << "\"\n";
        Value *entry_name_str = builder->CreateGlobalStringPtr(kernel_name, "entry_name");

        llvm::Type *target_size_t_type = (target.bits == 32) ? i32_t : i64_t;

        // build the kernel arguments array
        llvm::PointerType *arg_t = i8_t->getPointerTo();  // void*
        int num_args = (int)closure_args.size();

        // nullptr-terminated list
        llvm::Type *gpu_args_arr_type = ArrayType::get(arg_t, num_args + 1);
        Value *gpu_args_arr =
            create_alloca_at_entry(
                gpu_args_arr_type,
                1, false,
                kernel_name + "_args");

        // nullptr-terminated list of size_t's
        llvm::Type *gpu_arg_sizes_arr_type = ArrayType::get(target_size_t_type, num_args + 1);
        llvm::ArrayType *gpu_arg_types_arr_type = ArrayType::get(type_t_type, num_args + 1);
        vector<Constant *> arg_types_array_entries;

        std::string api_unique_name = gpu_codegen->api_unique_name();

        Value *gpu_arg_sizes_arr = nullptr;
        bool runtime_run_takes_types = gpu_codegen->kernel_run_takes_types();

        if (!runtime_run_takes_types) {
            gpu_arg_sizes_arr =
                create_alloca_at_entry(
                    gpu_arg_sizes_arr_type,
                    1, false,
                    kernel_name + "_arg_sizes");
        }

        llvm::Type *gpu_arg_is_buffer_arr_type = ArrayType::get(i8_t, num_args + 1);
        Value *gpu_arg_is_buffer_arr =
            create_alloca_at_entry(
                gpu_arg_is_buffer_arr_type,
                1, false,
                kernel_name + "_arg_is_buffer");

        for (int i = 0; i < num_args; i++) {
            // get the closure argument
            string name = closure_args[i].name;
            Value *val;

            if (closure_args[i].is_buffer) {
                // If it's a buffer, get the .buffer symbol
                val = sym_get(name + ".buffer");
            } else if (ends_with(name, ".varying")) {
                // Expressions for varying attributes are passed in the
                // expression mesh. Pass a non-nullptr value in the argument array
                // to keep it in sync with the argument names encoded in the
                // shader header
                val = ConstantInt::get(target_size_t_type, 1);
            } else {
                // Otherwise just look up the symbol
                val = sym_get(name);
            }

            if (!closure_args[i].is_buffer) {
                // allocate stack space to mirror the closure element. It
                // might be in a register and we need a pointer to it for
                // the gpu args array.
                Value *ptr = create_alloca_at_entry(val->getType(), 1, false, name + ".stack");
                // store the closure value into the stack space
                builder->CreateStore(val, ptr);
                val = ptr;
            }

            // store a void * pointer to the argument into the gpu_args_arr
            Value *bits = builder->CreateBitCast(val, arg_t);
            builder->CreateStore(bits,
                                 builder->CreateConstGEP2_32(
                                     gpu_args_arr_type,
                                     gpu_args_arr,
                                     0,
                                     i));

            if (runtime_run_takes_types) {
                Constant *arg_type_fields[] = {
                    ConstantInt::get(i8_t, closure_args[i].type.code()),
                    ConstantInt::get(i8_t, closure_args[i].type.bits()),
                    ConstantInt::get(i16_t, 1)};
                arg_types_array_entries.push_back(ConstantStruct::get(type_t_type, arg_type_fields));
            } else {
                // store the size of the argument.
                int size_bytes = (closure_args[i].is_buffer) ? 8 : closure_args[i].type.bytes();
                builder->CreateStore(ConstantInt::get(target_size_t_type, size_bytes),
                                     builder->CreateConstGEP2_32(
                                         gpu_arg_sizes_arr_type,
                                         gpu_arg_sizes_arr,
                                         0,
                                         i));
            }

            builder->CreateStore(ConstantInt::get(i8_t, closure_args[i].is_buffer),
                                 builder->CreateConstGEP2_32(
                                     gpu_arg_is_buffer_arr_type,
                                     gpu_arg_is_buffer_arr,
                                     0,
                                     i));
        }
        // nullptr-terminate the lists
        builder->CreateStore(ConstantPointerNull::get(arg_t),
                             builder->CreateConstGEP2_32(
                                 gpu_args_arr_type,
                                 gpu_args_arr,
                                 0,
                                 num_args));
        if (runtime_run_takes_types) {
            Constant *arg_type_fields[] = {
                ConstantInt::get(i8_t, 0),
                ConstantInt::get(i8_t, 0),
                ConstantInt::get(i16_t, 0)};
            arg_types_array_entries.push_back(ConstantStruct::get(type_t_type, arg_type_fields));
        } else {
            builder->CreateStore(ConstantInt::get(target_size_t_type, 0),
                                 builder->CreateConstGEP2_32(
                                     gpu_arg_sizes_arr_type,
                                     gpu_arg_sizes_arr,
                                     0,
                                     num_args));
        }
        builder->CreateStore(ConstantInt::get(i8_t, 0),
                             builder->CreateConstGEP2_32(
                                 gpu_arg_is_buffer_arr_type,
                                 gpu_arg_is_buffer_arr,
                                 0,
                                 num_args));
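
        // Layout sketch for a hypothetical two-argument kernel (one buffer
        // "in", one int32 scalar "n"), assuming a run API that takes sizes
        // rather than types:
        //   gpu_args_arr          = { in.buffer, &n (stack copy), nullptr }
        //   gpu_arg_sizes_arr     = { 8, 4, 0 }
        //   gpu_arg_is_buffer_arr = { 1, 0, 0 }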

        GlobalVariable *arg_types_array_storage = nullptr;
        if (runtime_run_takes_types) {
            arg_types_array_storage = new GlobalVariable(
                *module,
                gpu_arg_types_arr_type,
                /*isConstant*/ true,
                GlobalValue::PrivateLinkage,
                ConstantArray::get(gpu_arg_types_arr_type, arg_types_array_entries));
        }

        // TODO: only three dimensions can be passed to
        // cuLaunchKernel. How should we handle blkid[3]?
        internal_assert(is_one(bounds.num_threads[3]) && is_one(bounds.num_blocks[3]))
            << bounds.num_threads[3] << ", " << bounds.num_blocks[3] << "\n";
        debug(4) << "CodeGen_GPU_Host get_user_context returned " << get_user_context() << "\n";
        debug(3) << "bounds.num_blocks[0] = " << bounds.num_blocks[0] << "\n";
        debug(3) << "bounds.num_blocks[1] = " << bounds.num_blocks[1] << "\n";
        debug(3) << "bounds.num_blocks[2] = " << bounds.num_blocks[2] << "\n";
        debug(3) << "bounds.num_threads[0] = " << bounds.num_threads[0] << "\n";
        debug(3) << "bounds.num_threads[1] = " << bounds.num_threads[1] << "\n";
        debug(3) << "bounds.num_threads[2] = " << bounds.num_threads[2] << "\n";

        Constant *zero = ConstantInt::get(i32_t, 0);
        Value *zeros[] = {zero, zero};

        // Order-of-evaluation is guaranteed to be in order in brace-init-lists,
        // so the multiple calls to codegen here are fine
        Value *launch_args[] = {
            get_user_context(),
            builder->CreateLoad(get_module_state(api_unique_name)),
            entry_name_str,
            codegen(bounds.num_blocks[0]),
            codegen(bounds.num_blocks[1]),
            codegen(bounds.num_blocks[2]),
            codegen(bounds.num_threads[0]),
            codegen(bounds.num_threads[1]),
            codegen(bounds.num_threads[2]),
            codegen(bounds.shared_mem_size),
            runtime_run_takes_types ? ConstantExpr::getInBoundsGetElementPtr(gpu_arg_types_arr_type, arg_types_array_storage, zeros) : builder->CreateConstGEP2_32(gpu_arg_sizes_arr_type, gpu_arg_sizes_arr, 0, 0, "gpu_arg_sizes_ar_ref" + api_unique_name),
            builder->CreateConstGEP2_32(
                gpu_args_arr_type,
                gpu_args_arr,
                0,
                0,
                "gpu_args_arr_ref" + api_unique_name),
            builder->CreateConstGEP2_32(
                gpu_arg_is_buffer_arr_type,
                gpu_arg_is_buffer_arr,
                0,
                0,
                "gpu_arg_is_buffer_ref" + api_unique_name),
            gpu_num_padded_attributes,
            gpu_vertex_buffer,
            gpu_num_coords_dim0,
            gpu_num_coords_dim1,
        };
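
        // The launch_args above line up, in order, with the parameters of the
        // runtime entry point halide_<api>_run resolved below: user context,
        // loaded module state, kernel entry name, block extents (x, y, z),
        // thread extents (x, y, z), shared memory bytes, the argument size
        // (or type) array, the argument pointer array, the is-buffer flags,
        // and the four GLSL-only vertex-buffer parameters. (A descriptive
        // note; the authoritative signature lives in the runtime headers.)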
        std::string run_fn_name = "halide_" + api_unique_name + "_run";
        llvm::Function *dev_run_fn = module->getFunction(run_fn_name);
        internal_assert(dev_run_fn) << "Could not find " << run_fn_name << " in module\n";
        Value *result = builder->CreateCall(dev_run_fn, launch_args);
        Value *did_succeed = builder->CreateICmpEQ(result, ConstantInt::get(i32_t, 0));

        CodeGen_CPU::create_assertion(did_succeed,
                                      // Should have already called halide_error inside the gpu runtime
                                      halide_error_code_device_run_failed,
                                      result);
    } else {
        CodeGen_CPU::visit(loop);
    }
}

template<typename CodeGen_CPU>
Value *CodeGen_GPU_Host<CodeGen_CPU>::get_module_state(const std::string &api_unique_name,
                                                       bool create) {
    std::string name = "module_state_" + function_name + "_" + api_unique_name;
    GlobalVariable *module_state = module->getGlobalVariable(name, true);
    if (!module_state && create) {
        // Create a global variable to hold the module state
        PointerType *void_ptr_type = llvm::Type::getInt8PtrTy(*context);
        module_state = new GlobalVariable(*module, void_ptr_type,
                                          false, GlobalVariable::InternalLinkage,
                                          ConstantPointerNull::get(void_ptr_type),
                                          name);
        debug(4) << "Created device module state global variable\n";
    }

    return module_state;
}

// Force template instantiation.
#ifdef WITH_X86
template class CodeGen_GPU_Host<CodeGen_X86>;
#endif

#if defined(WITH_ARM) || defined(WITH_AARCH64)
template class CodeGen_GPU_Host<CodeGen_ARM>;
#endif

#ifdef WITH_MIPS
template class CodeGen_GPU_Host<CodeGen_MIPS>;
#endif

#ifdef WITH_POWERPC
template class CodeGen_GPU_Host<CodeGen_PowerPC>;
#endif

#ifdef WITH_WEBASSEMBLY
template class CodeGen_GPU_Host<CodeGen_WebAssembly>;
#endif

#ifdef WITH_RISCV
template class CodeGen_GPU_Host<CodeGen_RISCV>;
#endif

}  // namespace Internal
}  // namespace Halide