#include "CodeGen_OpenGLCompute_Dev.h"
#include "Debug.h"
#include "Deinterleave.h"
#include "IRMatch.h"
#include "IRMutator.h"
#include "IROperator.h"
#include "Simplify.h"
#include "VaryingAttributes.h"
#include <iomanip>
#include <limits>
#include <map>

namespace Halide {
namespace Internal {

using std::ostringstream;
using std::string;
using std::vector;

CodeGen_OpenGLCompute_Dev::CodeGen_OpenGLCompute_Dev(Target target)
    : glc(src_stream, target) {
}

CodeGen_OpenGLCompute_Dev::CodeGen_OpenGLCompute_C::CodeGen_OpenGLCompute_C(std::ostream &s, Target t)
    : CodeGen_GLSLBase(s, t) {
    builtin["trunc_f32"] = "trunc";
}

string CodeGen_OpenGLCompute_Dev::CodeGen_OpenGLCompute_C::print_type(Type type, AppendSpaceIfNeeded space) {
    Type mapped_type = map_type(type);
    if (mapped_type.is_uint() && !mapped_type.is_bool()) {
        string s = mapped_type.is_scalar() ? "uint" : "uvec" + std::to_string(mapped_type.lanes());
        if (space == AppendSpace) {
            s += " ";
        }
        return s;
    } else {
        return CodeGen_GLSLBase::print_type(type, space);
    }
}

namespace {
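// Maps a named GPU loop variable to the GLSL compute-shader built-in that
// supplies its value, e.g. "f.s0.x.__thread_id_x" -> "gl_LocalInvocationID.x"
// and "f.s0.x.__block_id_x" -> "gl_WorkGroupID.x".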
string simt_intrinsic(const string &name) {
    if (ends_with(name, ".__thread_id_x")) {
        return "gl_LocalInvocationID.x";
    } else if (ends_with(name, ".__thread_id_y")) {
        return "gl_LocalInvocationID.y";
    } else if (ends_with(name, ".__thread_id_z")) {
        return "gl_LocalInvocationID.z";
    } else if (ends_with(name, ".__thread_id_w")) {
        internal_error << "4-dimensional loops with " << name << " are not supported\n";
    } else if (ends_with(name, ".__block_id_x")) {
        return "gl_WorkGroupID.x";
    } else if (ends_with(name, ".__block_id_y")) {
        return "gl_WorkGroupID.y";
    } else if (ends_with(name, ".__block_id_z")) {
        return "gl_WorkGroupID.z";
    } else if (ends_with(name, ".__block_id_w")) {
        internal_error << "4-dimensional loops with " << name << " are not supported\n";
    }
    internal_error << "simt_intrinsic called on bad variable name: " << name << "\n";
    return "";
}

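// Returns the workgroup dimension (0 = x, 1 = y, 2 = z, 3 = w) that a
// thread-id loop variable corresponds to, or -1 if the loop is not a
// thread-id loop.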
int thread_loop_workgroup_index(const string &name) {
    string ids[] = {".__thread_id_x",
                    ".__thread_id_y",
                    ".__thread_id_z",
                    ".__thread_id_w"};
    for (size_t i = 0; i < sizeof(ids) / sizeof(string); i++) {
        if (ends_with(name, ids[i])) {
            return i;
        }
    }
    return -1;
}
}  // namespace

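// Halide's gpu_thread_barrier() lowers to GLSL's execution barrier, plus a
// group memory barrier when a device or shared memory fence is requested,
// producing something like:
//
//     barrier();
//     groupMemoryBarrier();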
void CodeGen_OpenGLCompute_Dev::CodeGen_OpenGLCompute_C::visit(const Call *op) {
    if (op->is_intrinsic(Call::gpu_thread_barrier)) {
        internal_assert(op->args.size() == 1) << "gpu_thread_barrier() intrinsic must specify memory fence type.\n";

        auto fence_type_ptr = as_const_int(op->args[0]);
        internal_assert(fence_type_ptr) << "gpu_thread_barrier() parameter is not a constant integer.\n";
        auto fence_type = *fence_type_ptr;

        stream << get_indent() << "barrier();\n";

        // barrier() is an execution barrier; for memory behavior, we'll use the
        // least-common-denominator groupMemoryBarrier(), because other fence types
        // require extensions or GL 4.3 as a minimum.
        if (fence_type & CodeGen_GPU_Dev::MemoryFenceType::Device ||
            fence_type & CodeGen_GPU_Dev::MemoryFenceType::Shared) {
            stream << "groupMemoryBarrier();\n";
        }
        print_assignment(op->type, "0");
    } else {
        CodeGen_GLSLBase::visit(op);
    }
}

void CodeGen_OpenGLCompute_Dev::CodeGen_OpenGLCompute_C::visit(const For *loop) {
    user_assert(loop->for_type != ForType::GPULane)
        << "The OpenGLCompute backend does not support the gpu_lanes() scheduling directive.";

    if (is_gpu_var(loop->name)) {
        internal_assert((loop->for_type == ForType::GPUBlock) ||
                        (loop->for_type == ForType::GPUThread))
            << "kernel loop must be either gpu block or gpu thread\n";
        internal_assert(is_zero(loop->min));

        debug(4) << "loop extent is " << loop->extent << "\n";
        //
        //  Need to extract workgroup size.
        //
        int index = thread_loop_workgroup_index(loop->name);
        if (index >= 0) {
            const IntImm *int_limit = loop->extent.as<IntImm>();
            user_assert(int_limit != nullptr) << "For OpenGLCompute, the workgroup size must be a constant integer.\n";
            int new_workgroup_size = int_limit->value;
            user_assert(workgroup_size[index] == 0 ||
                        workgroup_size[index] == new_workgroup_size)
                << "OpenGLCompute requires all gpu kernels to have the same workgroup size, "
                << "but two different sizes were encountered: " << workgroup_size[index]
                << " and " << new_workgroup_size
                << " in dimension " << index << ".\n";
            workgroup_size[index] = new_workgroup_size;
            debug(4) << "Workgroup size for index " << index << " is " << workgroup_size[index] << "\n";
        }

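        // GPU loops emit no loop at all: each invocation handles one
        // iteration, so the loop variable is simply bound to the matching
        // built-in, producing something like
        // "int f_s0_x___thread_id_x = int(gl_LocalInvocationID.x);".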
        stream << get_indent() << print_type(Int(32)) << " " << print_name(loop->name)
               << " = int(" << simt_intrinsic(loop->name) << ");\n";

        loop->body.accept(this);

    } else {
        user_assert(loop->for_type != ForType::Parallel)
            << "Cannot use parallel loops inside OpenGLCompute kernel\n";
        CodeGen_C::visit(loop);
    }
}

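// A Ramp becomes a GLSL vector constructor with one element per lane,
// e.g. a 3-wide int ramp turns into something like
// "ivec3(base, base + stride, base + 2*stride)" (modulo the usual
// subexpression temporaries).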
void CodeGen_OpenGLCompute_Dev::CodeGen_OpenGLCompute_C::visit(const Ramp *op) {
    ostringstream rhs;
    rhs << print_type(op->type) << "(";

    if (op->lanes > 4) {
        internal_error << "GLSL: ramp lanes " << op->lanes << " is not supported\n";
    }

    rhs << print_expr(op->base);

    for (int i = 1; i < op->lanes; ++i) {
        rhs << ", " << print_expr(Add::make(op->base, Mul::make(i, op->stride)));
    }

    rhs << ")";
    print_assignment(op->type, rhs.str());
}

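// A Broadcast maps directly onto a GLSL vector constructor with a single
// scalar argument, e.g. "vec4(x)" replicates x into all four lanes.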
void CodeGen_OpenGLCompute_Dev::CodeGen_OpenGLCompute_C::visit(const Broadcast *op) {
    string id_value = print_expr(op->value);
    ostringstream oss;
    oss << print_type(op->type.with_lanes(op->lanes)) << "(" << id_value << ")";
    print_assignment(op->type.with_lanes(op->lanes), oss.str());
}

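// Loads (and the matching stores below) address external buffers through the
// ".data" member of the SSBO interface block declared in add_kernel, e.g.
// "_input.data[_0]"; locally allocated arrays are indexed directly.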
void CodeGen_OpenGLCompute_Dev::CodeGen_OpenGLCompute_C::visit(const Load *op) {
    user_assert(is_one(op->predicate)) << "GLSL: predicated load is not supported.\n";
    // TODO: support vectors
    // https://github.com/halide/Halide/issues/4975
    internal_assert(op->type.is_scalar());
    string id_index = print_expr(op->index);

    ostringstream oss;
    oss << print_name(op->name);
    if (!allocations.contains(op->name)) {
        oss << ".data";
    }
    oss << "[" << id_index << "]";
    print_assignment(op->type, oss.str());
}

void CodeGen_OpenGLCompute_Dev::CodeGen_OpenGLCompute_C::visit(const Store *op) {
    user_assert(is_one(op->predicate)) << "GLSL: predicated store is not supported.\n";
    // TODO: support vectors
    // https://github.com/halide/Halide/issues/4975
    internal_assert(op->value.type().is_scalar());
    string id_index = print_expr(op->index);

    string id_value = print_expr(op->value);

    stream << get_indent() << print_name(op->name);
    if (!allocations.contains(op->name)) {
        stream << ".data";
    }
    stream << "[" << id_index << "] = " << print_type(op->value.type()) << "(" << id_value << ");\n";

    // Need a cache clear on stores to avoid reusing stale loaded
    // values from before the store.
    cache.clear();
}

void CodeGen_OpenGLCompute_Dev::CodeGen_OpenGLCompute_C::visit(const Select *op) {
    ostringstream rhs;
    string true_val = print_expr(op->true_value);
    string false_val = print_expr(op->false_value);
    string cond = print_expr(op->condition);
    rhs << print_type(op->type)
        << "(" << cond
        << " ? " << true_val
        << " : " << false_val
        << ")";
    print_assignment(op->type, rhs.str());
}

void CodeGen_OpenGLCompute_Dev::add_kernel(Stmt s,
                                           const string &name,
                                           const vector<DeviceArgument> &args) {
    debug(2) << "CodeGen_OpenGLCompute_Dev::compile " << name << "\n";

    // TODO: do we have to uniquify these names, or can we trust that they are safe?
    cur_kernel_name = name;
    glc.add_kernel(s, name, args);
}

namespace {
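// Collects every MemoryType::GPUShared allocation in the kernel body so that
// add_kernel can declare them at global scope: GLSL only allows the "shared"
// qualifier on globals.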
class FindSharedAllocations : public IRVisitor {
    using IRVisitor::visit;

    void visit(const Allocate *op) override {
        op->body.accept(this);
        if (op->memory_type == MemoryType::GPUShared) {
            allocs.push_back(op);
        }
    }

public:
    vector<const Allocate *> allocs;
};
}  // namespace

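// Emits one complete compute shader per kernel: a #version header (plus the
// Android extension pack where required), common macros, an SSBO or uniform
// declaration per argument, global "shared" declarations, the kernel body
// wrapped in main(), and finally the workgroup-size layout qualifier.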
void CodeGen_OpenGLCompute_Dev::CodeGen_OpenGLCompute_C::add_kernel(const Stmt &s,
                                                                    const string &name,
                                                                    const vector<DeviceArgument> &args) {

    debug(2) << "Adding OpenGLCompute kernel " << name << "\n";
    cache.clear();

    if (target.os == Target::Android) {
        stream << "#version 310 es\n"
               << "#extension GL_ANDROID_extension_pack_es31a : require\n";
    } else if (target.has_feature(Target::EGL)) {
        stream << "#version 310 es\n";
    } else {
        stream << "#version 430\n";
    }
    add_common_macros(stream);
    stream << "float float_from_bits(int x) { return intBitsToFloat(int(x)); }\n";
    stream << "#define halide_unused(x) (void)(x)\n";

    for (size_t i = 0; i < args.size(); i++) {
        if (args[i].is_buffer) {
            //
            // layout(binding = 10) buffer buffer10 {
            //     vec3 data[];
            // } inBuffer;
            //
            stream << "layout(binding=" << i << ")"
                   << " buffer buffer" << i << " { "
                   << print_type(args[i].type) << " data[]; } "
                   << print_name(args[i].name) << ";\n";
        } else {
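            //
            // Scalar arguments become plain uniforms, e.g.:
            //
            // layout(location = 3) uniform float scale;
            //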
            stream << "layout(location = " << i << ") uniform " << print_type(args[i].type)
                   << " " << print_name(args[i].name) << ";\n";
        }
    }

    // Find all the shared allocations and declare them at global scope.
    FindSharedAllocations fsa;
    s.accept(&fsa);
    for (const Allocate *op : fsa.allocs) {
        internal_assert(op->extents.size() == 1 && is_const(op->extents[0]));
        stream << "shared "
               << print_type(op->type) << " "
               << print_name(op->name) << "["
               << op->extents[0] << "];\n";
    }

    // We'll figure out the workgroup size while traversing the stmt.
    workgroup_size[0] = 0;
    workgroup_size[1] = 0;
    workgroup_size[2] = 0;

    stream << "void main()\n{\n";
    indent += 2;
    print(s);
    indent -= 2;
    stream << "}\n";

    // Declare the workgroup size.
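    // The size is only known once the body has been traversed, which is why
    // the layout qualifier is emitted after main().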
    indent += 2;
    stream << "layout(local_size_x = " << workgroup_size[0];
    if (workgroup_size[1] > 1) {
        stream << ", local_size_y = " << workgroup_size[1];
    }
    if (workgroup_size[2] > 1) {
        stream << ", local_size_z = " << workgroup_size[2];
    }
    stream << ") in;\n// end of kernel " << name << "\n";
    indent -= 2;
}

void CodeGen_OpenGLCompute_Dev::init_module() {
    src_stream.str("");
    src_stream.clear();
    cur_kernel_name = "";
}

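// Shared allocations have already been hoisted to global scope by add_kernel;
// everything else becomes a fixed-size array in a local block scope, e.g.
// "{ float _f[256]; ... }".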
void CodeGen_OpenGLCompute_Dev::CodeGen_OpenGLCompute_C::visit(const Allocate *op) {
    debug(2) << "OpenGLCompute: Allocate " << op->name << " of type " << op->type << " on device\n";

    stream << get_indent();
    Allocation alloc;
    alloc.type = op->type;
    allocations.push(op->name, alloc);

    internal_assert(!op->extents.empty());
    Expr extent = 1;
    for (const Expr &e : op->extents) {
        extent *= e;
    }
    extent = simplify(extent);
    internal_assert(is_const(extent));

    if (op->memory_type != MemoryType::GPUShared) {
        stream << "{\n";
        indent += 2;
        stream << get_indent();
        // Shared allocations were already declared at global scope.
        stream << print_type(op->type) << " "
               << print_name(op->name) << "["
               << extent << "];\n";
    }
    op->body.accept(this);

    if (op->memory_type != MemoryType::GPUShared) {
        indent -= 2;
        stream << get_indent() << "}\n";
    }
}

void CodeGen_OpenGLCompute_Dev::CodeGen_OpenGLCompute_C::visit(const Free *op) {
    debug(2) << "OpenGLCompute: Free on device for " << op->name << "\n";

    allocations.pop(op->name);
}

void CodeGen_OpenGLCompute_Dev::CodeGen_OpenGLCompute_C::visit(const Evaluate *op) {
    if (is_const(op->value)) {
        return;
    }
    print_expr(op->value);
}

void CodeGen_OpenGLCompute_Dev::CodeGen_OpenGLCompute_C::visit(const IntImm *op) {
    if (op->type == Int(32)) {
        // GL seems to interpret some large int immediates as uints.
        id = "int(" + std::to_string(op->value) + ")";
    } else {
        id = print_type(op->type) + "(" + std::to_string(op->value) + ")";
    }
}

vector<char> CodeGen_OpenGLCompute_Dev::compile_to_src() {
    string str = src_stream.str();
    debug(1) << "GLSL Compute source:\n"
             << str << "\n";
    vector<char> buffer(str.begin(), str.end());
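    // NUL-terminate, presumably so the buffer can be handed to APIs that
    // expect a C string.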
    buffer.push_back(0);
    return buffer;
}

string CodeGen_OpenGLCompute_Dev::get_current_kernel_name() {
    return cur_kernel_name;
}

void CodeGen_OpenGLCompute_Dev::dump() {
    std::cerr << src_stream.str() << "\n";
}

std::string CodeGen_OpenGLCompute_Dev::print_gpu_name(const std::string &name) {
    return name;
}

}  // namespace Internal
}  // namespace Halide