1 #include "CodeGen_OpenGLCompute_Dev.h"
2 #include "Debug.h"
3 #include "Deinterleave.h"
4 #include "IRMatch.h"
5 #include "IRMutator.h"
6 #include "IROperator.h"
7 #include "Simplify.h"
8 #include "VaryingAttributes.h"
9 #include <iomanip>
10 #include <limits>
11 #include <map>
12
13 namespace Halide {
14 namespace Internal {
15
16 using std::ostringstream;
17 using std::string;
18 using std::vector;
19
// The OpenGLCompute device codegen is a thin wrapper around the GLSL-flavored
// C printer (glc), which writes generated kernel source into src_stream.
CodeGen_OpenGLCompute_Dev::CodeGen_OpenGLCompute_Dev(Target target)
    : glc(src_stream, target) {
}
23
CodeGen_OpenGLCompute_Dev::CodeGen_OpenGLCompute_C::CodeGen_OpenGLCompute_C(std::ostream &s, Target t)
    : CodeGen_GLSLBase(s, t) {
    // Halide's trunc_f32 maps directly onto GLSL's built-in trunc().
    builtin["trunc_f32"] = "trunc";
}
28
print_type(Type type,AppendSpaceIfNeeded space)29 string CodeGen_OpenGLCompute_Dev::CodeGen_OpenGLCompute_C::print_type(Type type, AppendSpaceIfNeeded space) {
30 Type mapped_type = map_type(type);
31 if (mapped_type.is_uint() && !mapped_type.is_bool()) {
32 string s = mapped_type.is_scalar() ? "uint" : "uvec" + std::to_string(mapped_type.lanes());
33 if (space == AppendSpace) {
34 s += " ";
35 }
36 return s;
37 } else {
38 return CodeGen_GLSLBase::print_type(type, space);
39 }
40 }
41
42 namespace {
simt_intrinsic(const string & name)43 string simt_intrinsic(const string &name) {
44 if (ends_with(name, ".__thread_id_x")) {
45 return "gl_LocalInvocationID.x";
46 } else if (ends_with(name, ".__thread_id_y")) {
47 return "gl_LocalInvocationID.y";
48 } else if (ends_with(name, ".__thread_id_z")) {
49 return "gl_LocalInvocationID.z";
50 } else if (ends_with(name, ".__thread_id_w")) {
51 internal_error << "4-dimension loops with " << name << " are not supported\n";
52 } else if (ends_with(name, ".__block_id_x")) {
53 return "gl_WorkGroupID.x";
54 } else if (ends_with(name, ".__block_id_y")) {
55 return "gl_WorkGroupID.y";
56 } else if (ends_with(name, ".__block_id_z")) {
57 return "gl_WorkGroupID.z";
58 } else if (ends_with(name, ".__block_id_w")) {
59 internal_error << "4-dimension loops with " << name << " are not supported\n";
60 }
61 internal_error << "simt_intrinsic called on bad variable name: " << name << "\n";
62 return "";
63 }
64
thread_loop_workgroup_index(const string & name)65 int thread_loop_workgroup_index(const string &name) {
66 string ids[] = {".__thread_id_x",
67 ".__thread_id_y",
68 ".__thread_id_z",
69 ".__thread_id_w"};
70 for (size_t i = 0; i < sizeof(ids) / sizeof(string); i++) {
71 if (ends_with(name, ids[i])) {
72 return i;
73 }
74 }
75 return -1;
76 }
77 } // namespace
78
visit(const Call * op)79 void CodeGen_OpenGLCompute_Dev::CodeGen_OpenGLCompute_C::visit(const Call *op) {
80 if (op->is_intrinsic(Call::gpu_thread_barrier)) {
81 internal_assert(op->args.size() == 1) << "gpu_thread_barrier() intrinsic must specify memory fence type.\n";
82
83 auto fence_type_ptr = as_const_int(op->args[0]);
84 internal_assert(fence_type_ptr) << "gpu_thread_barrier() parameter is not a constant integer.\n";
85 auto fence_type = *fence_type_ptr;
86
87 stream << get_indent() << "barrier();\n";
88
89 // barrier() is an execution barrier; for memory behavior, we'll use the
90 // least-common-denominator groupMemoryBarrier(), because other fence types
91 // require extensions or GL 4.3 as a minumum.
92 if (fence_type & CodeGen_GPU_Dev::MemoryFenceType::Device ||
93 fence_type & CodeGen_GPU_Dev::MemoryFenceType::Shared) {
94 stream << "groupMemoryBarrier();\n";
95 }
96 print_assignment(op->type, "0");
97 } else {
98 CodeGen_GLSLBase::visit(op);
99 }
100 }
101
visit(const For * loop)102 void CodeGen_OpenGLCompute_Dev::CodeGen_OpenGLCompute_C::visit(const For *loop) {
103 user_assert(loop->for_type != ForType::GPULane)
104 << "The OpenGLCompute backend does not support the gpu_lanes() scheduling directive.";
105
106 if (is_gpu_var(loop->name)) {
107 internal_assert((loop->for_type == ForType::GPUBlock) ||
108 (loop->for_type == ForType::GPUThread))
109 << "kernel loop must be either gpu block or gpu thread\n";
110 internal_assert(is_zero(loop->min));
111
112 debug(4) << "loop extent is " << loop->extent << "\n";
113 //
114 // Need to extract workgroup size.
115 //
116 int index = thread_loop_workgroup_index(loop->name);
117 if (index >= 0) {
118 const IntImm *int_limit = loop->extent.as<IntImm>();
119 user_assert(int_limit != nullptr) << "For OpenGLCompute workgroup size must be a constant integer.\n";
120 int new_workgroup_size = int_limit->value;
121 user_assert(workgroup_size[index] == 0 ||
122 workgroup_size[index] == new_workgroup_size)
123 << "OpenGLCompute requires all gpu kernels have same workgroup size, "
124 << "but two different ones were encountered " << workgroup_size[index]
125 << " and " << new_workgroup_size
126 << " in dimension " << index << ".\n";
127 workgroup_size[index] = new_workgroup_size;
128 debug(4) << "Workgroup size for index " << index << " is " << workgroup_size[index] << "\n";
129 }
130
131 stream << get_indent() << print_type(Int(32)) << " " << print_name(loop->name)
132 << " = int(" << simt_intrinsic(loop->name) << ");\n";
133
134 loop->body.accept(this);
135
136 } else {
137 user_assert(loop->for_type != ForType::Parallel)
138 << "Cannot use parallel loops inside OpenGLCompute kernel\n";
139 CodeGen_C::visit(loop);
140 }
141 }
142
visit(const Ramp * op)143 void CodeGen_OpenGLCompute_Dev::CodeGen_OpenGLCompute_C::visit(const Ramp *op) {
144 ostringstream rhs;
145 rhs << print_type(op->type) << "(";
146
147 if (op->lanes > 4) {
148 internal_error << "GLSL: ramp lanes " << op->lanes << " is not supported\n";
149 }
150
151 rhs << print_expr(op->base);
152
153 for (int i = 1; i < op->lanes; ++i) {
154 rhs << ", " << print_expr(Add::make(op->base, Mul::make(i, op->stride)));
155 }
156
157 rhs << ")";
158 print_assignment(op->base.type(), rhs.str());
159 }
160
visit(const Broadcast * op)161 void CodeGen_OpenGLCompute_Dev::CodeGen_OpenGLCompute_C::visit(const Broadcast *op) {
162 string id_value = print_expr(op->value);
163 ostringstream oss;
164 oss << print_type(op->type.with_lanes(op->lanes)) << "(" << id_value << ")";
165 print_assignment(op->type.with_lanes(op->lanes), oss.str());
166 }
167
visit(const Load * op)168 void CodeGen_OpenGLCompute_Dev::CodeGen_OpenGLCompute_C::visit(const Load *op) {
169 user_assert(is_one(op->predicate)) << "GLSL: predicated load is not supported.\n";
170 // TODO: support vectors
171 // https://github.com/halide/Halide/issues/4975
172 internal_assert(op->type.is_scalar());
173 string id_index = print_expr(op->index);
174
175 ostringstream oss;
176 oss << print_name(op->name);
177 if (!allocations.contains(op->name)) {
178 oss << ".data";
179 }
180 oss << "[" << id_index << "]";
181 print_assignment(op->type, oss.str());
182 }
183
visit(const Store * op)184 void CodeGen_OpenGLCompute_Dev::CodeGen_OpenGLCompute_C::visit(const Store *op) {
185 user_assert(is_one(op->predicate)) << "GLSL: predicated store is not supported.\n";
186 // TODO: support vectors
187 // https://github.com/halide/Halide/issues/4975
188 internal_assert(op->value.type().is_scalar());
189 string id_index = print_expr(op->index);
190
191 string id_value = print_expr(op->value);
192
193 stream << get_indent() << print_name(op->name);
194 if (!allocations.contains(op->name)) {
195 stream << ".data";
196 }
197 stream << "[" << id_index << "] = " << print_type(op->value.type()) << "(" << id_value << ");\n";
198
199 // Need a cache clear on stores to avoid reusing stale loaded
200 // values from before the store.
201 cache.clear();
202 }
203
visit(const Select * op)204 void CodeGen_OpenGLCompute_Dev::CodeGen_OpenGLCompute_C::visit(const Select *op) {
205 ostringstream rhs;
206 string true_val = print_expr(op->true_value);
207 string false_val = print_expr(op->false_value);
208 string cond = print_expr(op->condition);
209 rhs << print_type(op->type)
210 << "(" << cond
211 << " ? " << true_val
212 << " : " << false_val
213 << ")";
214 print_assignment(op->type, rhs.str());
215 }
216
// Forwards a kernel to the GLSL printer, remembering its name so the runtime
// glue can look it up via get_current_kernel_name().
void CodeGen_OpenGLCompute_Dev::add_kernel(Stmt s,
                                           const string &name,
                                           const vector<DeviceArgument> &args) {
    debug(2) << "CodeGen_OpenGLCompute_Dev::compile " << name << "\n";

    // TODO: do we have to uniquify these names, or can we trust that they are safe?
    cur_kernel_name = name;
    glc.add_kernel(s, name, args);
}
226
227 namespace {
228 class FindSharedAllocations : public IRVisitor {
229 using IRVisitor::visit;
230
visit(const Allocate * op)231 void visit(const Allocate *op) override {
232 op->body.accept(this);
233 if (op->memory_type == MemoryType::GPUShared) {
234 allocs.push_back(op);
235 }
236 }
237
238 public:
239 vector<const Allocate *> allocs;
240 };
241 } // namespace
242
// Emits one complete GLSL compute shader for the kernel `s`:
// version/extension preamble, helper macros, argument declarations (SSBOs and
// uniforms), global "shared" declarations, main() containing the lowered body,
// and finally the workgroup-size layout qualifier.
void CodeGen_OpenGLCompute_Dev::CodeGen_OpenGLCompute_C::add_kernel(const Stmt &s,
                                                                    const string &name,
                                                                    const vector<DeviceArgument> &args) {

    debug(2) << "Adding OpenGLCompute kernel " << name << "\n";
    // Expression-id cache must not leak temporaries across kernels.
    cache.clear();

    // Version preamble: GLSL ES 3.10 for Android (plus the extension pack) and
    // EGL targets; desktop GLSL 4.30 otherwise.
    if (target.os == Target::Android) {
        stream << "#version 310 es\n"
               << "#extension GL_ANDROID_extension_pack_es31a : require\n";
    } else if (target.has_feature(Target::EGL)) {
        stream << "#version 310 es\n";
    } else {
        stream << "#version 430\n";
    }
    add_common_macros(stream);
    // GLSL has no bit-level float literals; reconstruct them via intBitsToFloat.
    stream << "float float_from_bits(int x) { return intBitsToFloat(int(x)); }\n";
    stream << "#define halide_unused(x) (void)(x)\n";

    // Declare kernel arguments: buffer i becomes an SSBO block at binding=i,
    // scalar i becomes a uniform at location=i.
    for (size_t i = 0; i < args.size(); i++) {
        if (args[i].is_buffer) {
            //
            // layout(binding = 10) buffer buffer10 {
            //     vec3 data[];
            // } inBuffer;
            //
            stream << "layout(binding=" << i << ")"
                   << " buffer buffer" << i << " { "
                   << print_type(args[i].type) << " data[]; } "
                   << print_name(args[i].name) << ";\n";
        } else {
            stream << "layout(location = " << i << ") uniform " << print_type(args[i].type)
                   << " " << print_name(args[i].name) << ";\n";
        }
    }

    // Find all the shared allocations and declare them at global scope.
    // Their extents must be single, constant values (enforced below).
    FindSharedAllocations fsa;
    s.accept(&fsa);
    for (const Allocate *op : fsa.allocs) {
        internal_assert(op->extents.size() == 1 && is_const(op->extents[0]));
        stream << "shared "
               << print_type(op->type) << " "
               << print_name(op->name) << "["
               << op->extents[0] << "];\n";
    }

    // We'll figure out the workgroup size while traversing the stmt
    // (visit(For) records it into workgroup_size[] from the thread loops).
    workgroup_size[0] = 0;
    workgroup_size[1] = 0;
    workgroup_size[2] = 0;

    stream << "void main()\n{\n";
    indent += 2;
    print(s);
    indent -= 2;
    stream << "}\n";

    // Declare the workgroup size.
    // NOTE(review): this layout qualifier is emitted *after* main() because
    // the size is only known once the body has been traversed; presumably the
    // OpenGLCompute runtime relocates or otherwise accounts for it when
    // compiling the shader -- confirm against the runtime before changing.
    indent += 2;
    stream << "layout(local_size_x = " << workgroup_size[0];
    if (workgroup_size[1] > 1) {
        stream << ", local_size_y = " << workgroup_size[1];
    }
    if (workgroup_size[2] > 1) {
        stream << ", local_size_z = " << workgroup_size[2];
    }
    stream << ") in;\n// end of kernel " << name << "\n";
    indent -= 2;
}
313
init_module()314 void CodeGen_OpenGLCompute_Dev::init_module() {
315 src_stream.str("");
316 src_stream.clear();
317 cur_kernel_name = "";
318 }
319
visit(const Allocate * op)320 void CodeGen_OpenGLCompute_Dev::CodeGen_OpenGLCompute_C::visit(const Allocate *op) {
321 debug(2) << "OpenGLCompute: Allocate " << op->name << " of type " << op->type << " on device\n";
322
323 stream << get_indent();
324 Allocation alloc;
325 alloc.type = op->type;
326 allocations.push(op->name, alloc);
327
328 internal_assert(!op->extents.empty());
329 Expr extent = 1;
330 for (Expr e : op->extents) {
331 extent *= e;
332 }
333 extent = simplify(extent);
334 internal_assert(is_const(extent));
335
336 if (op->memory_type != MemoryType::GPUShared) {
337 stream << "{\n";
338 indent += 2;
339 stream << get_indent();
340 // Shared allocations were already declared at global scope.
341 stream << print_type(op->type) << " "
342 << print_name(op->name) << "["
343 << op->extents[0] << "];\n";
344 }
345 op->body.accept(this);
346
347 if (op->memory_type != MemoryType::GPUShared) {
348 indent -= 2;
349 stream << get_indent() << "}\n";
350 }
351 }
352
void CodeGen_OpenGLCompute_Dev::CodeGen_OpenGLCompute_C::visit(const Free *op) {
    debug(2) << "OpenGLCompute: Free on device for " << op->name << "\n";

    // No code is emitted: the array's storage is scoped in the generated GLSL,
    // so freeing is just removing the name from the allocations table.
    allocations.pop(op->name);
}
358
visit(const Evaluate * op)359 void CodeGen_OpenGLCompute_Dev::CodeGen_OpenGLCompute_C::visit(const Evaluate *op) {
360 if (is_const(op->value)) return;
361 print_expr(op->value);
362 }
363
visit(const IntImm * op)364 void CodeGen_OpenGLCompute_Dev::CodeGen_OpenGLCompute_C::visit(const IntImm *op) {
365 if (op->type == Int(32)) {
366 // GL seems to interpret some large int immediates as uints.
367 id = "int(" + std::to_string(op->value) + ")";
368 } else {
369 id = print_type(op->type) + "(" + std::to_string(op->value) + ")";
370 }
371 }
372
compile_to_src()373 vector<char> CodeGen_OpenGLCompute_Dev::compile_to_src() {
374 string str = src_stream.str();
375 debug(1) << "GLSL Compute source:\n"
376 << str << "\n";
377 vector<char> buffer(str.begin(), str.end());
378 buffer.push_back(0);
379 return buffer;
380 }
381
// Returns the name of the most recently added kernel (set by add_kernel).
string CodeGen_OpenGLCompute_Dev::get_current_kernel_name() {
    return cur_kernel_name;
}
385
// Debugging aid: prints all GLSL generated so far to stderr.
void CodeGen_OpenGLCompute_Dev::dump() {
    std::cerr << src_stream.str() << "\n";
}
389
// Kernel names need no mangling for this backend; pass them through unchanged.
std::string CodeGen_OpenGLCompute_Dev::print_gpu_name(const std::string &name) {
    return name;
}
393
394 } // namespace Internal
395 } // namespace Halide
396