1 #include "CodeGen_OpenGL_Dev.h"
2 #include "CSE.h"
3 #include "Debug.h"
4 #include "Deinterleave.h"
5 #include "IRMatch.h"
6 #include "IRMutator.h"
7 #include "IROperator.h"
8 #include "Simplify.h"
9 #include "VaryingAttributes.h"
10 #include <iomanip>
11 #include <limits>
12 #include <map>
13
14 namespace Halide {
15 namespace Internal {
16
17 using std::ostringstream;
18 using std::string;
19 using std::vector;
20
21 namespace {
22
is_opengl_es(const Target & target)23 bool is_opengl_es(const Target &target) {
24 // TODO: we need a better way to switch between the different OpenGL
25 // versions (desktop GL, GLES2, GLES3, ...), probably by making it part of
26 // Target.
27 return (target.os == Target::Android ||
28 target.os == Target::IOS);
29 }
30
get_lane_suffix(int i)31 char get_lane_suffix(int i) {
32 internal_assert(i >= 0 && i < 4);
33 return "rgba"[i];
34 }
35
36 } // namespace
37
CodeGen_OpenGL_Dev(const Target & target)38 CodeGen_OpenGL_Dev::CodeGen_OpenGL_Dev(const Target &target)
39 : target(target) {
40 debug(1) << "Creating GLSL codegen\n";
41 glc = new CodeGen_GLSL(src_stream, target);
42 }
43
~CodeGen_OpenGL_Dev()44 CodeGen_OpenGL_Dev::~CodeGen_OpenGL_Dev() {
45 delete glc;
46 }
47
add_kernel(Stmt s,const string & name,const vector<DeviceArgument> & args)48 void CodeGen_OpenGL_Dev::add_kernel(Stmt s, const string &name,
49 const vector<DeviceArgument> &args) {
50 cur_kernel_name = name;
51 glc->add_kernel(s, name, args);
52 }
53
init_module()54 void CodeGen_OpenGL_Dev::init_module() {
55 src_stream.str("");
56 src_stream.clear();
57 glc->add_common_macros(src_stream);
58 cur_kernel_name = "";
59 }
60
compile_to_src()61 vector<char> CodeGen_OpenGL_Dev::compile_to_src() {
62 string str = src_stream.str();
63 debug(1) << "GLSL source:\n"
64 << str << "\n";
65 vector<char> buffer(str.begin(), str.end());
66 buffer.push_back(0);
67 return buffer;
68 }
69
get_current_kernel_name()70 string CodeGen_OpenGL_Dev::get_current_kernel_name() {
71 return cur_kernel_name;
72 }
73
dump()74 void CodeGen_OpenGL_Dev::dump() {
75 std::cerr << src_stream.str() << "\n";
76 }
77
// Returns `name` mangled the same way the GLSL generator mangles
// identifiers.
string CodeGen_OpenGL_Dev::print_gpu_name(const string &name) {
    string mangled = glc->print_name(name);
    return mangled;
}
81
82 //
83 // CodeGen_GLSLBase
84 //
CodeGen_GLSLBase(std::ostream & s,Target target)85 CodeGen_GLSLBase::CodeGen_GLSLBase(std::ostream &s, Target target)
86 : CodeGen_C(s, target) {
87 builtin["sin_f32"] = "sin";
88 builtin["sqrt_f32"] = "sqrt";
89 builtin["cos_f32"] = "cos";
90 builtin["exp_f32"] = "exp";
91 builtin["log_f32"] = "log";
92 builtin["abs_f32"] = "abs";
93 builtin["floor_f32"] = "floor";
94 builtin["ceil_f32"] = "ceil";
95 builtin["asin_f32"] = "asin";
96 builtin["acos_f32"] = "acos";
97 builtin["tan_f32"] = "tan";
98 builtin["atan_f32"] = "atan";
99 builtin["atan2_f32"] = "atan"; // also called atan in GLSL
100 builtin["sinh_f32"] = "sinh";
101 builtin["cosh_f32"] = "cosh";
102 builtin["tanh_f32"] = "tanh";
103 builtin["asinh_f32"] = "asinh";
104 builtin["acosh_f32"] = "acosh";
105 builtin["atanh_f32"] = "atanh";
106 builtin["min"] = "min";
107 builtin["max"] = "max";
108 builtin["mix"] = "mix";
109 builtin["mod"] = "mod";
110 builtin["abs"] = "abs";
111 builtin["isnan"] = "isnan";
112 builtin["round_f32"] = "roundEven";
113 builtin["fast_inverse_sqrt_f32"] = "inversesqrt";
114
115 // functions that produce bvecs
116 builtin["equal"] = "equal";
117 builtin["notEqual"] = "notEqual";
118 builtin["lessThan"] = "lessThan";
119 builtin["lessThanEqual"] = "lessThanEqual";
120 builtin["greaterThan"] = "greaterThan";
121 builtin["greaterThanEqual"] = "greaterThanEqual";
122 }
123
124 // Maps Halide types to appropriate GLSL types or emit error if no equivalent
125 // type is available.
map_type(const Type & type)126 Type CodeGen_GLSLBase::map_type(const Type &type) {
127 Type result = type;
128 if (type.is_scalar()) {
129 if (type.is_float()) {
130 user_assert(type.bits() <= 32)
131 << "GLSL: Can't represent a float with " << type.bits() << " bits.\n";
132 result = Float(32);
133 } else if (type.is_bool()) {
134 // unchanged
135 } else if (type.is_int() && type.bits() <= 32) {
136 result = Int(32);
137 } else if (type.is_uint() && type.bits() <= 32) {
138 result = UInt(32);
139 } else {
140 user_error << "GLSL: Can't represent type '" << type << "'.\n";
141 }
142 } else {
143 user_assert(type.lanes() <= 4)
144 << "GLSL: vector types wider than 4 aren't supported\n";
145 user_assert(type.is_bool() || type.is_int() || type.is_uint() || type.is_float())
146 << "GLSL: Can't represent vector type '" << type << "'.\n";
147 Type scalar_type = type.element_of();
148 result = map_type(scalar_type).with_lanes(type.lanes());
149 }
150 return result;
151 }
152
visit(const FloatImm * op)153 void CodeGen_GLSLBase::visit(const FloatImm *op) {
154 ostringstream oss;
155 // Print integral numbers with trailing ".0". For fractional numbers use a
156 // precision of 9 digits, which should be enough to recover the binary
157 // float unambiguously from the decimal representation (if iostreams
158 // implements correct rounding).
159 const float truncated = (op->value < 0 ? std::ceil(op->value) : std::floor(op->value));
160 if (truncated == op->value) {
161 oss << std::fixed << std::setprecision(1) << op->value;
162 } else {
163 oss << std::setprecision(9) << op->value;
164 }
165 id = oss.str();
166 }
167
visit(const IntImm * op)168 void CodeGen_GLSLBase::visit(const IntImm *op) {
169 id = print_type(op->type) + "(" + std::to_string(op->value) + ")";
170 }
171
visit(const UIntImm * op)172 void CodeGen_GLSLBase::visit(const UIntImm *op) {
173 if (op->type == Bool()) {
174 if (op->value == 1) {
175 id = "true";
176 } else {
177 id = "false";
178 }
179 } else {
180 id = std::to_string(op->value) + "u";
181 }
182 }
183
visit(const Max * op)184 void CodeGen_GLSLBase::visit(const Max *op) {
185 print_expr(Call::make(op->type, "max", {op->a, op->b}, Call::PureExtern));
186 }
187
visit(const Min * op)188 void CodeGen_GLSLBase::visit(const Min *op) {
189 print_expr(Call::make(op->type, "min", {op->a, op->b}, Call::PureExtern));
190 }
191
visit(const Mod * op)192 void CodeGen_GLSLBase::visit(const Mod *op) {
193 if (op->type.is_int() || op->type.is_uint()) {
194 // Just exploit the Euclidean identity
195 // FIXME: Why doesn't lower_euclidean_mod work for glsl?
196 // https://github.com/halide/Halide/issues/4979
197 Expr zero = make_zero(op->type);
198 Expr equiv = select(op->a == zero, zero,
199 op->a - (op->a / op->b) * op->b);
200 equiv = common_subexpression_elimination(equiv);
201 print_expr(equiv);
202 } else {
203 print_expr(Call::make(op->type, "mod", {op->a, op->b}, Call::Extern));
204 }
205 }
206
visit(const Call * op)207 void CodeGen_GLSLBase::visit(const Call *op) {
208 if (op->is_intrinsic(Call::lerp)) {
209 // Implement lerp using GLSL's mix() function, which always uses
210 // floating point arithmetic.
211 Expr zero_val = op->args[0];
212 Expr one_val = op->args[1];
213 Expr weight = op->args[2];
214
215 internal_assert(weight.type().is_uint() || weight.type().is_float());
216 if (weight.type().is_uint()) {
217 // Normalize integer weights to [0.0f, 1.0f] range.
218 internal_assert(weight.type().bits() < 32);
219 weight = Div::make(Cast::make(Float(32), weight),
220 Cast::make(Float(32), weight.type().max()));
221 } else if (op->type.is_uint()) {
222 // Round float weights down to next multiple of (1/op->type.imax())
223 // to give same results as lerp based on integer arithmetic.
224 internal_assert(op->type.bits() < 32);
225 weight = floor(weight * op->type.max()) / op->type.max();
226 }
227
228 Type result_type = Float(32, op->type.lanes());
229 Expr e = Call::make(result_type, "mix", {zero_val, one_val, weight}, Call::Extern);
230
231 if (!op->type.is_float()) {
232 // Mirror rounding implementation of Halide's integer lerp.
233 e = Cast::make(op->type, floor(e + 0.5f));
234 }
235 print_expr(e);
236 return;
237 } else if (op->is_intrinsic(Call::absd)) {
238 internal_assert(op->args.size() == 2);
239 Expr a = op->args[0];
240 Expr b = op->args[1];
241 Expr e = cast(op->type, select(a < b, b - a, a - b));
242 print_expr(e);
243 return;
244 } else if (op->is_intrinsic(Call::return_second)) {
245 internal_assert(op->args.size() == 2);
246 // Simply discard the first argument, which is generally a call to
247 // 'halide_printf'.
248 print_expr(op->args[1]);
249 return;
250 } else if (op->name == "fast_inverse_f32") {
251 print_expr(make_one(op->type) / op->args[0]);
252 return;
253 } else if (op->name == "fast_inverse_sqrt_f32") {
254 print_expr(make_one(op->type) / sqrt(op->args[0]));
255 return;
256 } else if (op->name == "pow_f32") {
257 if (can_prove(op->args[0] > 0)) {
258 ostringstream rhs;
259 rhs << "pow(" << print_expr(op->args[0]) << ", " << print_expr(op->args[1]) << ")";
260 print_assignment(op->type, rhs.str());
261 return;
262 } else {
263 ostringstream base;
264 string a = print_expr(op->args[0]);
265 string b = print_expr(op->args[1]);
266 base << "pow(abs(" << a << "), " << b << ")";
267 string c = print_assignment(op->type, base.str());
268 Expr a_var = Variable::make(op->type, a);
269 Expr b_var = Variable::make(op->type, b);
270 Expr c_var = Variable::make(op->type, c);
271 // OpenGL isn't required to produce NaNs, so we return
272 // zero in the undefined case.
273 Expr equiv = select(a_var > 0 || b_var % 2 == 0, c_var,
274 b_var % 2 == 1, -c_var,
275 0.0f);
276 print_expr(equiv);
277 return;
278 }
279 } else if (op->is_intrinsic(Call::shift_right)) {
280 print_assignment(op->type, print_expr(op->args[0]) + " >> " + print_expr(op->args[1]));
281 } else if (op->is_intrinsic(Call::shift_left)) {
282 print_assignment(op->type, print_expr(op->args[0]) + " << " + print_expr(op->args[1]));
283 } else if (op->is_intrinsic(Call::bitwise_not)) {
284 print_assignment(op->type, "~" + print_expr(op->args[0]));
285 } else if (op->is_intrinsic(Call::bitwise_and)) {
286 print_assignment(op->type, print_expr(op->args[0]) + " & " + print_expr(op->args[1]));
287 } else if (op->is_intrinsic(Call::bitwise_or)) {
288 print_assignment(op->type, print_expr(op->args[0]) + " | " + print_expr(op->args[1]));
289 } else if (op->is_intrinsic(Call::bitwise_xor)) {
290 print_assignment(op->type, print_expr(op->args[0]) + " ^ " + print_expr(op->args[1]));
291 } else if (op->is_intrinsic(Call::div_round_to_zero)) {
292 print_assignment(op->type, print_expr(op->args[0]) + " / " + print_expr(op->args[1]));
293 } else if (op->is_intrinsic(Call::mod_round_to_zero)) {
294 print_assignment(op->type, print_expr(op->args[0]) + " % " + print_expr(op->args[1]));
295 } else {
296 ostringstream rhs;
297 if (builtin.count(op->name) == 0) {
298 user_error << "GLSL: unknown function '" << op->name << "' encountered.\n";
299 }
300
301 rhs << builtin[op->name] << "(";
302 for (size_t i = 0; i < op->args.size(); i++) {
303 if (i > 0) rhs << ", ";
304 rhs << print_expr(op->args[i]);
305 }
306 rhs << ")";
307 print_assignment(op->type, rhs.str());
308 }
309 }
310
print_type(Type type,AppendSpaceIfNeeded space_option)311 string CodeGen_GLSLBase::print_type(Type type, AppendSpaceIfNeeded space_option) {
312 ostringstream oss;
313 type = map_type(type);
314 if (type.is_scalar()) {
315 if (type.is_float()) {
316 oss << "float";
317 } else if (type.is_bool()) {
318 oss << "bool";
319 } else if (type.is_int()) {
320 oss << "int";
321 } else if (type.is_uint()) {
322 oss << "uint";
323 } else {
324 internal_error << "GLSL: invalid type '" << type << "' encountered.\n";
325 }
326 } else {
327 if (type.is_float()) {
328 // no prefix for float vectors
329 } else if (type.is_bool()) {
330 oss << "b";
331 } else if (type.is_int()) {
332 oss << "i";
333 } else if (type.is_uint()) {
334 oss << "u";
335 } else {
336 internal_error << "GLSL: invalid type '" << type << "' encountered.\n";
337 }
338 oss << "vec" << type.lanes();
339 }
340
341 if (space_option == AppendSpace) {
342 oss << " ";
343 }
344
345 return oss.str();
346 }
347
348 // The following comparisons are defined for ivec and vec
349 // types, so we don't use call_builtin
visit(const EQ * op)350 void CodeGen_GLSLBase::visit(const EQ *op) {
351 if (op->type.is_vector()) {
352 print_expr(Call::make(op->type, "equal", {op->a, op->b}, Call::Extern));
353 } else {
354 CodeGen_C::visit(op);
355 }
356 }
357
visit(const NE * op)358 void CodeGen_GLSLBase::visit(const NE *op) {
359 if (op->type.is_vector()) {
360 print_expr(Call::make(op->type, "notEqual", {op->a, op->b}, Call::Extern));
361 } else {
362 CodeGen_C::visit(op);
363 }
364 }
365
visit(const LT * op)366 void CodeGen_GLSLBase::visit(const LT *op) {
367 if (op->type.is_vector()) {
368 print_expr(Call::make(op->type, "lessThan", {op->a, op->b}, Call::Extern));
369 } else {
370 CodeGen_C::visit(op);
371 }
372 }
373
visit(const LE * op)374 void CodeGen_GLSLBase::visit(const LE *op) {
375 if (op->type.is_vector()) {
376 print_expr(Call::make(op->type, "lessThanEqual", {op->a, op->b}, Call::Extern));
377 } else {
378 CodeGen_C::visit(op);
379 }
380 }
381
visit(const GT * op)382 void CodeGen_GLSLBase::visit(const GT *op) {
383 if (op->type.is_vector()) {
384 print_expr(Call::make(op->type, "greaterThan", {op->a, op->b}, Call::Extern));
385 } else {
386 CodeGen_C::visit(op);
387 }
388 }
389
visit(const GE * op)390 void CodeGen_GLSLBase::visit(const GE *op) {
391 if (op->type.is_vector()) {
392 print_expr(Call::make(op->type, "greaterThanEqual", {op->a, op->b}, Call::Extern));
393 } else {
394 CodeGen_C::visit(op);
395 }
396 }
397
visit(const Shuffle * op)398 void CodeGen_GLSLBase::visit(const Shuffle *op) {
399 // The halide Shuffle represents the llvm intrinisc
400 // shufflevector, however, for GLSL its use is limited to swizzling
401 // up to a four channel vec type.
402
403 internal_assert(op->vectors.size() == 1);
404
405 int shuffle_lanes = op->type.lanes();
406 internal_assert(shuffle_lanes <= 4);
407
408 string expr = print_expr(op->vectors[0]);
409
410 // Create a swizzle expression for the shuffle
411 string swizzle;
412 for (int i = 0; i != shuffle_lanes; ++i) {
413 int channel = op->indices[i];
414 internal_assert(channel < 4) << "Shuffle of invalid channel";
415 swizzle += get_lane_suffix(channel);
416 }
417
418 print_assignment(op->type, expr + "." + swizzle);
419 }
420
421 // Identifiers containing double underscores '__' are reserved in GLSL, so we
422 // have to use a different name mangling scheme than in the C code generator.
print_name(const string & name)423 string CodeGen_GLSLBase::print_name(const string &name) {
424 const string mangled = CodeGen_C::print_name(name);
425 return replace_all(mangled, "__", "XX");
426 }
427
visit(const Cast * op)428 void CodeGen_GLSLBase::visit(const Cast *op) {
429 Type value_type = op->value.type();
430 // If both types are represented by the same GLSL type, no explicit cast
431 // is necessary.
432 if (map_type(op->type) == map_type(value_type)) {
433 Expr value = op->value;
434 if (value_type.code() == Type::Float) {
435 // float->int conversions may need explicit truncation if an
436 // integer type is embedded into a float. (Note: overflows are
437 // considered undefined behavior, so we do nothing about values
438 // that are out of range of the target type.)
439 if (op->type.code() == Type::UInt) {
440 value = simplify(floor(value));
441 } else if (op->type.code() == Type::Int) {
442 value = simplify(trunc(value));
443 }
444 }
445 // FIXME: Overflow is not UB for most Halide types
446 // https://github.com/halide/Halide/issues/4975
447 value.accept(this);
448 } else {
449 Type target_type = map_type(op->type);
450 print_assignment(target_type, print_type(target_type) + "(" + print_expr(op->value) + ")");
451 }
452 }
453
454 //
455 // CodeGen_GLSL
456 //
457
CodeGen_GLSL(std::ostream & s,const Target & t)458 CodeGen_GLSL::CodeGen_GLSL(std::ostream &s, const Target &t)
459 : CodeGen_GLSLBase(s, t) {
460 builtin["trunc_f32"] = "_trunc_f32";
461 }
462
visit(const Let * op)463 void CodeGen_GLSL::visit(const Let *op) {
464
465 if (op->name.find(".varying") != string::npos) {
466
467 // Skip let statements for varying attributes
468 op->body.accept(this);
469
470 return;
471 }
472
473 CodeGen_C::visit(op);
474 }
475
visit(const For * loop)476 void CodeGen_GLSL::visit(const For *loop) {
477 user_assert(loop->for_type != ForType::GPULane)
478 << "The GLSL backend does not support the gpu_lanes() scheduling directive.";
479
480 if (ends_with(loop->name, ".__block_id_x") ||
481 ends_with(loop->name, ".__block_id_y")) {
482 internal_assert(loop->for_type == ForType::GPUBlock)
483 << "kernel loop must be gpu block\n";
484
485 debug(1) << "Dropping loop " << loop->name << " (" << loop->min << ", " << loop->extent << ")\n";
486
487 string idx;
488 if (ends_with(loop->name, ".__block_id_x")) {
489 idx = "int(_varyingf0[0])";
490 } else if (ends_with(loop->name, ".__block_id_y")) {
491 idx = "int(_varyingf0[1])";
492 }
493 stream << get_indent() << print_type(Int(32)) << " " << print_name(loop->name) << " = " << idx << ";\n";
494 loop->body.accept(this);
495 } else {
496 user_assert(loop->for_type != ForType::Parallel) << "GLSL: parallel loops aren't allowed inside kernel.\n";
497 CodeGen_C::visit(loop);
498 }
499 }
500
evaluate_vector_select(const Select * op)501 vector<Expr> evaluate_vector_select(const Select *op) {
502 const int lanes = op->type.lanes();
503 vector<Expr> result(lanes);
504 for (int i = 0; i < lanes; i++) {
505 Expr cond = extract_lane(op->condition, i);
506 Expr true_value = extract_lane(op->true_value, i);
507 Expr false_value = extract_lane(op->false_value, i);
508
509 if (is_const(cond)) {
510 result[i] = is_one(cond) ? true_value : false_value;
511 } else {
512 result[i] = Select::make(cond, true_value, false_value);
513 }
514 }
515 return result;
516 }
517
visit(const Select * op)518 void CodeGen_GLSL::visit(const Select *op) {
519 string id_value;
520 if (op->condition.type().is_scalar()) {
521 id_value = unique_name('_');
522 stream << get_indent() << print_type(op->type) << " " << id_value << ";\n";
523 string cond = print_expr(op->condition);
524 stream << get_indent() << "if (" << cond << ") ";
525 open_scope();
526 {
527 string true_val = print_expr(op->true_value);
528 stream << get_indent() << id_value << " = " << true_val << ";\n";
529 }
530 close_scope("");
531
532 stream << get_indent() << "else ";
533 open_scope();
534 {
535 string false_val = print_expr(op->false_value);
536 stream << get_indent() << id_value << " = " << false_val << ";\n";
537 }
538 close_scope("");
539 } else {
540 // Selects with vector conditions are typically used for constructing
541 // vector types. If the select condition can be evaluated at
542 // compile-time (which is often the case), we can built the vector
543 // directly without lowering to a sequence of "if" statements.
544 internal_assert(op->condition.type().lanes() == op->type.lanes());
545 int lanes = op->type.lanes();
546 vector<Expr> result = evaluate_vector_select(op);
547 vector<string> ids(lanes);
548 for (int i = 0; i < lanes; i++) {
549 ids[i] = print_expr(result[i]);
550 }
551 id_value = unique_name('_');
552 stream << get_indent() << print_type(op->type) << " " << id_value << " = "
553 << print_type(op->type) << "(";
554 for (int i = 0; i < lanes; i++) {
555 stream << ids[i] << ((i < lanes - 1) ? ", " : ");\n");
556 }
557 }
558
559 id = id_value;
560 }
561
get_vector_suffix(const Expr & e)562 string CodeGen_GLSL::get_vector_suffix(const Expr &e) {
563 vector<Expr> matches;
564 Expr w = Variable::make(Int(32), "*");
565
566 // The vectorize pass will insert a ramp in the color dimension argument.
567 const Ramp *r = e.as<Ramp>();
568 if (r && is_zero(r->base) && is_one(r->stride) && r->lanes == 4) {
569 // No suffix is needed when accessing a full RGBA vector.
570 return "";
571 } else if (r && is_zero(r->base) && is_one(r->stride) && r->lanes == 3) {
572 return ".rgb";
573 } else if (r && is_zero(r->base) && is_one(r->stride) && r->lanes == 2) {
574 return ".rg";
575 } else {
576 // GLSL 1.0 Section 5.5 supports subscript based vector indexing
577 internal_assert(e.type().is_scalar());
578 string id = print_expr(e);
579 if (e.type() != Int(32)) {
580 id = "int(" + id + ")";
581 }
582 return string("[" + id + "]");
583 }
584 }
585
print_lanes(const Expr & e)586 vector<string> CodeGen_GLSL::print_lanes(const Expr &e) {
587 int l = e.type().lanes();
588 internal_assert(e.type().is_vector());
589 vector<string> result(l);
590 if (const Broadcast *b = e.as<Broadcast>()) {
591 string val = print_expr(b->value);
592 for (int i = 0; i < l; i++) {
593 result[i] = val;
594 }
595 } else if (const Ramp *r = e.as<Ramp>()) {
596 for (int i = 0; i < l; i++) {
597 result[i] = print_expr(simplify(r->base + i * r->stride));
598 }
599 } else {
600 string val = print_expr(e);
601 for (int i = 0; i < l; i++) {
602 result[i] = val + "[" + std::to_string(i) + "]";
603 }
604 }
605 return result;
606 }
607
visit(const Load * op)608 void CodeGen_GLSL::visit(const Load *op) {
609 user_assert(is_one(op->predicate)) << "GLSL: predicated load is not supported.\n";
610 if (scalar_vars.contains(op->name)) {
611 internal_assert(is_zero(op->index));
612 id = print_name(op->name);
613 } else if (vector_vars.contains(op->name)) {
614 id = print_name(op->name) + get_vector_suffix(op->index);
615 } else if (op->type.is_scalar()) {
616 string idx = print_expr(op->index);
617 print_assignment(op->type, print_name(op->name) + "[" + idx + "]");
618 } else {
619 vector<string> indices = print_lanes(op->index);
620 ostringstream rhs;
621 rhs << print_type(op->type) << "(";
622 for (int i = 0; i < op->type.lanes(); i++) {
623 if (i > 0) {
624 rhs << ", ";
625 }
626 rhs << print_name(op->name) << "[" + indices[i] + "]";
627 }
628 rhs << ")";
629 print_assignment(op->type, rhs.str());
630 }
631 }
632
visit(const Store * op)633 void CodeGen_GLSL::visit(const Store *op) {
634 user_assert(is_one(op->predicate)) << "GLSL: predicated store is not supported.\n";
635 if (scalar_vars.contains(op->name)) {
636 internal_assert(is_zero(op->index));
637 string val = print_expr(op->value);
638 stream << get_indent() << print_name(op->name) << " = " << val << ";\n";
639 } else if (vector_vars.contains(op->name)) {
640 string val = print_expr(op->value);
641 stream << get_indent() << print_name(op->name) << get_vector_suffix(op->index)
642 << " = " << val << ";\n";
643 } else if (op->value.type().is_scalar()) {
644 string val = print_expr(op->value);
645 string idx = print_expr(op->index);
646 stream << get_indent() << print_name(op->name) << "[" << idx << "] = " << val << ";\n";
647 } else {
648 vector<string> indices = print_lanes(op->index);
649 vector<string> values = print_lanes(op->value);
650 for (int i = 0; i < op->value.type().lanes(); i++) {
651 stream << get_indent() << print_name(op->name)
652 << "[" << indices[i] << "] = "
653 << values[i] << ";\n";
654 }
655 }
656 }
657
visit(const Evaluate * op)658 void CodeGen_GLSL::visit(const Evaluate *op) {
659 print_expr(op->value);
660 }
661
visit(const Call * op)662 void CodeGen_GLSL::visit(const Call *op) {
663 ostringstream rhs;
664 if (op->is_intrinsic(Call::glsl_texture_load)) {
665 // This intrinsic takes five arguments
666 // glsl_texture_load(<tex name>, <buffer>, <x>, <y>, <c>)
667 internal_assert(op->args.size() == 5);
668
669 // The argument to the call is either a StringImm or a broadcasted
670 // StringImm if this is part of a vectorized expression
671 internal_assert(op->args[0].as<StringImm>() ||
672 (op->args[0].as<Broadcast>() && op->args[0].as<Broadcast>()->value.as<StringImm>()));
673
674 const StringImm *string_imm = op->args[0].as<StringImm>();
675 if (!string_imm) {
676 string_imm = op->args[0].as<Broadcast>()->value.as<StringImm>();
677 }
678
679 // Determine the halide buffer associated with this load
680 string buffername = string_imm->value;
681
682 internal_assert((op->type.code() == Type::UInt || op->type.code() == Type::Float) &&
683 (op->type.lanes() >= 1 && op->type.lanes() <= 4));
684
685 if (op->type.is_vector()) {
686 // The channel argument must be a ramp or a broadcast of a constant.
687 Expr c = op->args[4];
688 internal_assert(is_const(c));
689
690 const Ramp *rc = c.as<Ramp>();
691 const Broadcast *bx = op->args[2].as<Broadcast>();
692 const Broadcast *by = op->args[3].as<Broadcast>();
693 if (rc && is_zero(rc->base) && is_one(rc->stride) && bx && by) {
694 // If the x and y coordinates are broadcasts, and the c
695 // coordinate is a dense ramp, we can do a single
696 // texture2D call.
697 rhs << "texture2D(" << print_name(buffername) << ", vec2("
698 << print_expr(bx->value) << ", "
699 << print_expr(by->value) << "))";
700
701 // texture2D always returns a vec4. Swizzle out the lanes we want.
702 switch (op->type.lanes()) {
703 case 1:
704 rhs << ".r";
705 break;
706 case 2:
707 rhs << ".rg";
708 break;
709 case 3:
710 rhs << ".rgb";
711 break;
712 default:
713 break;
714 }
715 } else {
716 // Otherwise do one load per lane and make a vector
717 vector<string> xs = print_lanes(op->args[2]);
718 vector<string> ys = print_lanes(op->args[3]);
719 vector<string> cs = print_lanes(op->args[4]);
720 string name = print_name(buffername);
721
722 string x = print_expr(op->args[2]), y = print_expr(op->args[3]);
723 rhs << print_type(op->type) << "(";
724 for (int i = 0; i < op->type.lanes(); i++) {
725 if (i > 0) {
726 rhs << ", ";
727 }
728 rhs << "texture2D(" << name << ", vec2("
729 << xs[i] << ", " << ys[i] << "))[" << cs[i] << "]";
730 }
731 rhs << ")";
732 }
733 } else if (const int64_t *ic = as_const_int(op->args[4])) {
734 internal_assert(*ic >= 0 && *ic < 4);
735 rhs << "texture2D(" << print_name(buffername) << ", vec2("
736 << print_expr(op->args[2]) << ", "
737 << print_expr(op->args[3]) << "))."
738 << get_lane_suffix(*ic);
739 } else {
740 rhs << "texture2D(" << print_name(buffername) << ", vec2("
741 << print_expr(op->args[2]) << ", "
742 << print_expr(op->args[3]) << "))["
743 << print_expr(op->args[4]) << "]";
744 }
745
746 if (op->type.is_uint()) {
747 rhs << " * " << print_expr(cast<float>(op->type.max()));
748 }
749
750 } else if (op->is_intrinsic(Call::glsl_texture_store)) {
751 internal_assert(op->args.size() == 6);
752 string sval = print_expr(op->args[5]);
753 string suffix = get_vector_suffix(op->args[4]);
754 stream << get_indent() << "gl_FragColor" << suffix
755 << " = " << sval;
756 if (op->args[5].type().is_uint()) {
757 stream << " / " << print_expr(cast<float>(op->args[5].type().max()));
758 }
759 stream << ";\n";
760 // glsl_texture_store is called only for its side effect; there is
761 // no return value.
762 id = "";
763 return;
764 } else if (op->is_intrinsic(Call::glsl_varying)) {
765 // Varying attributes should be substituted out by this point in
766 // codegen.
767 debug(2) << "Found skipped varying attribute: " << op->args[0] << "\n";
768
769 // Output the tagged expression.
770 print_expr(op->args[1]);
771 return;
772 } else {
773 CodeGen_GLSLBase::visit(op);
774 return;
775 }
776 print_assignment(op->type, rhs.str());
777 }
778
779 namespace {
780 class AllAccessConstant : public IRVisitor {
781 using IRVisitor::visit;
782
visit(const Load * op)783 void visit(const Load *op) override {
784 if (op->name == buf && !is_const(op->index)) {
785 result = false;
786 }
787 IRVisitor::visit(op);
788 }
789
visit(const Store * op)790 void visit(const Store *op) override {
791 if (op->name == buf && !is_const(op->index)) {
792 result = false;
793 }
794 IRVisitor::visit(op);
795 }
796
797 public:
798 bool result = true;
799 string buf;
800 };
801 } // namespace
802
visit(const Allocate * op)803 void CodeGen_GLSL::visit(const Allocate *op) {
804 int32_t size = op->constant_allocation_size();
805 user_assert(size) << "Allocations inside GLSL kernels must be constant-sized\n";
806
807 // Check if all access to the allocation uses a constant index
808 AllAccessConstant all_access_constant;
809 all_access_constant.buf = op->name;
810 op->body.accept(&all_access_constant);
811
812 stream << get_indent();
813 if (size == 1) {
814 // We can use a variable
815 stream << print_type(op->type) << " " << print_name(op->name) << ";\n";
816 ScopedBinding<int> p(scalar_vars, op->name, 0);
817 op->body.accept(this);
818 } else if (size <= 4 && all_access_constant.result) {
819 // We can just use a vector variable
820 stream << print_type(op->type.with_lanes(size)) << " " << print_name(op->name) << ";\n";
821 ScopedBinding<int> p(vector_vars, op->name, 0);
822 op->body.accept(this);
823 } else {
824 stream << print_type(op->type) << " " << print_name(op->name) << "[" << size << "];\n";
825 op->body.accept(this);
826 }
827 }
828
visit(const Free * op)829 void CodeGen_GLSL::visit(const Free *op) {
830 }
831
visit(const AssertStmt *)832 void CodeGen_GLSL::visit(const AssertStmt *) {
833 internal_error << "GLSL: unexpected Assertion node encountered.\n";
834 }
835
visit(const Ramp * op)836 void CodeGen_GLSL::visit(const Ramp *op) {
837 ostringstream rhs;
838 rhs << print_type(op->type) << "(";
839
840 if (op->lanes > 4)
841 internal_error << "GLSL: ramp lanes " << op->lanes << " is not supported\n";
842
843 rhs << print_expr(op->base);
844
845 for (int i = 1; i < op->lanes; ++i) {
846 rhs << ", " << print_expr(Add::make(op->base, Mul::make(i, op->stride)));
847 }
848
849 rhs << ")";
850 print_assignment(op->type, rhs.str());
851 }
852
visit(const Broadcast * op)853 void CodeGen_GLSL::visit(const Broadcast *op) {
854 ostringstream rhs;
855 rhs << print_type(op->type) << "(" << print_expr(op->value) << ")";
856 print_assignment(op->type, rhs.str());
857 }
858
visit(const Atomic * op)859 void CodeGen_GLSL::visit(const Atomic *op) {
860 // Floating point atomics can be tricky as there are no floating point atomics
861 // operations, and GLSL does not allow converting a floating point buffer to an
862 // integer buffer.
863 // Plus, OpenGL supports atomics starting from 4.3, but Halide doesn't distinguish
864 // between OpenGL versions yet.
865 user_assert(false) << "GLSL: atomics are not supported.\n";
866 }
867
add_kernel(const Stmt & stmt,const string & name,const vector<DeviceArgument> & args)868 void CodeGen_GLSL::add_kernel(const Stmt &stmt, const string &name,
869 const vector<DeviceArgument> &args) {
870
871 // This function produces fragment shader source for the halide statement.
872 // The corresponding vertex shader will be generated by the halide opengl
873 // runtime based on the arguments passed in comments below. Host codegen
874 // outputs expressions that are evaluated at runtime to produce vertex data
875 // and varying attribute values at the vertices.
876
877 // Emit special header that declares the kernel name and its arguments.
878 // There is currently no standard way of passing information from the code
879 // generator to the runtime, and the information Halide passes to the
880 // runtime are fairly limited. We use these special comments to know the
881 // data types of arguments and whether textures are used for input or
882 // output.
883
884 // Keep track of the number of uniform and varying attributes
885 int num_uniform_floats = 0;
886 int num_uniform_ints = 0;
887
888 // The spatial x and y coordinates are always passed in the first two
889 // varying float attribute slots
890 int num_varying_floats = 2;
891
892 ostringstream header;
893 header << "/// KERNEL " << name << "\n";
894 for (size_t i = 0; i < args.size(); i++) {
895 if (args[i].is_buffer) {
896 Type t = args[i].type.element_of();
897
898 user_assert(args[i].read != args[i].write) << "GLSL: buffers may only be read OR written inside a kernel loop.\n";
899 string type_name;
900 if (t == UInt(8)) {
901 type_name = "uint8_t";
902 } else if (t == UInt(16)) {
903 type_name = "uint16_t";
904 } else if (t == Float(32)) {
905 type_name = "float";
906 } else {
907 user_error << "GLSL: buffer " << args[i].name << " has invalid type " << t << ".\n";
908 }
909 header << "/// " << (args[i].read ? "IN_BUFFER " : "OUT_BUFFER ")
910 << type_name << " " << print_name(args[i].name) << "\n";
911 } else if (ends_with(args[i].name, ".varying")) {
912 header << "/// VARYING "
913 // GLSL requires that varying attributes are float. Integer
914 // expressions for vertex attributes are cast to float during
915 // host codegen
916 << "float " << print_name(args[i].name) << " varyingf" << args[i].packed_index / 4 << "[" << args[i].packed_index % 4 << "]\n";
917 ++num_varying_floats;
918 } else if (args[i].type.is_float()) {
919 header << "/// UNIFORM "
920 << CodeGen_GLSLBase::print_type(args[i].type) << " "
921 << print_name(args[i].name) << " uniformf" << args[i].packed_index / 4 << "[" << args[i].packed_index % 4 << "]\n";
922 ++num_uniform_floats;
923 } else if (args[i].type.is_int()) {
924 header << "/// UNIFORM "
925 << CodeGen_GLSLBase::print_type(args[i].type) << " "
926 << print_name(args[i].name) << " uniformi" << args[i].packed_index / 4 << "[" << args[i].packed_index % 4 << "]\n";
927 ++num_uniform_ints;
928 }
929 }
930
931 // Compute the number of vec4's needed to pack the arguments
932 num_varying_floats = (num_varying_floats + 3) / 4;
933 num_uniform_floats = (num_uniform_floats + 3) / 4;
934 num_uniform_ints = (num_uniform_ints + 3) / 4;
935
936 stream << header.str();
937
938 // Specify default float precision when compiling for OpenGL ES.
939 // TODO: emit correct #version
940 if (is_opengl_es(target)) {
941 stream << "#ifdef GL_FRAGMENT_PRECISION_HIGH\n"
942 << "precision highp float;\n"
943 << "#endif\n";
944 }
945
946 // Declare input textures and variables
947 for (size_t i = 0; i < args.size(); i++) {
948 if (args[i].is_buffer && args[i].read) {
949 stream << "uniform sampler2D " << print_name(args[i].name) << ";\n";
950 }
951 }
952
953 for (int i = 0; i != num_varying_floats; ++i) {
954 stream << "varying vec4 _varyingf" << i << ";\n";
955 }
956
957 for (int i = 0; i != num_uniform_floats; ++i) {
958 stream << "uniform vec4 _uniformf" << i << ";\n";
959 }
960
961 for (int i = 0; i != num_uniform_ints; ++i) {
962 stream << "uniform ivec4 _uniformi" << i << ";\n";
963 }
964
965 // Output additional builtin functions.
966 stream << "float _trunc_f32(float x) {\n"
967 " return floor(abs(x)) * sign(x);\n"
968 "}\n";
969
970 stream << "void main() {\n";
971 indent += 2;
972
973 // Unpack the uniform and varying parameters
974 for (size_t i = 0; i < args.size(); i++) {
975 if (args[i].is_buffer) {
976 continue;
977 } else if (ends_with(args[i].name, ".varying")) {
978 stream << get_indent() << "float " << print_name(args[i].name)
979 << " = _varyingf" << args[i].packed_index / 4
980 << "[" << args[i].packed_index % 4 << "];\n";
981 } else if (args[i].type.is_float()) {
982 stream << get_indent() << print_type(args[i].type) << " "
983 << print_name(args[i].name)
984 << " = _uniformf" << args[i].packed_index / 4
985 << "[" << args[i].packed_index % 4 << "];\n";
986 } else if (args[i].type.is_int()) {
987 stream << get_indent() << print_type(args[i].type) << " "
988 << print_name(args[i].name)
989 << " = _uniformi" << args[i].packed_index / 4
990 << "[" << args[i].packed_index % 4 << "];\n";
991 }
992 }
993
994 print(stmt);
995 indent -= 2;
996 stream << "}\n";
997 }
998
999 namespace {
// Replace all temporary variable names like _1234 with '$'. This is done to
// make the individual tests below self-contained, i.e. independent of the
// exact ids handed out for temporaries.
//
// Every '_' in `s` becomes '$' and the run of decimal digits directly after
// it (possibly empty) is dropped; every other character is copied unchanged.
// A named variable such as _x therefore comes out as $x.
string normalize_temporaries(const string &s) {
    string result;
    result.reserve(s.size());  // the output is never longer than the input
    size_t i = 0;
    while (i < s.size()) {
        if (s[i] != '_') {
            result += s[i++];
            continue;
        }
        result += '$';
        ++i;
        // isdigit() has undefined behavior for negative arguments other than
        // EOF, which is what a non-ASCII byte is when plain char is signed,
        // so convert through unsigned char first.
        while (i < s.size() && isdigit(static_cast<unsigned char>(s[i]))) {
            ++i;
        }
    }
    return result;
}
1015
check(Expr e,const string & result)1016 void check(Expr e, const string &result) {
1017 ostringstream source;
1018 CodeGen_GLSL cg(source, Target());
1019 if (e.as<FloatImm>() || e.as<IntImm>()) {
1020 // Hack: CodeGen_C doesn't treat immediates like other expressions, so
1021 // wrap them to obtain useful output.
1022 e = Halide::print(e);
1023 }
1024 Evaluate::make(e).accept(&cg);
1025 string src = normalize_temporaries(source.str());
1026 if (!ends_with(src, result)) {
1027 internal_error
1028 << "Codegen failed for " << e << "\n"
1029 << " Correct source code:\n"
1030 << result
1031 << " Actual source code:\n"
1032 << src;
1033 }
1034 }
1035
1036 } // namespace
1037
test()1038 void CodeGen_GLSL::test() {
1039 vector<Expr> e;
1040
1041 // Check that float constants are printed correctly.
1042 check(1.0f, "float $ = 1.0;\n");
1043 check(1.0f + std::numeric_limits<float>::epsilon(), "float $ = 1.00000012;\n");
1044 check(1.19209290e-07f, "float $ = 1.1920929e-07;\n");
1045 check(8388608.f, "float $ = 8388608.0;\n");
1046 check(-2.1e19f, "float $ = -20999999189405401088.0;\n");
1047 check(3.1415926536f, "float $ = 3.14159274;\n");
1048
1049 // Uint8 is embedded in GLSL floats, so no cast necessary
1050 check(cast<float>(Variable::make(UInt(8), "x") * 1.0f),
1051 "float $ = $x * 1.0;\n");
1052 // But truncation is necessary for the reverse direction
1053 check(cast<uint8_t>(Variable::make(Float(32), "x")),
1054 "float $ = floor($x);\n");
1055
1056 check(Min::make(Expr(1), Expr(5)),
1057 "float $ = min(1.0, 5.0);\n"
1058 "int $ = int($);\n");
1059
1060 check(Max::make(Expr(1), Expr(5)),
1061 "float $ = max(1.0, 5.0);\n"
1062 "int $ = int($);\n");
1063
1064 check(Max::make(Broadcast::make(1, 4), Broadcast::make(5, 4)),
1065 "vec4 $ = vec4(1.0);\n"
1066 "vec4 $ = vec4(5.0);\n"
1067 "vec4 $ = max($, $);\n"
1068 "ivec4 $ = ivec4($);\n");
1069
1070 check(Variable::make(Int(32), "x") / Expr(3),
1071 "float $ = float($x);\n"
1072 "float $ = $ * 0.333333343;\n"
1073 "float $ = floor($);\n"
1074 "int $ = int($);\n");
1075 check(Variable::make(Int(32, 4), "x") / Variable::make(Int(32, 4), "y"),
1076 "vec4 $ = vec4($x);\n"
1077 "vec4 $ = vec4($y);\n"
1078 "vec4 $ = $ / $;\n"
1079 "vec4 $ = floor($);\n"
1080 "ivec4 $ = ivec4($);\n");
1081 check(Variable::make(Float(32, 4), "x") / Variable::make(Float(32, 4), "y"),
1082 "vec4 $ = $x / $y;\n");
1083
1084 // Integer lerp with integer weight
1085 check(lerp(cast<uint8_t>(0), cast<uint8_t>(255), cast<uint8_t>(127)),
1086 "float $ = mix(0.0, 255.0, 0.498039216);\n"
1087 "float $ = $ + 0.5;\n"
1088 "float $ = floor($);\n");
1089
1090 // Integer lerp with float weight
1091 check(lerp(cast<uint8_t>(0), cast<uint8_t>(255), 0.3f),
1092 "float $ = mix(0.0, 255.0, 0.298039228);\n"
1093 "float $ = $ + 0.5;\n"
1094 "float $ = floor($);\n");
1095
1096 // Floating point lerp
1097 check(lerp(0.0f, 1.0f, 0.3f),
1098 "float $ = mix(0.0, 1.0, 0.300000012);\n");
1099
1100 // Vectorized lerp
1101 check(lerp(Variable::make(Float(32, 4), "x"), Variable::make(Float(32, 4), "y"), Broadcast::make(0.25f, 4)),
1102 "vec4 $ = vec4(0.25);\n"
1103 "vec4 $ = mix($x, $y, $);\n");
1104
1105 // Sin with scalar arg
1106 check(sin(3.0f), "float $ = sin(3.0);\n");
1107
1108 // Sin with vector arg
1109 check(Call::make(Float(32, 4), "sin_f32", {Broadcast::make(1.f, 4)}, Internal::Call::Extern),
1110 "vec4 $ = vec4(1.0);\n"
1111 "vec4 $ = sin($);\n");
1112
1113 // use float version of abs in GLSL
1114 check(abs(-2),
1115 "float $ = abs(-2.0);\n"
1116 "int $ = int($);\n");
1117
1118 check(Halide::print(3.0f), "float $ = 3.0;\n");
1119
1120 // Test rounding behavior of integer division.
1121 check(Variable::make(Int(32), "x") / Variable::make(Int(32), "y"),
1122 "float $ = float($x);\n"
1123 "float $ = float($y);\n"
1124 "float $ = $ / $;\n"
1125 "float $ = floor($);\n"
1126 "int $ = int($);\n");
1127
1128 // Select with scalar condition
1129 check(Select::make(EQ::make(Variable::make(Float(32), "x"), 1.0f),
1130 Broadcast::make(1.f, 4),
1131 Broadcast::make(2.f, 4)),
1132 "vec4 $;\n"
1133 "bool $ = $x == 1.0;\n"
1134 "if ($) {\n"
1135 " vec4 $ = vec4(1.0);\n"
1136 " $ = $;\n"
1137 "}\n"
1138 "else {\n"
1139 " vec4 $ = vec4(2.0);\n"
1140 " $ = $;\n"
1141 "}\n");
1142
1143 // Select with vector condition
1144 check(Select::make(EQ::make(Ramp::make(-1, 1, 4), Broadcast::make(0, 4)),
1145 Broadcast::make(1.f, 4),
1146 Broadcast::make(2.f, 4)),
1147 "vec4 $ = vec4(2.0, 1.0, 2.0, 2.0);\n");
1148
1149 // Test codegen for texture loads
1150 Expr load4 = Call::make(Float(32, 4), Call::glsl_texture_load,
1151 {string("buf"),
1152 0,
1153 Broadcast::make(0, 4),
1154 Broadcast::make(0, 4),
1155 Ramp::make(0, 1, 4)},
1156 Call::Intrinsic);
1157 check(load4, "vec4 $ = texture2D($buf, vec2(0, 0));\n");
1158
1159 check(log(1.0f), "float $ = log(1.0);\n");
1160 check(exp(1.0f), "float $ = exp(1.0);\n");
1161
1162 // Integer powers are expanded
1163 check(pow(1.4f, 2), "float $ = 1.39999998 * 1.39999998;\n");
1164 check(pow(1.0f, 2.1f), "float $ = pow(1.0, 2.0999999);\n");
1165
1166 std::cout << "CodeGen_GLSL test passed\n";
1167 }
1168
1169 } // namespace Internal
1170 } // namespace Halide
1171