1 #include "CodeGen_OpenGL_Dev.h"
2 #include "CSE.h"
3 #include "Debug.h"
4 #include "Deinterleave.h"
5 #include "IRMatch.h"
6 #include "IRMutator.h"
7 #include "IROperator.h"
8 #include "Simplify.h"
9 #include "VaryingAttributes.h"
10 #include <iomanip>
11 #include <limits>
12 #include <map>
13 
14 namespace Halide {
15 namespace Internal {
16 
17 using std::ostringstream;
18 using std::string;
19 using std::vector;
20 
21 namespace {
22 
is_opengl_es(const Target & target)23 bool is_opengl_es(const Target &target) {
24     // TODO: we need a better way to switch between the different OpenGL
25     // versions (desktop GL, GLES2, GLES3, ...), probably by making it part of
26     // Target.
27     return (target.os == Target::Android ||
28             target.os == Target::IOS);
29 }
30 
get_lane_suffix(int i)31 char get_lane_suffix(int i) {
32     internal_assert(i >= 0 && i < 4);
33     return "rgba"[i];
34 }
35 
36 }  // namespace
37 
// Stores `target` and creates the GLSL code generator (`glc`), which is
// constructed over `src_stream` and the same target.
CodeGen_OpenGL_Dev::CodeGen_OpenGL_Dev(const Target &target)
    : target(target) {
    debug(1) << "Creating GLSL codegen\n";
    // Raw owning pointer; the destructor deletes it.
    glc = new CodeGen_GLSL(src_stream, target);
}
43 
// Releases the GLSL code generator allocated by the constructor.
CodeGen_OpenGL_Dev::~CodeGen_OpenGL_Dev() {
    delete glc;
}
47 
// Records `name` as the current kernel name and forwards the kernel body
// `s`, its name and its arguments to the GLSL code generator.
void CodeGen_OpenGL_Dev::add_kernel(Stmt s, const string &name,
                                    const vector<DeviceArgument> &args) {
    // The name is recorded before code generation runs.
    cur_kernel_name = name;
    glc->add_kernel(s, name, args);
}
53 
init_module()54 void CodeGen_OpenGL_Dev::init_module() {
55     src_stream.str("");
56     src_stream.clear();
57     glc->add_common_macros(src_stream);
58     cur_kernel_name = "";
59 }
60 
compile_to_src()61 vector<char> CodeGen_OpenGL_Dev::compile_to_src() {
62     string str = src_stream.str();
63     debug(1) << "GLSL source:\n"
64              << str << "\n";
65     vector<char> buffer(str.begin(), str.end());
66     buffer.push_back(0);
67     return buffer;
68 }
69 
// Returns the name recorded by the most recent add_kernel() call, or an
// empty string if init_module() has reset it since.
string CodeGen_OpenGL_Dev::get_current_kernel_name() {
    return cur_kernel_name;
}
73 
// Writes the GLSL source accumulated so far to std::cerr, followed by a
// newline.
void CodeGen_OpenGL_Dev::dump() {
    std::cerr << src_stream.str() << "\n";
}
77 
// Returns `name` as mangled by the GLSL code generator's print_name().
string CodeGen_OpenGL_Dev::print_gpu_name(const string &name) {
    return glc->print_name(name);
}
81 
82 //
83 // CodeGen_GLSLBase
84 //
CodeGen_GLSLBase(std::ostream & s,Target target)85 CodeGen_GLSLBase::CodeGen_GLSLBase(std::ostream &s, Target target)
86     : CodeGen_C(s, target) {
87     builtin["sin_f32"] = "sin";
88     builtin["sqrt_f32"] = "sqrt";
89     builtin["cos_f32"] = "cos";
90     builtin["exp_f32"] = "exp";
91     builtin["log_f32"] = "log";
92     builtin["abs_f32"] = "abs";
93     builtin["floor_f32"] = "floor";
94     builtin["ceil_f32"] = "ceil";
95     builtin["asin_f32"] = "asin";
96     builtin["acos_f32"] = "acos";
97     builtin["tan_f32"] = "tan";
98     builtin["atan_f32"] = "atan";
99     builtin["atan2_f32"] = "atan";  // also called atan in GLSL
100     builtin["sinh_f32"] = "sinh";
101     builtin["cosh_f32"] = "cosh";
102     builtin["tanh_f32"] = "tanh";
103     builtin["asinh_f32"] = "asinh";
104     builtin["acosh_f32"] = "acosh";
105     builtin["atanh_f32"] = "atanh";
106     builtin["min"] = "min";
107     builtin["max"] = "max";
108     builtin["mix"] = "mix";
109     builtin["mod"] = "mod";
110     builtin["abs"] = "abs";
111     builtin["isnan"] = "isnan";
112     builtin["round_f32"] = "roundEven";
113     builtin["fast_inverse_sqrt_f32"] = "inversesqrt";
114 
115     // functions that produce bvecs
116     builtin["equal"] = "equal";
117     builtin["notEqual"] = "notEqual";
118     builtin["lessThan"] = "lessThan";
119     builtin["lessThanEqual"] = "lessThanEqual";
120     builtin["greaterThan"] = "greaterThan";
121     builtin["greaterThanEqual"] = "greaterThanEqual";
122 }
123 
124 // Maps Halide types to appropriate GLSL types or emit error if no equivalent
125 // type is available.
map_type(const Type & type)126 Type CodeGen_GLSLBase::map_type(const Type &type) {
127     Type result = type;
128     if (type.is_scalar()) {
129         if (type.is_float()) {
130             user_assert(type.bits() <= 32)
131                 << "GLSL: Can't represent a float with " << type.bits() << " bits.\n";
132             result = Float(32);
133         } else if (type.is_bool()) {
134             // unchanged
135         } else if (type.is_int() && type.bits() <= 32) {
136             result = Int(32);
137         } else if (type.is_uint() && type.bits() <= 32) {
138             result = UInt(32);
139         } else {
140             user_error << "GLSL: Can't represent type '" << type << "'.\n";
141         }
142     } else {
143         user_assert(type.lanes() <= 4)
144             << "GLSL: vector types wider than 4 aren't supported\n";
145         user_assert(type.is_bool() || type.is_int() || type.is_uint() || type.is_float())
146             << "GLSL: Can't represent vector type '" << type << "'.\n";
147         Type scalar_type = type.element_of();
148         result = map_type(scalar_type).with_lanes(type.lanes());
149     }
150     return result;
151 }
152 
visit(const FloatImm * op)153 void CodeGen_GLSLBase::visit(const FloatImm *op) {
154     ostringstream oss;
155     // Print integral numbers with trailing ".0". For fractional numbers use a
156     // precision of 9 digits, which should be enough to recover the binary
157     // float unambiguously from the decimal representation (if iostreams
158     // implements correct rounding).
159     const float truncated = (op->value < 0 ? std::ceil(op->value) : std::floor(op->value));
160     if (truncated == op->value) {
161         oss << std::fixed << std::setprecision(1) << op->value;
162     } else {
163         oss << std::setprecision(9) << op->value;
164     }
165     id = oss.str();
166 }
167 
visit(const IntImm * op)168 void CodeGen_GLSLBase::visit(const IntImm *op) {
169     id = print_type(op->type) + "(" + std::to_string(op->value) + ")";
170 }
171 
visit(const UIntImm * op)172 void CodeGen_GLSLBase::visit(const UIntImm *op) {
173     if (op->type == Bool()) {
174         if (op->value == 1) {
175             id = "true";
176         } else {
177             id = "false";
178         }
179     } else {
180         id = std::to_string(op->value) + "u";
181     }
182 }
183 
visit(const Max * op)184 void CodeGen_GLSLBase::visit(const Max *op) {
185     print_expr(Call::make(op->type, "max", {op->a, op->b}, Call::PureExtern));
186 }
187 
visit(const Min * op)188 void CodeGen_GLSLBase::visit(const Min *op) {
189     print_expr(Call::make(op->type, "min", {op->a, op->b}, Call::PureExtern));
190 }
191 
visit(const Mod * op)192 void CodeGen_GLSLBase::visit(const Mod *op) {
193     if (op->type.is_int() || op->type.is_uint()) {
194         // Just exploit the Euclidean identity
195         // FIXME: Why doesn't lower_euclidean_mod work for glsl?
196         // https://github.com/halide/Halide/issues/4979
197         Expr zero = make_zero(op->type);
198         Expr equiv = select(op->a == zero, zero,
199                             op->a - (op->a / op->b) * op->b);
200         equiv = common_subexpression_elimination(equiv);
201         print_expr(equiv);
202     } else {
203         print_expr(Call::make(op->type, "mod", {op->a, op->b}, Call::Extern));
204     }
205 }
206 
// Prints a call expression. A set of intrinsics and named functions
// (lerp, absd, return_second, fast_inverse_f32, fast_inverse_sqrt_f32,
// pow_f32, shifts, bitwise ops, round-to-zero div/mod) are lowered
// explicitly; anything else must be present in the `builtin` table and is
// printed as a plain function call, otherwise a user error is raised.
void CodeGen_GLSLBase::visit(const Call *op) {
    if (op->is_intrinsic(Call::lerp)) {
        // Implement lerp using GLSL's mix() function, which always uses
        // floating point arithmetic.
        Expr zero_val = op->args[0];
        Expr one_val = op->args[1];
        Expr weight = op->args[2];

        internal_assert(weight.type().is_uint() || weight.type().is_float());
        if (weight.type().is_uint()) {
            // Normalize integer weights to [0.0f, 1.0f] range.
            internal_assert(weight.type().bits() < 32);
            weight = Div::make(Cast::make(Float(32), weight),
                               Cast::make(Float(32), weight.type().max()));
        } else if (op->type.is_uint()) {
            // Round float weights down to next multiple of (1/op->type.imax())
            // to give same results as lerp based on integer arithmetic.
            internal_assert(op->type.bits() < 32);
            weight = floor(weight * op->type.max()) / op->type.max();
        }

        // mix() is evaluated in float, with as many lanes as the result.
        Type result_type = Float(32, op->type.lanes());
        Expr e = Call::make(result_type, "mix", {zero_val, one_val, weight}, Call::Extern);

        if (!op->type.is_float()) {
            // Mirror rounding implementation of Halide's integer lerp.
            e = Cast::make(op->type, floor(e + 0.5f));
        }
        print_expr(e);
        return;
    } else if (op->is_intrinsic(Call::absd)) {
        // Absolute difference: subtract the smaller operand from the larger
        // one and cast to the result type.
        internal_assert(op->args.size() == 2);
        Expr a = op->args[0];
        Expr b = op->args[1];
        Expr e = cast(op->type, select(a < b, b - a, a - b));
        print_expr(e);
        return;
    } else if (op->is_intrinsic(Call::return_second)) {
        internal_assert(op->args.size() == 2);
        // Simply discard the first argument, which is generally a call to
        // 'halide_printf'.
        print_expr(op->args[1]);
        return;
    } else if (op->name == "fast_inverse_f32") {
        // Printed as an ordinary division: 1 / x.
        print_expr(make_one(op->type) / op->args[0]);
        return;
    } else if (op->name == "fast_inverse_sqrt_f32") {
        // Printed as 1 / sqrt(x); this branch is taken before the builtin
        // table lookup at the end of the function.
        print_expr(make_one(op->type) / sqrt(op->args[0]));
        return;
    } else if (op->name == "pow_f32") {
        if (can_prove(op->args[0] > 0)) {
            // Base is provably positive: call pow() directly.
            ostringstream rhs;
            rhs << "pow(" << print_expr(op->args[0]) << ", " << print_expr(op->args[1]) << ")";
            print_assignment(op->type, rhs.str());
            return;
        } else {
            // Compute pow(abs(a), b) first, then choose the sign (or zero)
            // with a select over variables that refer to the printed ids.
            ostringstream base;
            string a = print_expr(op->args[0]);
            string b = print_expr(op->args[1]);
            base << "pow(abs(" << a << "), " << b << ")";
            string c = print_assignment(op->type, base.str());
            Expr a_var = Variable::make(op->type, a);
            Expr b_var = Variable::make(op->type, b);
            Expr c_var = Variable::make(op->type, c);
            // OpenGL isn't required to produce NaNs, so we return
            // zero in the undefined case.
            Expr equiv = select(a_var > 0 || b_var % 2 == 0, c_var,
                                b_var % 2 == 1, -c_var,
                                0.0f);
            print_expr(equiv);
            return;
        }
    } else if (op->is_intrinsic(Call::shift_right)) {
        // The following intrinsics are printed as the matching infix or
        // prefix operator applied to the printed operands.
        print_assignment(op->type, print_expr(op->args[0]) + " >> " + print_expr(op->args[1]));
    } else if (op->is_intrinsic(Call::shift_left)) {
        print_assignment(op->type, print_expr(op->args[0]) + " << " + print_expr(op->args[1]));
    } else if (op->is_intrinsic(Call::bitwise_not)) {
        print_assignment(op->type, "~" + print_expr(op->args[0]));
    } else if (op->is_intrinsic(Call::bitwise_and)) {
        print_assignment(op->type, print_expr(op->args[0]) + " & " + print_expr(op->args[1]));
    } else if (op->is_intrinsic(Call::bitwise_or)) {
        print_assignment(op->type, print_expr(op->args[0]) + " | " + print_expr(op->args[1]));
    } else if (op->is_intrinsic(Call::bitwise_xor)) {
        print_assignment(op->type, print_expr(op->args[0]) + " ^ " + print_expr(op->args[1]));
    } else if (op->is_intrinsic(Call::div_round_to_zero)) {
        print_assignment(op->type, print_expr(op->args[0]) + " / " + print_expr(op->args[1]));
    } else if (op->is_intrinsic(Call::mod_round_to_zero)) {
        print_assignment(op->type, print_expr(op->args[0]) + " % " + print_expr(op->args[1]));
    } else {
        // Generic case: the callee must be a known builtin.
        ostringstream rhs;
        if (builtin.count(op->name) == 0) {
            user_error << "GLSL: unknown function '" << op->name << "' encountered.\n";
        }

        // Print "<glsl name>(arg0, arg1, ...)".
        rhs << builtin[op->name] << "(";
        for (size_t i = 0; i < op->args.size(); i++) {
            if (i > 0) rhs << ", ";
            rhs << print_expr(op->args[i]);
        }
        rhs << ")";
        print_assignment(op->type, rhs.str());
    }
}
310 
print_type(Type type,AppendSpaceIfNeeded space_option)311 string CodeGen_GLSLBase::print_type(Type type, AppendSpaceIfNeeded space_option) {
312     ostringstream oss;
313     type = map_type(type);
314     if (type.is_scalar()) {
315         if (type.is_float()) {
316             oss << "float";
317         } else if (type.is_bool()) {
318             oss << "bool";
319         } else if (type.is_int()) {
320             oss << "int";
321         } else if (type.is_uint()) {
322             oss << "uint";
323         } else {
324             internal_error << "GLSL: invalid type '" << type << "' encountered.\n";
325         }
326     } else {
327         if (type.is_float()) {
328             // no prefix for float vectors
329         } else if (type.is_bool()) {
330             oss << "b";
331         } else if (type.is_int()) {
332             oss << "i";
333         } else if (type.is_uint()) {
334             oss << "u";
335         } else {
336             internal_error << "GLSL: invalid type '" << type << "' encountered.\n";
337         }
338         oss << "vec" << type.lanes();
339     }
340 
341     if (space_option == AppendSpace) {
342         oss << " ";
343     }
344 
345     return oss.str();
346 }
347 
348 // The following comparisons are defined for ivec and vec
349 // types, so we don't use call_builtin
visit(const EQ * op)350 void CodeGen_GLSLBase::visit(const EQ *op) {
351     if (op->type.is_vector()) {
352         print_expr(Call::make(op->type, "equal", {op->a, op->b}, Call::Extern));
353     } else {
354         CodeGen_C::visit(op);
355     }
356 }
357 
visit(const NE * op)358 void CodeGen_GLSLBase::visit(const NE *op) {
359     if (op->type.is_vector()) {
360         print_expr(Call::make(op->type, "notEqual", {op->a, op->b}, Call::Extern));
361     } else {
362         CodeGen_C::visit(op);
363     }
364 }
365 
visit(const LT * op)366 void CodeGen_GLSLBase::visit(const LT *op) {
367     if (op->type.is_vector()) {
368         print_expr(Call::make(op->type, "lessThan", {op->a, op->b}, Call::Extern));
369     } else {
370         CodeGen_C::visit(op);
371     }
372 }
373 
visit(const LE * op)374 void CodeGen_GLSLBase::visit(const LE *op) {
375     if (op->type.is_vector()) {
376         print_expr(Call::make(op->type, "lessThanEqual", {op->a, op->b}, Call::Extern));
377     } else {
378         CodeGen_C::visit(op);
379     }
380 }
381 
visit(const GT * op)382 void CodeGen_GLSLBase::visit(const GT *op) {
383     if (op->type.is_vector()) {
384         print_expr(Call::make(op->type, "greaterThan", {op->a, op->b}, Call::Extern));
385     } else {
386         CodeGen_C::visit(op);
387     }
388 }
389 
visit(const GE * op)390 void CodeGen_GLSLBase::visit(const GE *op) {
391     if (op->type.is_vector()) {
392         print_expr(Call::make(op->type, "greaterThanEqual", {op->a, op->b}, Call::Extern));
393     } else {
394         CodeGen_C::visit(op);
395     }
396 }
397 
visit(const Shuffle * op)398 void CodeGen_GLSLBase::visit(const Shuffle *op) {
399     // The halide Shuffle represents the llvm intrinisc
400     // shufflevector, however, for GLSL its use is limited to swizzling
401     // up to a four channel vec type.
402 
403     internal_assert(op->vectors.size() == 1);
404 
405     int shuffle_lanes = op->type.lanes();
406     internal_assert(shuffle_lanes <= 4);
407 
408     string expr = print_expr(op->vectors[0]);
409 
410     // Create a swizzle expression for the shuffle
411     string swizzle;
412     for (int i = 0; i != shuffle_lanes; ++i) {
413         int channel = op->indices[i];
414         internal_assert(channel < 4) << "Shuffle of invalid channel";
415         swizzle += get_lane_suffix(channel);
416     }
417 
418     print_assignment(op->type, expr + "." + swizzle);
419 }
420 
421 // Identifiers containing double underscores '__' are reserved in GLSL, so we
422 // have to use a different name mangling scheme than in the C code generator.
print_name(const string & name)423 string CodeGen_GLSLBase::print_name(const string &name) {
424     const string mangled = CodeGen_C::print_name(name);
425     return replace_all(mangled, "__", "XX");
426 }
427 
visit(const Cast * op)428 void CodeGen_GLSLBase::visit(const Cast *op) {
429     Type value_type = op->value.type();
430     // If both types are represented by the same GLSL type, no explicit cast
431     // is necessary.
432     if (map_type(op->type) == map_type(value_type)) {
433         Expr value = op->value;
434         if (value_type.code() == Type::Float) {
435             // float->int conversions may need explicit truncation if an
436             // integer type is embedded into a float. (Note: overflows are
437             // considered undefined behavior, so we do nothing about values
438             // that are out of range of the target type.)
439             if (op->type.code() == Type::UInt) {
440                 value = simplify(floor(value));
441             } else if (op->type.code() == Type::Int) {
442                 value = simplify(trunc(value));
443             }
444         }
445         // FIXME: Overflow is not UB for most Halide types
446         // https://github.com/halide/Halide/issues/4975
447         value.accept(this);
448     } else {
449         Type target_type = map_type(op->type);
450         print_assignment(target_type, print_type(target_type) + "(" + print_expr(op->value) + ")");
451     }
452 }
453 
454 //
455 // CodeGen_GLSL
456 //
457 
// Constructs the GLSL code generator on top of CodeGen_GLSLBase and adds
// one extra entry to the builtin table: "trunc_f32" is printed as a call
// to "_trunc_f32".
CodeGen_GLSL::CodeGen_GLSL(std::ostream &s, const Target &t)
    : CodeGen_GLSLBase(s, t) {
    builtin["trunc_f32"] = "_trunc_f32";
}
462 
visit(const Let * op)463 void CodeGen_GLSL::visit(const Let *op) {
464 
465     if (op->name.find(".varying") != string::npos) {
466 
467         // Skip let statements for varying attributes
468         op->body.accept(this);
469 
470         return;
471     }
472 
473     CodeGen_C::visit(op);
474 }
475 
visit(const For * loop)476 void CodeGen_GLSL::visit(const For *loop) {
477     user_assert(loop->for_type != ForType::GPULane)
478         << "The GLSL backend does not support the gpu_lanes() scheduling directive.";
479 
480     if (ends_with(loop->name, ".__block_id_x") ||
481         ends_with(loop->name, ".__block_id_y")) {
482         internal_assert(loop->for_type == ForType::GPUBlock)
483             << "kernel loop must be gpu block\n";
484 
485         debug(1) << "Dropping loop " << loop->name << " (" << loop->min << ", " << loop->extent << ")\n";
486 
487         string idx;
488         if (ends_with(loop->name, ".__block_id_x")) {
489             idx = "int(_varyingf0[0])";
490         } else if (ends_with(loop->name, ".__block_id_y")) {
491             idx = "int(_varyingf0[1])";
492         }
493         stream << get_indent() << print_type(Int(32)) << " " << print_name(loop->name) << " = " << idx << ";\n";
494         loop->body.accept(this);
495     } else {
496         user_assert(loop->for_type != ForType::Parallel) << "GLSL: parallel loops aren't allowed inside kernel.\n";
497         CodeGen_C::visit(loop);
498     }
499 }
500 
evaluate_vector_select(const Select * op)501 vector<Expr> evaluate_vector_select(const Select *op) {
502     const int lanes = op->type.lanes();
503     vector<Expr> result(lanes);
504     for (int i = 0; i < lanes; i++) {
505         Expr cond = extract_lane(op->condition, i);
506         Expr true_value = extract_lane(op->true_value, i);
507         Expr false_value = extract_lane(op->false_value, i);
508 
509         if (is_const(cond)) {
510             result[i] = is_one(cond) ? true_value : false_value;
511         } else {
512             result[i] = Select::make(cond, true_value, false_value);
513         }
514     }
515     return result;
516 }
517 
visit(const Select * op)518 void CodeGen_GLSL::visit(const Select *op) {
519     string id_value;
520     if (op->condition.type().is_scalar()) {
521         id_value = unique_name('_');
522         stream << get_indent() << print_type(op->type) << " " << id_value << ";\n";
523         string cond = print_expr(op->condition);
524         stream << get_indent() << "if (" << cond << ") ";
525         open_scope();
526         {
527             string true_val = print_expr(op->true_value);
528             stream << get_indent() << id_value << " = " << true_val << ";\n";
529         }
530         close_scope("");
531 
532         stream << get_indent() << "else ";
533         open_scope();
534         {
535             string false_val = print_expr(op->false_value);
536             stream << get_indent() << id_value << " = " << false_val << ";\n";
537         }
538         close_scope("");
539     } else {
540         // Selects with vector conditions are typically used for constructing
541         // vector types. If the select condition can be evaluated at
542         // compile-time (which is often the case), we can built the vector
543         // directly without lowering to a sequence of "if" statements.
544         internal_assert(op->condition.type().lanes() == op->type.lanes());
545         int lanes = op->type.lanes();
546         vector<Expr> result = evaluate_vector_select(op);
547         vector<string> ids(lanes);
548         for (int i = 0; i < lanes; i++) {
549             ids[i] = print_expr(result[i]);
550         }
551         id_value = unique_name('_');
552         stream << get_indent() << print_type(op->type) << " " << id_value << " = "
553                << print_type(op->type) << "(";
554         for (int i = 0; i < lanes; i++) {
555             stream << ids[i] << ((i < lanes - 1) ? ", " : ");\n");
556         }
557     }
558 
559     id = id_value;
560 }
561 
get_vector_suffix(const Expr & e)562 string CodeGen_GLSL::get_vector_suffix(const Expr &e) {
563     vector<Expr> matches;
564     Expr w = Variable::make(Int(32), "*");
565 
566     // The vectorize pass will insert a ramp in the color dimension argument.
567     const Ramp *r = e.as<Ramp>();
568     if (r && is_zero(r->base) && is_one(r->stride) && r->lanes == 4) {
569         // No suffix is needed when accessing a full RGBA vector.
570         return "";
571     } else if (r && is_zero(r->base) && is_one(r->stride) && r->lanes == 3) {
572         return ".rgb";
573     } else if (r && is_zero(r->base) && is_one(r->stride) && r->lanes == 2) {
574         return ".rg";
575     } else {
576         // GLSL 1.0 Section 5.5 supports subscript based vector indexing
577         internal_assert(e.type().is_scalar());
578         string id = print_expr(e);
579         if (e.type() != Int(32)) {
580             id = "int(" + id + ")";
581         }
582         return string("[" + id + "]");
583     }
584 }
585 
print_lanes(const Expr & e)586 vector<string> CodeGen_GLSL::print_lanes(const Expr &e) {
587     int l = e.type().lanes();
588     internal_assert(e.type().is_vector());
589     vector<string> result(l);
590     if (const Broadcast *b = e.as<Broadcast>()) {
591         string val = print_expr(b->value);
592         for (int i = 0; i < l; i++) {
593             result[i] = val;
594         }
595     } else if (const Ramp *r = e.as<Ramp>()) {
596         for (int i = 0; i < l; i++) {
597             result[i] = print_expr(simplify(r->base + i * r->stride));
598         }
599     } else {
600         string val = print_expr(e);
601         for (int i = 0; i < l; i++) {
602             result[i] = val + "[" + std::to_string(i) + "]";
603         }
604     }
605     return result;
606 }
607 
visit(const Load * op)608 void CodeGen_GLSL::visit(const Load *op) {
609     user_assert(is_one(op->predicate)) << "GLSL: predicated load is not supported.\n";
610     if (scalar_vars.contains(op->name)) {
611         internal_assert(is_zero(op->index));
612         id = print_name(op->name);
613     } else if (vector_vars.contains(op->name)) {
614         id = print_name(op->name) + get_vector_suffix(op->index);
615     } else if (op->type.is_scalar()) {
616         string idx = print_expr(op->index);
617         print_assignment(op->type, print_name(op->name) + "[" + idx + "]");
618     } else {
619         vector<string> indices = print_lanes(op->index);
620         ostringstream rhs;
621         rhs << print_type(op->type) << "(";
622         for (int i = 0; i < op->type.lanes(); i++) {
623             if (i > 0) {
624                 rhs << ", ";
625             }
626             rhs << print_name(op->name) << "[" + indices[i] + "]";
627         }
628         rhs << ")";
629         print_assignment(op->type, rhs.str());
630     }
631 }
632 
visit(const Store * op)633 void CodeGen_GLSL::visit(const Store *op) {
634     user_assert(is_one(op->predicate)) << "GLSL: predicated store is not supported.\n";
635     if (scalar_vars.contains(op->name)) {
636         internal_assert(is_zero(op->index));
637         string val = print_expr(op->value);
638         stream << get_indent() << print_name(op->name) << " = " << val << ";\n";
639     } else if (vector_vars.contains(op->name)) {
640         string val = print_expr(op->value);
641         stream << get_indent() << print_name(op->name) << get_vector_suffix(op->index)
642                << " = " << val << ";\n";
643     } else if (op->value.type().is_scalar()) {
644         string val = print_expr(op->value);
645         string idx = print_expr(op->index);
646         stream << get_indent() << print_name(op->name) << "[" << idx << "] = " << val << ";\n";
647     } else {
648         vector<string> indices = print_lanes(op->index);
649         vector<string> values = print_lanes(op->value);
650         for (int i = 0; i < op->value.type().lanes(); i++) {
651             stream << get_indent() << print_name(op->name)
652                    << "[" << indices[i] << "] = "
653                    << values[i] << ";\n";
654         }
655     }
656 }
657 
// Prints the evaluated expression; the returned identifier is discarded.
void CodeGen_GLSL::visit(const Evaluate *op) {
    print_expr(op->value);
}
661 
// Prints a call expression. Handles the GLSL-specific intrinsics
// glsl_texture_load (texture2D lookups), glsl_texture_store (assignment to
// gl_FragColor) and glsl_varying (prints the tagged expression); every
// other call is forwarded to CodeGen_GLSLBase.
void CodeGen_GLSL::visit(const Call *op) {
    // Right-hand side of the assignment emitted at the end of the function
    // (only the texture load path reaches that point).
    ostringstream rhs;
    if (op->is_intrinsic(Call::glsl_texture_load)) {
        // This intrinsic takes five arguments
        // glsl_texture_load(<tex name>, <buffer>, <x>, <y>, <c>)
        internal_assert(op->args.size() == 5);

        // The argument to the call is either a StringImm or a broadcasted
        // StringImm if this is part of a vectorized expression
        internal_assert(op->args[0].as<StringImm>() ||
                        (op->args[0].as<Broadcast>() && op->args[0].as<Broadcast>()->value.as<StringImm>()));

        const StringImm *string_imm = op->args[0].as<StringImm>();
        if (!string_imm) {
            string_imm = op->args[0].as<Broadcast>()->value.as<StringImm>();
        }

        // Determine the halide buffer associated with this load
        string buffername = string_imm->value;

        // Only unsigned-integer or float results with 1 to 4 lanes are accepted.
        internal_assert((op->type.code() == Type::UInt || op->type.code() == Type::Float) &&
                        (op->type.lanes() >= 1 && op->type.lanes() <= 4));

        if (op->type.is_vector()) {
            // The channel argument must be a ramp or a broadcast of a constant.
            Expr c = op->args[4];
            internal_assert(is_const(c));

            const Ramp *rc = c.as<Ramp>();
            const Broadcast *bx = op->args[2].as<Broadcast>();
            const Broadcast *by = op->args[3].as<Broadcast>();
            if (rc && is_zero(rc->base) && is_one(rc->stride) && bx && by) {
                // If the x and y coordinates are broadcasts, and the c
                // coordinate is a dense ramp, we can do a single
                // texture2D call.
                rhs << "texture2D(" << print_name(buffername) << ", vec2("
                    << print_expr(bx->value) << ", "
                    << print_expr(by->value) << "))";

                // texture2D always returns a vec4. Swizzle out the lanes we want.
                switch (op->type.lanes()) {
                case 1:
                    rhs << ".r";
                    break;
                case 2:
                    rhs << ".rg";
                    break;
                case 3:
                    rhs << ".rgb";
                    break;
                default:
                    // Four lanes: use the vec4 as is.
                    break;
                }
            } else {
                // Otherwise do one load per lane and make a vector
                vector<string> xs = print_lanes(op->args[2]);
                vector<string> ys = print_lanes(op->args[3]);
                vector<string> cs = print_lanes(op->args[4]);
                string name = print_name(buffername);

                // NOTE(review): x and y are not referenced below. The
                // print_expr calls may still emit statements, so they are
                // left untouched; confirm whether they can be removed.
                string x = print_expr(op->args[2]), y = print_expr(op->args[3]);
                rhs << print_type(op->type) << "(";
                for (int i = 0; i < op->type.lanes(); i++) {
                    if (i > 0) {
                        rhs << ", ";
                    }
                    rhs << "texture2D(" << name << ", vec2("
                        << xs[i] << ", " << ys[i] << "))[" << cs[i] << "]";
                }
                rhs << ")";
            }
        } else if (const int64_t *ic = as_const_int(op->args[4])) {
            // Scalar load with a constant channel: select it with a
            // swizzle letter.
            internal_assert(*ic >= 0 && *ic < 4);
            rhs << "texture2D(" << print_name(buffername) << ", vec2("
                << print_expr(op->args[2]) << ", "
                << print_expr(op->args[3]) << "))."
                << get_lane_suffix(*ic);
        } else {
            // Scalar load with a non-constant channel: use a subscript.
            rhs << "texture2D(" << print_name(buffername) << ", vec2("
                << print_expr(op->args[2]) << ", "
                << print_expr(op->args[3]) << "))["
                << print_expr(op->args[4]) << "]";
        }

        if (op->type.is_uint()) {
            // Unsigned results are scaled by the type's maximum value.
            rhs << " * " << print_expr(cast<float>(op->type.max()));
        }

    } else if (op->is_intrinsic(Call::glsl_texture_store)) {
        internal_assert(op->args.size() == 6);
        // args[5] is the stored value, args[4] selects the channel(s).
        string sval = print_expr(op->args[5]);
        string suffix = get_vector_suffix(op->args[4]);
        stream << get_indent() << "gl_FragColor" << suffix
               << " = " << sval;
        if (op->args[5].type().is_uint()) {
            // Unsigned values are divided by the type's maximum value.
            stream << " / " << print_expr(cast<float>(op->args[5].type().max()));
        }
        stream << ";\n";
        // glsl_texture_store is called only for its side effect; there is
        // no return value.
        id = "";
        return;
    } else if (op->is_intrinsic(Call::glsl_varying)) {
        // Varying attributes should be substituted out by this point in
        // codegen.
        debug(2) << "Found skipped varying attribute: " << op->args[0] << "\n";

        // Output the tagged expression.
        print_expr(op->args[1]);
        return;
    } else {
        CodeGen_GLSLBase::visit(op);
        return;
    }
    // Only reached for glsl_texture_load: bind the lookup to a variable.
    print_assignment(op->type, rhs.str());
}
778 
779 namespace {
780 class AllAccessConstant : public IRVisitor {
781     using IRVisitor::visit;
782 
visit(const Load * op)783     void visit(const Load *op) override {
784         if (op->name == buf && !is_const(op->index)) {
785             result = false;
786         }
787         IRVisitor::visit(op);
788     }
789 
visit(const Store * op)790     void visit(const Store *op) override {
791         if (op->name == buf && !is_const(op->index)) {
792             result = false;
793         }
794         IRVisitor::visit(op);
795     }
796 
797 public:
798     bool result = true;
799     string buf;
800 };
801 }  // namespace
802 
visit(const Allocate * op)803 void CodeGen_GLSL::visit(const Allocate *op) {
804     int32_t size = op->constant_allocation_size();
805     user_assert(size) << "Allocations inside GLSL kernels must be constant-sized\n";
806 
807     // Check if all access to the allocation uses a constant index
808     AllAccessConstant all_access_constant;
809     all_access_constant.buf = op->name;
810     op->body.accept(&all_access_constant);
811 
812     stream << get_indent();
813     if (size == 1) {
814         // We can use a variable
815         stream << print_type(op->type) << " " << print_name(op->name) << ";\n";
816         ScopedBinding<int> p(scalar_vars, op->name, 0);
817         op->body.accept(this);
818     } else if (size <= 4 && all_access_constant.result) {
819         // We can just use a vector variable
820         stream << print_type(op->type.with_lanes(size)) << " " << print_name(op->name) << ";\n";
821         ScopedBinding<int> p(vector_vars, op->name, 0);
822         op->body.accept(this);
823     } else {
824         stream << print_type(op->type) << " " << print_name(op->name) << "[" << size << "];\n";
825         op->body.accept(this);
826     }
827 }
828 
// Free nodes produce no GLSL output: this visitor does nothing.
void CodeGen_GLSL::visit(const Free *op) {
}
831 
// AssertStmt nodes are not expected to reach GLSL codegen; encountering one
// is reported as an internal error.
void CodeGen_GLSL::visit(const AssertStmt *) {
    internal_error << "GLSL: unexpected Assertion node encountered.\n";
}
835 
visit(const Ramp * op)836 void CodeGen_GLSL::visit(const Ramp *op) {
837     ostringstream rhs;
838     rhs << print_type(op->type) << "(";
839 
840     if (op->lanes > 4)
841         internal_error << "GLSL: ramp lanes " << op->lanes << " is not supported\n";
842 
843     rhs << print_expr(op->base);
844 
845     for (int i = 1; i < op->lanes; ++i) {
846         rhs << ", " << print_expr(Add::make(op->base, Mul::make(i, op->stride)));
847     }
848 
849     rhs << ")";
850     print_assignment(op->type, rhs.str());
851 }
852 
visit(const Broadcast * op)853 void CodeGen_GLSL::visit(const Broadcast *op) {
854     ostringstream rhs;
855     rhs << print_type(op->type) << "(" << print_expr(op->value) << ")";
856     print_assignment(op->type, rhs.str());
857 }
858 
// Atomic nodes are rejected: this visitor always raises a user error.
void CodeGen_GLSL::visit(const Atomic *op) {
    // Floating point atomics are tricky: there are no floating point atomic
    // operations, and GLSL does not allow converting a floating point buffer
    // to an integer buffer.
    // In addition, OpenGL only supports atomics starting from version 4.3,
    // and Halide does not distinguish between OpenGL versions yet.
    user_assert(false) << "GLSL: atomics are not supported.\n";
}
867 
add_kernel(const Stmt & stmt,const string & name,const vector<DeviceArgument> & args)868 void CodeGen_GLSL::add_kernel(const Stmt &stmt, const string &name,
869                               const vector<DeviceArgument> &args) {
870 
871     // This function produces fragment shader source for the halide statement.
872     // The corresponding vertex shader will be generated by the halide opengl
873     // runtime based on the arguments passed in comments below. Host codegen
874     // outputs expressions that are evaluated at runtime to produce vertex data
875     // and varying attribute values at the vertices.
876 
877     // Emit special header that declares the kernel name and its arguments.
878     // There is currently no standard way of passing information from the code
879     // generator to the runtime, and the information Halide passes to the
880     // runtime are fairly limited.  We use these special comments to know the
881     // data types of arguments and whether textures are used for input or
882     // output.
883 
884     // Keep track of the number of uniform and varying attributes
885     int num_uniform_floats = 0;
886     int num_uniform_ints = 0;
887 
888     // The spatial x and y coordinates are always passed in the first two
889     // varying float attribute slots
890     int num_varying_floats = 2;
891 
892     ostringstream header;
893     header << "/// KERNEL " << name << "\n";
894     for (size_t i = 0; i < args.size(); i++) {
895         if (args[i].is_buffer) {
896             Type t = args[i].type.element_of();
897 
898             user_assert(args[i].read != args[i].write) << "GLSL: buffers may only be read OR written inside a kernel loop.\n";
899             string type_name;
900             if (t == UInt(8)) {
901                 type_name = "uint8_t";
902             } else if (t == UInt(16)) {
903                 type_name = "uint16_t";
904             } else if (t == Float(32)) {
905                 type_name = "float";
906             } else {
907                 user_error << "GLSL: buffer " << args[i].name << " has invalid type " << t << ".\n";
908             }
909             header << "/// " << (args[i].read ? "IN_BUFFER " : "OUT_BUFFER ")
910                    << type_name << " " << print_name(args[i].name) << "\n";
911         } else if (ends_with(args[i].name, ".varying")) {
912             header << "/// VARYING "
913                    // GLSL requires that varying attributes are float. Integer
914                    // expressions for vertex attributes are cast to float during
915                    // host codegen
916                    << "float " << print_name(args[i].name) << " varyingf" << args[i].packed_index / 4 << "[" << args[i].packed_index % 4 << "]\n";
917             ++num_varying_floats;
918         } else if (args[i].type.is_float()) {
919             header << "/// UNIFORM "
920                    << CodeGen_GLSLBase::print_type(args[i].type) << " "
921                    << print_name(args[i].name) << " uniformf" << args[i].packed_index / 4 << "[" << args[i].packed_index % 4 << "]\n";
922             ++num_uniform_floats;
923         } else if (args[i].type.is_int()) {
924             header << "/// UNIFORM "
925                    << CodeGen_GLSLBase::print_type(args[i].type) << " "
926                    << print_name(args[i].name) << " uniformi" << args[i].packed_index / 4 << "[" << args[i].packed_index % 4 << "]\n";
927             ++num_uniform_ints;
928         }
929     }
930 
931     // Compute the number of vec4's needed to pack the arguments
932     num_varying_floats = (num_varying_floats + 3) / 4;
933     num_uniform_floats = (num_uniform_floats + 3) / 4;
934     num_uniform_ints = (num_uniform_ints + 3) / 4;
935 
936     stream << header.str();
937 
938     // Specify default float precision when compiling for OpenGL ES.
939     // TODO: emit correct #version
940     if (is_opengl_es(target)) {
941         stream << "#ifdef GL_FRAGMENT_PRECISION_HIGH\n"
942                << "precision highp float;\n"
943                << "#endif\n";
944     }
945 
946     // Declare input textures and variables
947     for (size_t i = 0; i < args.size(); i++) {
948         if (args[i].is_buffer && args[i].read) {
949             stream << "uniform sampler2D " << print_name(args[i].name) << ";\n";
950         }
951     }
952 
953     for (int i = 0; i != num_varying_floats; ++i) {
954         stream << "varying vec4 _varyingf" << i << ";\n";
955     }
956 
957     for (int i = 0; i != num_uniform_floats; ++i) {
958         stream << "uniform vec4 _uniformf" << i << ";\n";
959     }
960 
961     for (int i = 0; i != num_uniform_ints; ++i) {
962         stream << "uniform ivec4 _uniformi" << i << ";\n";
963     }
964 
965     // Output additional builtin functions.
966     stream << "float _trunc_f32(float x) {\n"
967               "  return floor(abs(x)) * sign(x);\n"
968               "}\n";
969 
970     stream << "void main() {\n";
971     indent += 2;
972 
973     // Unpack the uniform and varying parameters
974     for (size_t i = 0; i < args.size(); i++) {
975         if (args[i].is_buffer) {
976             continue;
977         } else if (ends_with(args[i].name, ".varying")) {
978             stream << get_indent() << "float " << print_name(args[i].name)
979                    << " = _varyingf" << args[i].packed_index / 4
980                    << "[" << args[i].packed_index % 4 << "];\n";
981         } else if (args[i].type.is_float()) {
982             stream << get_indent() << print_type(args[i].type) << " "
983                    << print_name(args[i].name)
984                    << " = _uniformf" << args[i].packed_index / 4
985                    << "[" << args[i].packed_index % 4 << "];\n";
986         } else if (args[i].type.is_int()) {
987             stream << get_indent() << print_type(args[i].type) << " "
988                    << print_name(args[i].name)
989                    << " = _uniformi" << args[i].packed_index / 4
990                    << "[" << args[i].packed_index % 4 << "];\n";
991         }
992     }
993 
994     print(stmt);
995     indent -= 2;
996     stream << "}\n";
997 }
998 
999 namespace {
// Replace every '_' together with the run of decimal digits that immediately
// follows it (if any) by a single '$'. Temporary names like _1234 therefore
// become '$', and a name like _x becomes $x. This is done to make the
// individual tests below self-contained. All other characters are copied
// through unchanged.
std::string normalize_temporaries(const std::string &s) {
    std::string result;
    result.reserve(s.size());

    size_t i = 0;
    while (i < s.size()) {
        if (s[i] != '_') {
            result += s[i++];
            continue;
        }

        result += '$';
        ++i;
        // Skip the digits of the temporary's number. An explicit range test
        // is used instead of isdigit(): passing a plain (possibly negative)
        // char to isdigit() is undefined behaviour for non-ASCII bytes.
        while (i < s.size() && s[i] >= '0' && s[i] <= '9') {
            ++i;
        }
    }
    return result;
}
1015 
check(Expr e,const string & result)1016 void check(Expr e, const string &result) {
1017     ostringstream source;
1018     CodeGen_GLSL cg(source, Target());
1019     if (e.as<FloatImm>() || e.as<IntImm>()) {
1020         // Hack: CodeGen_C doesn't treat immediates like other expressions, so
1021         // wrap them to obtain useful output.
1022         e = Halide::print(e);
1023     }
1024     Evaluate::make(e).accept(&cg);
1025     string src = normalize_temporaries(source.str());
1026     if (!ends_with(src, result)) {
1027         internal_error
1028             << "Codegen failed for " << e << "\n"
1029             << "  Correct source code:\n"
1030             << result
1031             << "  Actual source code:\n"
1032             << src;
1033     }
1034 }
1035 
1036 }  // namespace
1037 
test()1038 void CodeGen_GLSL::test() {
1039     vector<Expr> e;
1040 
1041     // Check that float constants are printed correctly.
1042     check(1.0f, "float $ = 1.0;\n");
1043     check(1.0f + std::numeric_limits<float>::epsilon(), "float $ = 1.00000012;\n");
1044     check(1.19209290e-07f, "float $ = 1.1920929e-07;\n");
1045     check(8388608.f, "float $ = 8388608.0;\n");
1046     check(-2.1e19f, "float $ = -20999999189405401088.0;\n");
1047     check(3.1415926536f, "float $ = 3.14159274;\n");
1048 
1049     // Uint8 is embedded in GLSL floats, so no cast necessary
1050     check(cast<float>(Variable::make(UInt(8), "x") * 1.0f),
1051           "float $ = $x * 1.0;\n");
1052     // But truncation is necessary for the reverse direction
1053     check(cast<uint8_t>(Variable::make(Float(32), "x")),
1054           "float $ = floor($x);\n");
1055 
1056     check(Min::make(Expr(1), Expr(5)),
1057           "float $ = min(1.0, 5.0);\n"
1058           "int $ = int($);\n");
1059 
1060     check(Max::make(Expr(1), Expr(5)),
1061           "float $ = max(1.0, 5.0);\n"
1062           "int $ = int($);\n");
1063 
1064     check(Max::make(Broadcast::make(1, 4), Broadcast::make(5, 4)),
1065           "vec4 $ = vec4(1.0);\n"
1066           "vec4 $ = vec4(5.0);\n"
1067           "vec4 $ = max($, $);\n"
1068           "ivec4 $ = ivec4($);\n");
1069 
1070     check(Variable::make(Int(32), "x") / Expr(3),
1071           "float $ = float($x);\n"
1072           "float $ = $ * 0.333333343;\n"
1073           "float $ = floor($);\n"
1074           "int $ = int($);\n");
1075     check(Variable::make(Int(32, 4), "x") / Variable::make(Int(32, 4), "y"),
1076           "vec4 $ = vec4($x);\n"
1077           "vec4 $ = vec4($y);\n"
1078           "vec4 $ = $ / $;\n"
1079           "vec4 $ = floor($);\n"
1080           "ivec4 $ = ivec4($);\n");
1081     check(Variable::make(Float(32, 4), "x") / Variable::make(Float(32, 4), "y"),
1082           "vec4 $ = $x / $y;\n");
1083 
1084     // Integer lerp with integer weight
1085     check(lerp(cast<uint8_t>(0), cast<uint8_t>(255), cast<uint8_t>(127)),
1086           "float $ = mix(0.0, 255.0, 0.498039216);\n"
1087           "float $ = $ + 0.5;\n"
1088           "float $ = floor($);\n");
1089 
1090     // Integer lerp with float weight
1091     check(lerp(cast<uint8_t>(0), cast<uint8_t>(255), 0.3f),
1092           "float $ = mix(0.0, 255.0, 0.298039228);\n"
1093           "float $ = $ + 0.5;\n"
1094           "float $ = floor($);\n");
1095 
1096     // Floating point lerp
1097     check(lerp(0.0f, 1.0f, 0.3f),
1098           "float $ = mix(0.0, 1.0, 0.300000012);\n");
1099 
1100     // Vectorized lerp
1101     check(lerp(Variable::make(Float(32, 4), "x"), Variable::make(Float(32, 4), "y"), Broadcast::make(0.25f, 4)),
1102           "vec4 $ = vec4(0.25);\n"
1103           "vec4 $ = mix($x, $y, $);\n");
1104 
1105     // Sin with scalar arg
1106     check(sin(3.0f), "float $ = sin(3.0);\n");
1107 
1108     // Sin with vector arg
1109     check(Call::make(Float(32, 4), "sin_f32", {Broadcast::make(1.f, 4)}, Internal::Call::Extern),
1110           "vec4 $ = vec4(1.0);\n"
1111           "vec4 $ = sin($);\n");
1112 
1113     // use float version of abs in GLSL
1114     check(abs(-2),
1115           "float $ = abs(-2.0);\n"
1116           "int $ = int($);\n");
1117 
1118     check(Halide::print(3.0f), "float $ = 3.0;\n");
1119 
1120     // Test rounding behavior of integer division.
1121     check(Variable::make(Int(32), "x") / Variable::make(Int(32), "y"),
1122           "float $ = float($x);\n"
1123           "float $ = float($y);\n"
1124           "float $ = $ / $;\n"
1125           "float $ = floor($);\n"
1126           "int $ = int($);\n");
1127 
1128     // Select with scalar condition
1129     check(Select::make(EQ::make(Variable::make(Float(32), "x"), 1.0f),
1130                        Broadcast::make(1.f, 4),
1131                        Broadcast::make(2.f, 4)),
1132           "vec4 $;\n"
1133           "bool $ = $x == 1.0;\n"
1134           "if ($) {\n"
1135           " vec4 $ = vec4(1.0);\n"
1136           " $ = $;\n"
1137           "}\n"
1138           "else {\n"
1139           " vec4 $ = vec4(2.0);\n"
1140           " $ = $;\n"
1141           "}\n");
1142 
1143     // Select with vector condition
1144     check(Select::make(EQ::make(Ramp::make(-1, 1, 4), Broadcast::make(0, 4)),
1145                        Broadcast::make(1.f, 4),
1146                        Broadcast::make(2.f, 4)),
1147           "vec4 $ = vec4(2.0, 1.0, 2.0, 2.0);\n");
1148 
1149     // Test codegen for texture loads
1150     Expr load4 = Call::make(Float(32, 4), Call::glsl_texture_load,
1151                             {string("buf"),
1152                              0,
1153                              Broadcast::make(0, 4),
1154                              Broadcast::make(0, 4),
1155                              Ramp::make(0, 1, 4)},
1156                             Call::Intrinsic);
1157     check(load4, "vec4 $ = texture2D($buf, vec2(0, 0));\n");
1158 
1159     check(log(1.0f), "float $ = log(1.0);\n");
1160     check(exp(1.0f), "float $ = exp(1.0);\n");
1161 
1162     // Integer powers are expanded
1163     check(pow(1.4f, 2), "float $ = 1.39999998 * 1.39999998;\n");
1164     check(pow(1.0f, 2.1f), "float $ = pow(1.0, 2.0999999);\n");
1165 
1166     std::cout << "CodeGen_GLSL test passed\n";
1167 }
1168 
1169 }  // namespace Internal
1170 }  // namespace Halide
1171