#include <algorithm>
#include <sstream>
#include <utility>

#include "CSE.h"
#include "CodeGen_Internal.h"
#include "CodeGen_OpenCL_Dev.h"
#include "Debug.h"
#include "EliminateBoolVectors.h"
#include "EmulateFloat16Math.h"
#include "ExprUsesVar.h"
#include "IRMutator.h"
#include "IROperator.h"
#include "Simplify.h"

namespace Halide {
namespace Internal {

using std::ostringstream;
using std::sort;
using std::string;
using std::vector;

CodeGen_OpenCL_Dev::CodeGen_OpenCL_Dev(Target t)
    : clc(src_stream, t) {
}

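// Map a Halide type to its OpenCL C spelling, e.g. (illustratively)
// UInt(8) -> "uchar", Int(32, 4) -> "int4", Float(16) -> "half" (requires the
// CLHalf feature).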
string CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::print_type(Type type, AppendSpaceIfNeeded space) {
    ostringstream oss;
    if (type.is_float()) {
        if (type.bits() == 16) {
            user_assert(target.has_feature(Target::CLHalf))
                << "OpenCL kernel uses half type, but CLHalf target flag not enabled\n";
            oss << "half";
        } else if (type.bits() == 32) {
            oss << "float";
        } else if (type.bits() == 64) {
            oss << "double";
        } else {
            user_error << "Can't represent a float with this many bits in OpenCL C: " << type << "\n";
        }

    } else {
        if (type.is_uint() && type.bits() > 1) {
            oss << "u";
        }
        switch (type.bits()) {
        case 1:
            internal_assert(type.lanes() == 1) << "Encountered vector of bool\n";
            oss << "bool";
            break;
        case 8:
            oss << "char";
            break;
        case 16:
            oss << "short";
            break;
        case 32:
            oss << "int";
            break;
        case 64:
            oss << "long";
            break;
        default:
            user_error << "Can't represent an integer with this many bits in OpenCL C: " << type << "\n";
        }
    }
    if (type.lanes() != 1) {
        switch (type.lanes()) {
        case 2:
        case 3:
        case 4:
        case 8:
        case 16:
            oss << type.lanes();
            break;
        default:
            user_error << "Unsupported vector width in OpenCL C: " << type << "\n";
        }
    }
    if (space == AppendSpace) {
        oss << " ";
    }
    return oss.str();
}

// These are built-in types in OpenCL
void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::add_vector_typedefs(const std::set<Type> &vector_types) {
}

string CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::print_reinterpret(Type type, const Expr &e) {
    ostringstream oss;
    oss << "as_" << print_type(type) << "(" << print_expr(e) << ")";
    return oss.str();
}

namespace {
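// Map a Halide GPU loop variable name to the corresponding OpenCL work-item
// builtin, e.g. "foo.__thread_id_x" -> "get_local_id(0)" and
// "foo.__block_id_y" -> "get_group_id(1)".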
string simt_intrinsic(const string &name) {
    if (ends_with(name, ".__thread_id_x")) {
        return "get_local_id(0)";
    } else if (ends_with(name, ".__thread_id_y")) {
        return "get_local_id(1)";
    } else if (ends_with(name, ".__thread_id_z")) {
        return "get_local_id(2)";
    } else if (ends_with(name, ".__thread_id_w")) {
        return "get_local_id(3)";
    } else if (ends_with(name, ".__block_id_x")) {
        return "get_group_id(0)";
    } else if (ends_with(name, ".__block_id_y")) {
        return "get_group_id(1)";
    } else if (ends_with(name, ".__block_id_z")) {
        return "get_group_id(2)";
    } else if (ends_with(name, ".__block_id_w")) {
        return "get_group_id(3)";
    }
    internal_error << "simt_intrinsic called on bad variable name: " << name << "\n";
    return "";
}
}  // namespace

void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const For *loop) {
    user_assert(loop->for_type != ForType::GPULane)
        << "The OpenCL backend does not support the gpu_lanes() scheduling directive.";

    if (is_gpu_var(loop->name)) {
        internal_assert((loop->for_type == ForType::GPUBlock) ||
                        (loop->for_type == ForType::GPUThread))
            << "kernel loop must be either gpu block or gpu thread\n";
        internal_assert(is_zero(loop->min));

        stream << get_indent() << print_type(Int(32)) << " " << print_name(loop->name)
               << " = " << simt_intrinsic(loop->name) << ";\n";

        loop->body.accept(this);

    } else {
        user_assert(loop->for_type != ForType::Parallel) << "Cannot use parallel loops inside OpenCL kernel\n";
        CodeGen_C::visit(loop);
    }
}

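// A dense ramp is printed as a vector literal scaled by the stride, e.g. a
// four-lane int ramp becomes (illustratively) "base + stride * (int4)(0, 1, 2, 3)".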
void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Ramp *op) {
    string id_base = print_expr(op->base);
    string id_stride = print_expr(op->stride);

    ostringstream rhs;
    rhs << id_base << " + " << id_stride << " * ("
        << print_type(op->type.with_lanes(op->lanes)) << ")(0";
    // Note 0 written above.
    for (int i = 1; i < op->lanes; ++i) {
        rhs << ", " << i;
    }
    rhs << ")";
    print_assignment(op->type.with_lanes(op->lanes), rhs.str());
}

void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Broadcast *op) {
    string id_value = print_expr(op->value);

    print_assignment(op->type.with_lanes(op->lanes), id_value);
}

namespace {
// Mapping of integer vector indices to OpenCL ".s" syntax.
const char *vector_elements = "0123456789ABCDEF";

}  // namespace

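// Shared memory lives in __local; every other buffer gets a per-buffer
// preprocessor macro (defined in add_kernel / visit(const Allocate *)) that
// expands to __global, __constant, or __private as appropriate.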
string CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::get_memory_space(const string &buf) {
    if (buf == shared_name) {
        return "__local";
    } else {
        return "__address_space_" + print_name(buf);
    }
}

void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Call *op) {
    if (op->is_intrinsic(Call::bool_to_mask)) {
        if (op->args[0].type().is_vector()) {
            // The argument is already a mask of the right width. Just
            // sign-extend to the expected type.
            op->args[0].accept(this);
        } else {
            // The argument is a scalar bool. Casting it to an int
            // produces zero or one. Convert it to -1 of the requested
            // type.
            Expr equiv = -Cast::make(op->type, op->args[0]);
            equiv.accept(this);
        }
    } else if (op->is_intrinsic(Call::cast_mask)) {
        // Sign-extension is fine
        Expr equiv = Cast::make(op->type, op->args[0]);
        equiv.accept(this);
    } else if (op->is_intrinsic(Call::select_mask)) {
        internal_assert(op->args.size() == 3);
        string cond = print_expr(op->args[0]);
        string true_val = print_expr(op->args[1]);
        string false_val = print_expr(op->args[2]);

        // Yes, you read this right. OpenCL's select function is declared
        // 'select(false_case, true_case, condition)'.
        ostringstream rhs;
        rhs << "select(" << false_val << ", " << true_val << ", " << cond << ")";
        print_assignment(op->type, rhs.str());
    } else if (op->is_intrinsic(Call::abs)) {
        if (op->type.is_float()) {
            ostringstream rhs;
            rhs << "abs_f" << op->type.bits() << "(" << print_expr(op->args[0]) << ")";
            print_assignment(op->type, rhs.str());
        } else {
            ostringstream rhs;
            rhs << "abs(" << print_expr(op->args[0]) << ")";
            print_assignment(op->type, rhs.str());
        }
    } else if (op->is_intrinsic(Call::absd)) {
        ostringstream rhs;
        rhs << "abs_diff(" << print_expr(op->args[0]) << ", " << print_expr(op->args[1]) << ")";
        print_assignment(op->type, rhs.str());
    } else if (op->is_intrinsic(Call::gpu_thread_barrier)) {
        internal_assert(op->args.size() == 1) << "gpu_thread_barrier() intrinsic must specify memory fence type.\n";

        auto fence_type_ptr = as_const_int(op->args[0]);
        internal_assert(fence_type_ptr) << "gpu_thread_barrier() parameter is not a constant integer.\n";
        auto fence_type = *fence_type_ptr;

        stream << get_indent() << "barrier(0";
        if (fence_type & CodeGen_GPU_Dev::MemoryFenceType::Device) {
            stream << " | CLK_GLOBAL_MEM_FENCE";
        }
        if (fence_type & CodeGen_GPU_Dev::MemoryFenceType::Shared) {
            stream << " | CLK_LOCAL_MEM_FENCE";
        }
        stream << ");\n";
        print_assignment(op->type, "0");
    } else if (op->is_intrinsic(Call::shift_left) || op->is_intrinsic(Call::shift_right)) {
        // Some OpenCL implementations forbid mixing signed and unsigned shift operands;
        // if the RHS is uint and the LHS is int, quietly cast the RHS back to int.
        if (op->args[0].type().is_int() && op->args[1].type().is_uint()) {
            Type t = op->args[0].type().with_code(halide_type_int);
            Expr e = Call::make(op->type, op->name, {op->args[0], cast(t, op->args[1])}, op->call_type);
            e.accept(this);
        } else {
            CodeGen_C::visit(op);
        }
    } else {
        CodeGen_C::visit(op);
    }
}

string CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::print_extern_call(const Call *op) {
    internal_assert(!function_takes_user_context(op->name));
    vector<string> args(op->args.size());
    for (size_t i = 0; i < op->args.size(); i++) {
        args[i] = print_expr(op->args[i]);
    }
    ostringstream rhs;
    rhs << op->name << "(" << with_commas(args) << ")";
    return rhs.str();
}

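// Print an indexed access to a buffer, casting the pointer to the buffer's
// address space and the requested element type when that differs from the
// allocation's declared type, e.g. (illustratively) "((__global float *)foo)[idx]"
// once the address-space macro has expanded.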
string CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::print_array_access(const string &name,
                                                                const Type &type,
                                                                const string &id_index) {
    ostringstream rhs;
    bool type_cast_needed = !(allocations.contains(name) &&
                              allocations.get(name).type == type);

    if (type_cast_needed) {
        rhs << "((" << get_memory_space(name) << " "
            << print_type(type) << " *)"
            << print_name(name)
            << ")";
    } else {
        rhs << print_name(name);
    }
    rhs << "[" << id_index << "]";

    return rhs.str();
}

void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Load *op) {
    user_assert(is_one(op->predicate)) << "Predicated load is not supported inside OpenCL kernel.\n";

    // If we're loading a contiguous ramp into a vector, use vload instead.
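    // For an unaligned ramp this prints e.g. (illustratively)
    // "vload4(0, (__global float*)buf + base)"; if the ramp is aligned to the
    // vector width, the buffer is instead indexed directly as an array of
    // vectors via print_array_access.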
    Expr ramp_base = strided_ramp_base(op->index);
    if (ramp_base.defined()) {
        internal_assert(op->type.is_vector());

        ostringstream rhs;
        if ((op->alignment.modulus % op->type.lanes() == 0) &&
            (op->alignment.remainder % op->type.lanes() == 0)) {
            // Get the rhs just for the cache.
            string id_ramp_base = print_expr(ramp_base / op->type.lanes());
            string array_indexing = print_array_access(op->name, op->type, id_ramp_base);

            rhs << array_indexing;
        } else {
            string id_ramp_base = print_expr(ramp_base);
            rhs << "vload" << op->type.lanes()
                << "(0, (" << get_memory_space(op->name) << " "
                << print_type(op->type.element_of()) << "*)"
                << print_name(op->name) << " + " << id_ramp_base << ")";
        }
        print_assignment(op->type, rhs.str());
        return;
    }

    string id_index = print_expr(op->index);

    // Get the rhs just for the cache.
    string array_indexing = print_array_access(op->name, op->type, id_index);

    std::map<string, string>::iterator cached = cache.find(array_indexing);
    if (cached != cache.end()) {
        id = cached->second;
        return;
    }

    if (op->index.type().is_vector()) {
        // If index is a vector, gather vector elements.
        internal_assert(op->type.is_vector());

        id = "_" + unique_name('V');
        cache[array_indexing] = id;

        stream << get_indent() << print_type(op->type)
               << " " << id << ";\n";

        for (int i = 0; i < op->type.lanes(); ++i) {
            stream << get_indent();
            stream
                << id << ".s" << vector_elements[i]
                << " = ((" << get_memory_space(op->name) << " "
                << print_type(op->type.element_of()) << "*)"
                << print_name(op->name) << ")"
                << "[" << id_index << ".s" << vector_elements[i] << "];\n";
        }
    } else {
        print_assignment(op->type, array_indexing);
    }
}

void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Store *op) {
    user_assert(is_one(op->predicate)) << "Predicated store is not supported inside OpenCL kernel.\n";

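    // emit_atomic_stores is set while we are inside an Atomic node (see
    // visit(const Atomic *) below); such stores must be issued with OpenCL
    // atomic built-ins.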
    if (emit_atomic_stores) {
        // Currently only support scalar atomics.
        user_assert(op->value.type().is_scalar()) << "OpenCL atomic store does not support vectorization.\n";
        user_assert(op->value.type().bits() >= 32) << "OpenCL only supports 32- and 64-bit atomics.\n";
        if (op->value.type().bits() == 64) {
            user_assert(target.has_feature(Target::CLAtomics64))
                << "Enable feature CLAtomics64 for 64-bit atomics in OpenCL.\n";
        }
        // Detect whether we can describe this as an atomic read-modify-write;
        // otherwise fall back to a compare-and-swap loop.
        // Currently we only detect atomic add.
        Expr val_expr = op->value;
        Type t = val_expr.type();
        Expr equiv_load = Load::make(t, op->name, op->index, Buffer<>(), op->param, op->predicate, op->alignment);
        Expr delta = simplify(common_subexpression_elimination(op->value - equiv_load));
        // For atomic add, we check whether op->value - store[index] is independent of the store.
        // The atomic add operations in OpenCL only support integers, so we check that as well.
        bool is_atomic_add = t.is_int_or_uint() && !expr_uses_var(delta, op->name);
        bool type_cast_needed = !(allocations.contains(op->name) &&
                                  allocations.get(op->name).type == t);
        auto print_store_var = [&]() {
            if (type_cast_needed) {
                stream << "(("
                       << get_memory_space(op->name) << " "
                       << print_type(t)
                       << " *)"
                       << print_name(op->name)
                       << ")";
            } else {
                stream << print_name(op->name);
            }
        };
        if (is_atomic_add) {
            string id_index = print_expr(op->index);
            string id_delta = print_expr(delta);
            stream << get_indent();
            // atomic_add(&x[i], delta);
            if (t.bits() == 32) {
                stream << "atomic_add(&";
            } else {
                stream << "atom_add(&";
            }

            print_store_var();
            stream << "[" << id_index << "]";
            stream << "," << id_delta << ");\n";
        } else {
            // CmpXchg loop
            // {
            //   union {unsigned int i; float f;} old_val;
            //   union {unsigned int i; float f;} new_val;
            //   do {
            //     old_val.f = x[id_index];
            //     new_val.f = ...
            //   } while(atomic_cmpxchg((volatile address_space unsigned int*)&x[id_index], old_val.i, new_val.i) != old_val.i);
            // }
            stream << get_indent() << "{\n";
            indent += 2;
            string id_index = print_expr(op->index);
            std::string int_type = t.bits() == 32 ? "int" : "long";
            if (t.is_float() || t.is_uint()) {
                int_type = "unsigned " + int_type;
            }
            if (t.is_float()) {
                stream << get_indent() << "union {" << int_type << " i; " << print_type(t) << " f;} old_val;\n";
                stream << get_indent() << "union {" << int_type << " i; " << print_type(t) << " f;} new_val;\n";
            } else {
                stream << get_indent() << int_type << " old_val;\n";
                stream << get_indent() << int_type << " new_val;\n";
            }
            stream << get_indent() << "do {\n";
            indent += 2;
            stream << get_indent();
            if (t.is_float()) {
                stream << "old_val.f = ";
            } else {
                stream << "old_val = ";
            }
            print_store_var();
            stream << "[" << id_index << "];\n";
            string id_value = print_expr(op->value);
            stream << get_indent();
            if (t.is_float()) {
                stream << "new_val.f = ";
            } else {
                stream << "new_val = ";
            }
            stream << id_value << ";\n";
            indent -= 2;
            std::string old_val = t.is_float() ? "old_val.i" : "old_val";
            std::string new_val = t.is_float() ? "new_val.i" : "new_val";
            stream << get_indent()
                   << "} while(atomic_cmpxchg((volatile "
                   << get_memory_space(op->name) << " " << int_type << "*)&"
                   << print_name(op->name) << "[" << id_index << "], "
                   << old_val << ", " << new_val << ") != " << old_val << ");\n"
                   << get_indent() << "}\n";
            indent -= 2;
        }
        cache.clear();
        return;
    }

    string id_value = print_expr(op->value);
    Type t = op->value.type();

    // If we're writing a contiguous ramp, use vstore instead.
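    // Mirrors the Load case above: aligned ramps store through a vector-typed
    // array access, while unaligned ramps print e.g. (illustratively)
    // "vstore4(val, 0, (__global float*)buf + base);".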
    Expr ramp_base = strided_ramp_base(op->index);
    if (ramp_base.defined()) {
        internal_assert(op->value.type().is_vector());

        if ((op->alignment.modulus % op->value.type().lanes() == 0) &&
            (op->alignment.remainder % op->value.type().lanes() == 0)) {
            string id_ramp_base = print_expr(ramp_base / op->value.type().lanes());
            string array_indexing = print_array_access(op->name, t, id_ramp_base);
            stream << get_indent() << array_indexing << " = " << id_value << ";\n";
        } else {
            string id_ramp_base = print_expr(ramp_base);
            stream << get_indent() << "vstore" << t.lanes() << "("
                   << id_value << ","
                   << 0 << ", (" << get_memory_space(op->name) << " "
                   << print_type(t.element_of()) << "*)"
                   << print_name(op->name) << " + " << id_ramp_base
                   << ");\n";
        }
    } else if (op->index.type().is_vector()) {
        // If index is a vector, scatter vector elements.
        internal_assert(t.is_vector());

        string id_index = print_expr(op->index);

        for (int i = 0; i < t.lanes(); ++i) {
            stream << get_indent() << "((" << get_memory_space(op->name) << " "
                   << print_type(t.element_of()) << " *)"
                   << print_name(op->name)
                   << ")["
                   << id_index << ".s" << vector_elements[i] << "] = "
                   << id_value << ".s" << vector_elements[i] << ";\n";
        }
    } else {
        string id_index = print_expr(op->index);
        stream << get_indent();
        std::string array_indexing = print_array_access(op->name, t, id_index);
        stream << array_indexing << " = " << id_value << ";\n";
    }

    cache.clear();
}

namespace {
}

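// In OpenCL C, comparisons on vectors produce signed integer masks (-1/0)
// rather than bools, so print the result with the bool-eliminated type of the
// operands.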
void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const EQ *op) {
    visit_binop(eliminated_bool_type(op->type, op->a.type()), op->a, op->b, "==");
}

void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const NE *op) {
    visit_binop(eliminated_bool_type(op->type, op->a.type()), op->a, op->b, "!=");
}

void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const LT *op) {
    visit_binop(eliminated_bool_type(op->type, op->a.type()), op->a, op->b, "<");
}

void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const LE *op) {
    visit_binop(eliminated_bool_type(op->type, op->a.type()), op->a, op->b, "<=");
}

void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const GT *op) {
    visit_binop(eliminated_bool_type(op->type, op->a.type()), op->a, op->b, ">");
}

void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const GE *op) {
    visit_binop(eliminated_bool_type(op->type, op->a.type()), op->a, op->b, ">=");
}

void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Cast *op) {
    if (!target.has_feature(Target::CLHalf) &&
        ((op->type.is_float() && op->type.bits() < 32) ||
         (op->value.type().is_float() && op->value.type().bits() < 32))) {
        Expr equiv = lower_float16_cast(op);
        equiv.accept(this);
        return;
    }

    if (op->type.is_vector()) {
        print_assignment(op->type, "convert_" + print_type(op->type) + "(" + print_expr(op->value) + ")");
    } else {
        CodeGen_C::visit(op);
    }
}

void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Select *op) {
    if (!op->condition.type().is_scalar()) {
        // A vector of bool was recursively introduced while
        // performing codegen. Eliminate it.
        Expr equiv = eliminate_bool_vectors(op);
        equiv.accept(this);
        return;
    }
    CodeGen_C::visit(op);
}

void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Allocate *op) {
    user_assert(!op->new_expr.defined()) << "Allocate node inside OpenCL kernel has custom new expression.\n"
                                         << "(Memoization is not supported inside GPU kernels at present.)\n";

    if (op->memory_type == MemoryType::GPUShared) {
        // Already handled
        op->body.accept(this);
    } else {
        open_scope();

        debug(2) << "Allocate " << op->name << " on device\n";

        debug(3) << "Pushing allocation called " << op->name << " onto the symbol table\n";

        // Allocation is not a shared memory allocation, just make a local declaration.
        // It must have a constant size.
        int32_t size = op->constant_allocation_size();
        user_assert(size > 0)
            << "Allocation " << op->name << " has a dynamic size. "
            << "Only fixed-size allocations are supported on the gpu. "
            << "Try storing into shared memory instead.";

        stream << get_indent() << print_type(op->type) << " "
               << print_name(op->name) << "[" << size << "];\n";
        stream << get_indent() << "#define " << get_memory_space(op->name) << " __private\n";

        Allocation alloc;
        alloc.type = op->type;
        allocations.push(op->name, alloc);

        op->body.accept(this);

        // Should have been freed internally
        internal_assert(!allocations.contains(op->name));

        close_scope("alloc " + print_name(op->name));
    }
}

void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Free *op) {
    if (op->name == shared_name) {
        return;
    } else {
        // Should have been freed internally
        internal_assert(allocations.contains(op->name));
        allocations.pop(op->name);
        stream << get_indent() << "#undef " << get_memory_space(op->name) << "\n";
    }
}

void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const AssertStmt *op) {
    user_warning << "Ignoring assertion inside OpenCL kernel: " << op->condition << "\n";
}

void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Shuffle *op) {
    if (op->is_interleave()) {
        int op_lanes = op->type.lanes();
        internal_assert(!op->vectors.empty());
        int arg_lanes = op->vectors[0].type().lanes();
        if (op->vectors.size() == 1) {
            // 1 argument, just do a simple assignment
            internal_assert(op_lanes == arg_lanes);
            print_assignment(op->type, print_expr(op->vectors[0]));
        } else if (op->vectors.size() == 2) {
            // 2 arguments, set the .even to the first arg and the
            // .odd to the second arg
            internal_assert(op->vectors[1].type().lanes() == arg_lanes);
            internal_assert(op_lanes / 2 == arg_lanes);
            string a1 = print_expr(op->vectors[0]);
            string a2 = print_expr(op->vectors[1]);
            id = unique_name('_');
            stream << get_indent() << print_type(op->type) << " " << id << ";\n";
            stream << get_indent() << id << ".even = " << a1 << ";\n";
            stream << get_indent() << id << ".odd = " << a2 << ";\n";
        } else {
            // 3+ arguments, interleave via a vector literal
            // selecting the appropriate elements of the vectors
            int dest_lanes = op->type.lanes();
            internal_assert(dest_lanes <= 16);
            int num_vectors = op->vectors.size();
            vector<string> arg_exprs(num_vectors);
            for (int i = 0; i < num_vectors; i++) {
                internal_assert(op->vectors[i].type().lanes() == arg_lanes);
                arg_exprs[i] = print_expr(op->vectors[i]);
            }
            internal_assert(num_vectors * arg_lanes >= dest_lanes);
            id = unique_name('_');
            stream << get_indent() << print_type(op->type) << " " << id;
            stream << " = (" << print_type(op->type) << ")(";
            for (int i = 0; i < dest_lanes; i++) {
                int arg = i % num_vectors;
                int arg_idx = i / num_vectors;
                internal_assert(arg_idx <= arg_lanes);
                stream << arg_exprs[arg] << ".s" << vector_elements[arg_idx];
                if (i != dest_lanes - 1) {
                    stream << ", ";
                }
            }
            stream << ");\n";
        }
    } else {
        internal_error << "Shuffle not implemented.\n";
    }
}

void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Max *op) {
    print_expr(Call::make(op->type, "max", {op->a, op->b}, Call::Extern));
}

void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Min *op) {
    print_expr(Call::make(op->type, "min", {op->a, op->b}, Call::Extern));
}

void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Atomic *op) {
    // Most GPUs require all the threads in a warp to perform the same operations,
    // which means our mutex will lead to deadlock.
    user_assert(op->mutex_name.empty())
        << "The atomic update requires a mutex lock, which is not supported in OpenCL.\n";

    // Issue atomic stores.
    ScopedValue<bool> old_emit_atomic_stores(emit_atomic_stores, true);
    CodeGen_C::visit(op);
}

void CodeGen_OpenCL_Dev::add_kernel(Stmt s,
                                    const string &name,
                                    const vector<DeviceArgument> &args) {
    debug(2) << "CodeGen_OpenCL_Dev::compile " << name << "\n";

    // TODO: do we have to uniquify these names, or can we trust that they are safe?
    cur_kernel_name = name;
    clc.add_kernel(s, name, args);
}

namespace {
struct BufferSize {
    string name;
    size_t size;

    BufferSize()
        : size(0) {
    }
    BufferSize(string name, size_t size)
        : name(std::move(name)), size(size) {
    }

    bool operator<(const BufferSize &r) const {
        return size < r.size;
    }
};
}  // namespace

void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::add_kernel(Stmt s,
                                                      const string &name,
                                                      const vector<DeviceArgument> &args) {

    debug(2) << "Adding OpenCL kernel " << name << "\n";

    debug(2) << "Eliminating bool vectors\n";
    s = eliminate_bool_vectors(s);
    debug(2) << "After eliminating bool vectors:\n"
             << s << "\n";

    // Figure out which arguments should be passed in __constant.
    // Such arguments should be:
    // - not written to,
    // - loads are block-uniform,
    // - constant size,
    // - and all allocations together should be less than the max constant
    //   buffer size given by CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE.
    // The last condition is handled via the preprocessor in the kernel
    // declaration.
    vector<BufferSize> constants;
    for (size_t i = 0; i < args.size(); i++) {
        if (args[i].is_buffer &&
            CodeGen_GPU_Dev::is_buffer_constant(s, args[i].name) &&
            args[i].size > 0) {
            constants.emplace_back(args[i].name, args[i].size);
        }
    }

    // Sort the constant candidates from smallest to largest. This will put
    // as many of the constant allocations in __constant as possible.
    // Ideally, we would prioritize constant buffers by how frequently they
    // are accessed.
    sort(constants.begin(), constants.end());

    // Compute the cumulative sum of the constants.
    for (size_t i = 1; i < constants.size(); i++) {
        constants[i].size += constants[i - 1].size;
    }

    // Create preprocessor replacements for the address spaces of all our buffers.
    stream << "// Address spaces for " << name << "\n";
    for (size_t i = 0; i < args.size(); i++) {
        if (args[i].is_buffer) {
            vector<BufferSize>::iterator constant = constants.begin();
            while (constant != constants.end() &&
                   constant->name != args[i].name) {
                constant++;
            }

            if (constant != constants.end()) {
                stream << "#if " << constant->size << " <= MAX_CONSTANT_BUFFER_SIZE && "
                       << constant - constants.begin() << " < MAX_CONSTANT_ARGS\n";
                stream << "#define " << get_memory_space(args[i].name) << " __constant\n";
                stream << "#else\n";
                stream << "#define " << get_memory_space(args[i].name) << " __global\n";
                stream << "#endif\n";
            } else {
                stream << "#define " << get_memory_space(args[i].name) << " __global\n";
            }
        }
    }

    // Emit the function prototype.
    stream << "__kernel void " << name << "(\n";
    for (size_t i = 0; i < args.size(); i++) {
        if (args[i].is_buffer) {
            stream << " " << get_memory_space(args[i].name) << " ";
            if (!args[i].write) stream << "const ";
            stream << print_type(args[i].type) << " *"
                   << "restrict "
                   << print_name(args[i].name);
            Allocation alloc;
            alloc.type = args[i].type;
            allocations.push(args[i].name, alloc);
        } else {
            Type t = args[i].type;
            string name = args[i].name;
            // Bools are passed as a uint8.
            t = t.with_bits(t.bytes() * 8);
            // float16 are passed as uints
            if (t.is_float() && t.bits() < 32) {
                t = t.with_code(halide_type_uint);
                name += "_bits";
            }
            stream << " const "
                   << print_type(t)
                   << " "
                   << print_name(name);
        }

        if (i < args.size() - 1) stream << ",\n";
    }

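    // There is at most one GPUShared allocation per kernel (enforced by the
    // assert below); find it so it can be bound to the __local argument added
    // to the prototype.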
    class FindShared : public IRVisitor {
        using IRVisitor::visit;
        void visit(const Allocate *op) override {
            if (op->memory_type == MemoryType::GPUShared) {
                internal_assert(alloc == nullptr)
                    << "Found multiple shared allocations in OpenCL kernel\n";
                alloc = op;
            }
        }

    public:
        const Allocate *alloc = nullptr;
    } find_shared;
    s.accept(&find_shared);

    if (find_shared.alloc) {
        shared_name = find_shared.alloc->name;
    } else {
        shared_name = "__shared";
    }
    // Note that int16 below is an int32x16, not an int16_t. The type
    // is chosen to be large to maximize alignment.
    stream << ",\n"
           << " __local int16* "
           << print_name(shared_name)
           << ")\n";

    open_scope();

    // Reinterpret half args passed as uint16 back to half
    for (size_t i = 0; i < args.size(); i++) {
        if (!args[i].is_buffer &&
            args[i].type.is_float() &&
            args[i].type.bits() < 32) {
            stream << " const " << print_type(args[i].type)
                   << " " << print_name(args[i].name)
                   << " = half_from_bits(" << print_name(args[i].name + "_bits") << ");\n";
        }
    }

    print(s);
    close_scope("kernel " + name);

    for (size_t i = 0; i < args.size(); i++) {
        // Remove buffer arguments from allocation scope
        if (args[i].is_buffer) {
            allocations.pop(args[i].name);
        }
    }

    // Undef all the buffer address spaces, in case they're different in another kernel.
    for (size_t i = 0; i < args.size(); i++) {
        if (args[i].is_buffer) {
            stream << "#undef " << get_memory_space(args[i].name) << "\n";
        }
    }
}

void CodeGen_OpenCL_Dev::init_module() {
    debug(2) << "OpenCL device codegen init_module\n";

    // wipe the internal kernel source
    src_stream.str("");
    src_stream.clear();

    const Target &target = clc.get_target();

    // This identifies the program as OpenCL C (as opposed to SPIR).
    src_stream << "/*OpenCL C " << target.to_string() << "*/\n";

    src_stream << "#pragma OPENCL FP_CONTRACT ON\n";

    // Write out the Halide math functions.
    src_stream << "inline float float_from_bits(unsigned int x) {return as_float(x);}\n"
               << "inline float nan_f32() { return NAN; }\n"
               << "inline float neg_inf_f32() { return -INFINITY; }\n"
               << "inline float inf_f32() { return INFINITY; }\n"
               << "inline bool is_nan_f32(float x) {return isnan(x); }\n"
               << "inline bool is_inf_f32(float x) {return isinf(x); }\n"
               << "inline bool is_finite_f32(float x) {return isfinite(x); }\n"
               << "#define sqrt_f32 sqrt \n"
               << "#define sin_f32 sin \n"
               << "#define cos_f32 cos \n"
               << "#define exp_f32 exp \n"
               << "#define log_f32 log \n"
               << "#define abs_f32 fabs \n"
               << "#define floor_f32 floor \n"
               << "#define ceil_f32 ceil \n"
               << "#define round_f32 round \n"
               << "#define trunc_f32 trunc \n"
               << "#define pow_f32 pow\n"
               << "#define asin_f32 asin \n"
               << "#define acos_f32 acos \n"
               << "#define tan_f32 tan \n"
               << "#define atan_f32 atan \n"
               << "#define atan2_f32 atan2\n"
               << "#define sinh_f32 sinh \n"
               << "#define asinh_f32 asinh \n"
               << "#define cosh_f32 cosh \n"
               << "#define acosh_f32 acosh \n"
               << "#define tanh_f32 tanh \n"
               << "#define atanh_f32 atanh \n"
               << "#define fast_inverse_f32 native_recip \n"
               << "#define fast_inverse_sqrt_f32 native_rsqrt \n";

    // There does not appear to be a reliable way to safely ignore unused
    // variables in OpenCL C. See https://github.com/halide/Halide/issues/4918.
    src_stream << "#define halide_unused(x)\n";

    if (target.has_feature(Target::CLDoubles)) {
        src_stream << "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
                   << "inline bool is_nan_f64(double x) {return isnan(x); }\n"
                   << "inline bool is_inf_f64(double x) {return isinf(x); }\n"
                   << "inline bool is_finite_f64(double x) {return isfinite(x); }\n"
                   << "#define sqrt_f64 sqrt\n"
                   << "#define sin_f64 sin\n"
                   << "#define cos_f64 cos\n"
                   << "#define exp_f64 exp\n"
                   << "#define log_f64 log\n"
                   << "#define abs_f64 fabs\n"
                   << "#define floor_f64 floor\n"
                   << "#define ceil_f64 ceil\n"
                   << "#define round_f64 round\n"
                   << "#define trunc_f64 trunc\n"
                   << "#define pow_f64 pow\n"
                   << "#define asin_f64 asin\n"
                   << "#define acos_f64 acos\n"
                   << "#define tan_f64 tan\n"
                   << "#define atan_f64 atan\n"
                   << "#define atan2_f64 atan2\n"
                   << "#define sinh_f64 sinh\n"
                   << "#define asinh_f64 asinh\n"
                   << "#define cosh_f64 cosh\n"
                   << "#define acosh_f64 acosh\n"
                   << "#define tanh_f64 tanh\n"
                   << "#define atanh_f64 atanh\n";
    }

    if (target.has_feature(Target::CLHalf)) {
        src_stream << "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
                   << "inline half half_from_bits(unsigned short x) {return __builtin_astype(x, half);}\n"
                   << "inline half nan_f16() { return half_from_bits(32767); }\n"
                   << "inline half neg_inf_f16() { return half_from_bits(64512); }\n"
                   << "inline half inf_f16() { return half_from_bits(31744); }\n"
                   << "inline bool is_nan_f16(half x) {return isnan(x); }\n"
                   << "inline bool is_inf_f16(half x) {return isinf(x); }\n"
                   << "inline bool is_finite_f16(half x) {return isfinite(x); }\n"
                   << "#define sqrt_f16 sqrt\n"
                   << "#define sin_f16 sin\n"
                   << "#define cos_f16 cos\n"
                   << "#define exp_f16 exp\n"
                   << "#define log_f16 log\n"
                   << "#define abs_f16 fabs\n"
                   << "#define floor_f16 floor\n"
                   << "#define ceil_f16 ceil\n"
                   << "#define round_f16 round\n"
                   << "#define trunc_f16 trunc\n"
                   << "#define pow_f16 pow\n"
                   << "#define asin_f16 asin\n"
                   << "#define acos_f16 acos\n"
                   << "#define tan_f16 tan\n"
                   << "#define atan_f16 atan\n"
                   << "#define atan2_f16 atan2\n"
                   << "#define sinh_f16 sinh\n"
                   << "#define asinh_f16 asinh\n"
                   << "#define cosh_f16 cosh\n"
                   << "#define acosh_f16 acosh\n"
                   << "#define tanh_f16 tanh\n"
                   << "#define atanh_f16 atanh\n";
    }

    if (target.has_feature(Target::CLAtomics64)) {
        src_stream << "#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable\n";
        src_stream << "#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable\n";
    }

    src_stream << "\n";

    clc.add_common_macros(src_stream);

    // Add at least one kernel to avoid errors on some implementations for functions
    // without any GPU schedules.
    src_stream << "__kernel void _at_least_one_kernel(int x) { }\n";

    cur_kernel_name = "";
}

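// Return the accumulated OpenCL C source as a NUL-terminated character buffer.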
vector<char> CodeGen_OpenCL_Dev::compile_to_src() {
    string str = src_stream.str();
    debug(1) << "OpenCL kernel:\n"
             << str << "\n";
    vector<char> buffer(str.begin(), str.end());
    buffer.push_back(0);
    return buffer;
}

string CodeGen_OpenCL_Dev::get_current_kernel_name() {
    return cur_kernel_name;
}

void CodeGen_OpenCL_Dev::dump() {
    std::cerr << src_stream.str() << "\n";
}

std::string CodeGen_OpenCL_Dev::print_gpu_name(const std::string &name) {
    return name;
}

}  // namespace Internal
}  // namespace Halide