1 #include <iostream>
2 #include <limits>
3
4 #include "CodeGen_C.h"
5 #include "CodeGen_Internal.h"
6 #include "Deinterleave.h"
7 #include "IROperator.h"
8 #include "Lerp.h"
9 #include "Param.h"
10 #include "Simplify.h"
11 #include "Substitute.h"
12 #include "Type.h"
13 #include "Util.h"
14 #include "Var.h"
15
16 namespace Halide {
17 namespace Internal {
18
19 using std::map;
20 using std::ostream;
21 using std::ostringstream;
22 using std::string;
23 using std::vector;
24
25 extern "C" unsigned char halide_internal_initmod_inlined_c[];
26 extern "C" unsigned char halide_internal_runtime_header_HalideRuntime_h[];
27 extern "C" unsigned char halide_internal_runtime_header_HalideRuntimeCuda_h[];
28 extern "C" unsigned char halide_internal_runtime_header_HalideRuntimeHexagonHost_h[];
29 extern "C" unsigned char halide_internal_runtime_header_HalideRuntimeMetal_h[];
30 extern "C" unsigned char halide_internal_runtime_header_HalideRuntimeOpenCL_h[];
31 extern "C" unsigned char halide_internal_runtime_header_HalideRuntimeOpenGLCompute_h[];
32 extern "C" unsigned char halide_internal_runtime_header_HalideRuntimeOpenGL_h[];
33 extern "C" unsigned char halide_internal_runtime_header_HalideRuntimeQurt_h[];
34 extern "C" unsigned char halide_internal_runtime_header_HalideRuntimeD3D12Compute_h[];
35
36 namespace {
37
38 // HALIDE_MUST_USE_RESULT defined here is intended to exactly
39 // duplicate the definition in HalideRuntime.h (so that either or
40 // both can be present, in any order).
41 const char *const kDefineMustUseResult = R"INLINE_CODE(#ifndef HALIDE_MUST_USE_RESULT
42 #ifdef __has_attribute
43 #if __has_attribute(nodiscard)
44 #define HALIDE_MUST_USE_RESULT [[nodiscard]]
45 #elif __has_attribute(warn_unused_result)
46 #define HALIDE_MUST_USE_RESULT __attribute__((warn_unused_result))
47 #else
48 #define HALIDE_MUST_USE_RESULT
49 #endif
50 #else
51 #define HALIDE_MUST_USE_RESULT
52 #endif
53 #endif
54 )INLINE_CODE";
55
56 const string headers =
57 "#include <iostream>\n"
58 "#include <math.h>\n"
59 "#include <float.h>\n"
60 "#include <assert.h>\n"
61 "#include <limits.h>\n"
62 "#include <string.h>\n"
63 "#include <stdio.h>\n"
64 "#include <stdint.h>\n";
65
66 // We now add definitions of things in the runtime which are
67 // intended to be inlined into every module but are only expressed
68 // in .ll. The redundancy is regrettable (FIXME).
69 const string globals = R"INLINE_CODE(
70 extern "C" {
71 int64_t halide_current_time_ns(void *ctx);
72 void halide_profiler_pipeline_end(void *, void *);
73 }
74
75 #ifdef _WIN32
76 __declspec(dllimport) float __cdecl roundf(float);
77 __declspec(dllimport) double __cdecl round(double);
78 #else
79 inline float asinh_f32(float x) {return asinhf(x);}
80 inline float acosh_f32(float x) {return acoshf(x);}
81 inline float atanh_f32(float x) {return atanhf(x);}
82 inline double asinh_f64(double x) {return asinh(x);}
83 inline double acosh_f64(double x) {return acosh(x);}
84 inline double atanh_f64(double x) {return atanh(x);}
85 #endif
86 inline float sqrt_f32(float x) {return sqrtf(x);}
87 inline float sin_f32(float x) {return sinf(x);}
88 inline float asin_f32(float x) {return asinf(x);}
89 inline float cos_f32(float x) {return cosf(x);}
90 inline float acos_f32(float x) {return acosf(x);}
91 inline float tan_f32(float x) {return tanf(x);}
92 inline float atan_f32(float x) {return atanf(x);}
93 inline float atan2_f32(float x, float y) {return atan2f(x, y);}
94 inline float sinh_f32(float x) {return sinhf(x);}
95 inline float cosh_f32(float x) {return coshf(x);}
96 inline float tanh_f32(float x) {return tanhf(x);}
97 inline float hypot_f32(float x, float y) {return hypotf(x, y);}
98 inline float exp_f32(float x) {return expf(x);}
99 inline float log_f32(float x) {return logf(x);}
100 inline float pow_f32(float x, float y) {return powf(x, y);}
101 inline float floor_f32(float x) {return floorf(x);}
102 inline float ceil_f32(float x) {return ceilf(x);}
103 inline float round_f32(float x) {return roundf(x);}
104
105 inline double sqrt_f64(double x) {return sqrt(x);}
106 inline double sin_f64(double x) {return sin(x);}
107 inline double asin_f64(double x) {return asin(x);}
108 inline double cos_f64(double x) {return cos(x);}
109 inline double acos_f64(double x) {return acos(x);}
110 inline double tan_f64(double x) {return tan(x);}
111 inline double atan_f64(double x) {return atan(x);}
112 inline double atan2_f64(double x, double y) {return atan2(x, y);}
113 inline double sinh_f64(double x) {return sinh(x);}
114 inline double cosh_f64(double x) {return cosh(x);}
115 inline double tanh_f64(double x) {return tanh(x);}
116 inline double hypot_f64(double x, double y) {return hypot(x, y);}
117 inline double exp_f64(double x) {return exp(x);}
118 inline double log_f64(double x) {return log(x);}
119 inline double pow_f64(double x, double y) {return pow(x, y);}
120 inline double floor_f64(double x) {return floor(x);}
121 inline double ceil_f64(double x) {return ceil(x);}
122 inline double round_f64(double x) {return round(x);}
123
124 inline float nan_f32() {return NAN;}
125 inline float neg_inf_f32() {return -INFINITY;}
126 inline float inf_f32() {return INFINITY;}
127 inline bool is_nan_f32(float x) {return isnan(x);}
128 inline bool is_nan_f64(double x) {return isnan(x);}
129 inline bool is_inf_f32(float x) {return isinf(x);}
130 inline bool is_inf_f64(double x) {return isinf(x);}
131 inline bool is_finite_f32(float x) {return isfinite(x);}
132 inline bool is_finite_f64(double x) {return isfinite(x);}
133
134 template<typename A, typename B>
135 inline A reinterpret(const B &b) {
136 #if __cplusplus >= 201103L
137 static_assert(sizeof(A) == sizeof(B), "type size mismatch");
138 #endif
139 A a;
140 memcpy(&a, &b, sizeof(a));
141 return a;
142 }
143 inline float float_from_bits(uint32_t bits) {
144 return reinterpret<float, uint32_t>(bits);
145 }
146
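// Portable bit-manipulation helpers for the popcount / count-leading-zeros /
// count-trailing-zeros intrinsics, implemented as simple loops.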
147 template<typename T>
148 inline int halide_popcount(T a) {
149 int bits_set = 0;
150 while (a != 0) {
151 bits_set += a & 1;
152 a >>= 1;
153 }
154 return bits_set;
155 }
156
157 template<typename T>
158 inline int halide_count_leading_zeros(T a) {
159 int leading_zeros = 0;
160 int bit = sizeof(a) * 8 - 1;
161 while (bit >= 0 && (a & (((T)1) << bit)) == 0) {
162 leading_zeros++;
163 bit--;
164 }
165 return leading_zeros;
166 }
167
168 template<typename T>
169 inline int halide_count_trailing_zeros(T a) {
170 int trailing_zeros = 0;
171 constexpr int bits = sizeof(a) * 8;
172 int bit = 0;
173 while (bit < bits && (a & (((T)1) << bit)) == 0) {
174 trailing_zeros++;
175 bit++;
176 }
177 return trailing_zeros;
178 }
179
180 template<typename T>
181 inline T halide_cpp_max(const T &a, const T &b) {return (a > b) ? a : b;}
182
183 template<typename T>
184 inline T halide_cpp_min(const T &a, const T &b) {return (a < b) ? a : b;}
185
186 template<typename T>
187 inline void halide_unused(const T&) {}
188
189 template<typename A, typename B>
190 const B &return_second(const A &a, const B &b) {
191 halide_unused(a);
192 return b;
193 }
194
195 namespace {
196 class HalideFreeHelper {
197 typedef void (*FreeFunction)(void *user_context, void *p);
198 void * user_context;
199 void *p;
200 FreeFunction free_function;
201 public:
202 HalideFreeHelper(void *user_context, void *p, FreeFunction free_function)
203 : user_context(user_context), p(p), free_function(free_function) {}
204 ~HalideFreeHelper() { free(); }
205 void free() {
206 if (p) {
207 // TODO: do all free_functions guarantee to ignore a nullptr?
208 free_function(user_context, p);
209 p = nullptr;
210 }
211 }
212 };
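// Sketch of typical use in generated code (names illustrative): wrap an
// allocation so it is released on any exit path, e.g.
//   HalideFreeHelper helper(_ucon, ptr, halide_free);
// and optionally call helper.free() to release it early.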
213 } // namespace
214 )INLINE_CODE";
215 } // namespace
216
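// Walks the IR to record which vector types and loop (ForType) kinds are used,
// so that compile() can emit only the vector typedefs it needs and can detect
// the use of GPU loops.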
217 class TypeInfoGatherer : public IRGraphVisitor {
218 private:
219 using IRGraphVisitor::include;
220 using IRGraphVisitor::visit;
221
222 void include_type(const Type &t) {
223 if (t.is_vector()) {
224 if (t.is_bool()) {
225 // bool vectors are always emitted as uint8 in the C++ backend
226 // TODO: on some architectures, we could do better by choosing
227 // a bitwidth that matches the other vectors in use; EliminateBoolVectors
228 // could be used for this with a bit of work.
229 vector_types_used.insert(UInt(8).with_lanes(t.lanes()));
230 } else if (!t.is_handle()) {
231 // Vector-handle types can be seen when processing (e.g.)
232 // require() statements that are vectorized, but they
233 // will all be scalarized away prior to use, so don't emit
234 // them.
235 vector_types_used.insert(t);
236 if (t.is_int()) {
237 // If we are including an int-vector type, also include
238 // the same-width uint-vector type; there are various operations
239 // that can use uint vectors for intermediate results (e.g. lerp(),
240 // but also Mod, which can generate a call to abs() for int types,
241 // which always produces uint results for int inputs in Halide);
242 // it's easier to just err on the side of extra vectors we don't
243 // use since they are just type declarations.
244 vector_types_used.insert(t.with_code(halide_type_uint));
245 }
246 }
247 }
248 }
249
250 void include_lerp_types(const Type &t) {
251 if (t.is_vector() && t.is_int_or_uint() && (t.bits() >= 8 && t.bits() <= 32)) {
252 Type doubled = t.with_bits(t.bits() * 2);
253 include_type(doubled);
254 }
255 }
256
257 protected:
258 void include(const Expr &e) override {
259 include_type(e.type());
260 IRGraphVisitor::include(e);
261 }
262
263 // GCC's __builtin_shuffle takes an integer vector of
264 // the size of its input vector. Make sure this type exists.
265 void visit(const Shuffle *op) override {
266 vector_types_used.insert(Int(32, op->vectors[0].type().lanes()));
267 IRGraphVisitor::visit(op);
268 }
269
270 void visit(const For *op) override {
271 for_types_used.insert(op->for_type);
272 IRGraphVisitor::visit(op);
273 }
274
275 void visit(const Ramp *op) override {
276 include_type(op->type.with_lanes(op->lanes));
277 IRGraphVisitor::visit(op);
278 }
279
280 void visit(const Broadcast *op) override {
281 include_type(op->type.with_lanes(op->lanes));
282 IRGraphVisitor::visit(op);
283 }
284
285 void visit(const Cast *op) override {
286 include_type(op->type);
287 IRGraphVisitor::visit(op);
288 }
289
290 void visit(const Call *op) override {
291 include_type(op->type);
292 if (op->is_intrinsic(Call::lerp)) {
293 // lower_lerp() can synthesize wider vector types.
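// For example, a lerp() on uint8 vectors is typically computed with uint16
// intermediates, so the doubled-width vector type must be declared too.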
294 for (auto &a : op->args) {
295 include_lerp_types(a.type());
296 }
297 }
298
299 IRGraphVisitor::visit(op);
300 }
301
302 public:
303 std::set<ForType> for_types_used;
304 std::set<Type> vector_types_used;
305 };
306
307 CodeGen_C::CodeGen_C(ostream &s, Target t, OutputKind output_kind, const std::string &guard)
308 : IRPrinter(s), id("$$ BAD ID $$"), target(t), output_kind(output_kind),
309 extern_c_open(false), inside_atomic_mutex_node(false), emit_atomic_stores(false) {
310
311 if (is_header()) {
312 // If it's a header, emit an include guard.
313 stream << "#ifndef HALIDE_" << print_name(guard) << "\n"
314 << "#define HALIDE_" << print_name(guard) << "\n"
315 << "#include <stdint.h>\n"
316 << "\n"
317 << "// Forward declarations of the types used in the interface\n"
318 << "// to the Halide pipeline.\n"
319 << "//\n";
320 if (target.has_feature(Target::NoRuntime)) {
321 stream << "// For the definitions of these structs, include HalideRuntime.h\n";
322 } else {
323 stream << "// Definitions for these structs are below.\n";
324 }
325 stream << "\n"
326 << "// Halide's representation of a multi-dimensional array.\n"
327 << "// Halide::Runtime::Buffer is a more user-friendly wrapper\n"
328 << "// around this. Its declaration is in HalideBuffer.h\n"
329 << "struct halide_buffer_t;\n"
330 << "\n"
331 << "// Metadata describing the arguments to the generated function.\n"
332 << "// Used to construct calls to the _argv version of the function.\n"
333 << "struct halide_filter_metadata_t;\n"
334 << "\n";
335 // We just forward declared the following types:
336 forward_declared.insert(type_of<halide_buffer_t *>().handle_type);
337 forward_declared.insert(type_of<halide_filter_metadata_t *>().handle_type);
338 } else if (is_extern_decl()) {
339 // Extern decls to be wrapped inside other code (e.g. Python extensions);
340 // emit the forward decls with a minimum of noise. Note that we never
341 // mess with legacy buffer types in this case.
342 stream << "struct halide_buffer_t;\n"
343 << "struct halide_filter_metadata_t;\n"
344 << "\n";
345 forward_declared.insert(type_of<halide_buffer_t *>().handle_type);
346 forward_declared.insert(type_of<halide_filter_metadata_t *>().handle_type);
347 } else {
348 // Include declarations of everything generated C source might want
349 stream
350 << headers
351 << globals
352 << halide_internal_runtime_header_HalideRuntime_h << "\n"
353 << halide_internal_initmod_inlined_c << "\n";
354 add_common_macros(stream);
355 stream << "\n";
356 }
357
358 stream << kDefineMustUseResult << "\n";
359
360 // Throw in a default (empty) definition of HALIDE_FUNCTION_ATTRS
361 // (some hosts may define this to e.g. __attribute__((warn_unused_result)))
362 stream << "#ifndef HALIDE_FUNCTION_ATTRS\n";
363 stream << "#define HALIDE_FUNCTION_ATTRS\n";
364 stream << "#endif\n";
365 }
366
367 CodeGen_C::~CodeGen_C() {
368 set_name_mangling_mode(NameMangling::Default);
369
370 if (is_header()) {
371 if (!target.has_feature(Target::NoRuntime)) {
372 stream << "\n"
373 << "// The generated object file that goes with this header\n"
374 << "// includes a full copy of the Halide runtime so that it\n"
375 << "// can be used standalone. Declarations for the functions\n"
376 << "// in the Halide runtime are below.\n";
377 if (target.os == Target::Windows) {
378 stream
379 << "//\n"
380 << "// The inclusion of this runtime means that it is not legal\n"
381 << "// to link multiple Halide-generated object files together.\n"
382 << "// This problem is Windows-specific. On other platforms, we\n"
383 << "// use weak linkage.\n";
384 } else {
385 stream
386 << "//\n"
387 << "// The runtime is defined using weak linkage, so it is legal\n"
388 << "// to link multiple Halide-generated object files together,\n"
389 << "// or to clobber any of these functions with your own\n"
390 << "// definition.\n";
391 }
392 stream << "//\n"
393 << "// To generate an object file without a full copy of the\n"
394 << "// runtime, use the -no_runtime target flag. To generate a\n"
395 << "// standalone Halide runtime to use with such object files\n"
396 << "// use the -r flag with any Halide generator binary, e.g.:\n"
397 << "// $ ./my_generator -r halide_runtime -o . target=host\n"
398 << "\n"
399 << halide_internal_runtime_header_HalideRuntime_h << "\n";
400 if (target.has_feature(Target::CUDA)) {
401 stream << halide_internal_runtime_header_HalideRuntimeCuda_h << "\n";
402 }
403 if (target.has_feature(Target::HVX_128) ||
404 target.has_feature(Target::HVX_64)) {
405 stream << halide_internal_runtime_header_HalideRuntimeHexagonHost_h << "\n";
406 }
407 if (target.has_feature(Target::Metal)) {
408 stream << halide_internal_runtime_header_HalideRuntimeMetal_h << "\n";
409 }
410 if (target.has_feature(Target::OpenCL)) {
411 stream << halide_internal_runtime_header_HalideRuntimeOpenCL_h << "\n";
412 }
413 if (target.has_feature(Target::OpenGLCompute)) {
414 stream << halide_internal_runtime_header_HalideRuntimeOpenGLCompute_h << "\n";
415 }
416 if (target.has_feature(Target::OpenGL)) {
417 stream << halide_internal_runtime_header_HalideRuntimeOpenGL_h << "\n";
418 }
419 if (target.has_feature(Target::D3D12Compute)) {
420 stream << halide_internal_runtime_header_HalideRuntimeD3D12Compute_h << "\n";
421 }
422 }
423 stream << "#endif\n";
424 }
425 }
426
427 void CodeGen_C::add_common_macros(std::ostream &dest) {
428 const char *macros = R"INLINE_CODE(
429 // ll suffix in OpenCL is reserved for 128-bit integers.
430 #if defined __OPENCL_VERSION__
431 #define ADD_INT64_T_SUFFIX(x) x##l
432 #define ADD_UINT64_T_SUFFIX(x) x##ul
433 // HLSL doesn't have any suffixes.
434 #elif defined HLSL_VERSION
435 #define ADD_INT64_T_SUFFIX(x) x
436 #define ADD_UINT64_T_SUFFIX(x) x
437 #else
438 #define ADD_INT64_T_SUFFIX(x) x##ll
439 #define ADD_UINT64_T_SUFFIX(x) x##ull
440 #endif
441 )INLINE_CODE";
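    // For illustration: given the macros above, ADD_INT64_T_SUFFIX(123) expands
    // to 123ll for plain C/C++, 123l under OpenCL, and just 123 under HLSL.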
442 dest << macros;
443 }
444
445 void CodeGen_C::add_vector_typedefs(const std::set<Type> &vector_types) {
446 if (!vector_types.empty()) {
447 // MSVC has a limit of ~16k for string literals, so split
448 // up these declarations accordingly
449 const char *cpp_vector_decl = R"INLINE_CODE(
450 #if !defined(__has_attribute)
451 #define __has_attribute(x) 0
452 #endif
453
454 #if !defined(__has_builtin)
455 #define __has_builtin(x) 0
456 #endif
457
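// Scalar fallback vector: a fixed-size array of ElementType where every
// operation is an explicit element-wise loop. Used when no native vector
// extension is available (or when HALIDE_CPP_ALWAYS_USE_CPP_VECTORS is set).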
458 template <typename ElementType_, size_t Lanes_>
459 class CppVector {
460 public:
461 typedef ElementType_ ElementType;
462 static const size_t Lanes = Lanes_;
463 typedef CppVector<ElementType, Lanes> Vec;
464 typedef CppVector<uint8_t, Lanes> Mask;
465
466 CppVector &operator=(const Vec &src) {
467 if (this != &src) {
468 for (size_t i = 0; i < Lanes; i++) {
469 elements[i] = src[i];
470 }
471 }
472 return *this;
473 }
474
475 /* not-explicit */ CppVector(const Vec &src) {
476 for (size_t i = 0; i < Lanes; i++) {
477 elements[i] = src[i];
478 }
479 }
480
481 CppVector() {
482 for (size_t i = 0; i < Lanes; i++) {
483 elements[i] = 0;
484 }
485 }
486
487 static Vec broadcast(const ElementType &v) {
488 Vec r(empty);
489 for (size_t i = 0; i < Lanes; i++) {
490 r.elements[i] = v;
491 }
492 return r;
493 }
494
495 static Vec ramp(const ElementType &base, const ElementType &stride) {
496 Vec r(empty);
497 for (size_t i = 0; i < Lanes; i++) {
498 r.elements[i] = base + stride * i;
499 }
500 return r;
501 }
502
503 static Vec load(const void *base, int32_t offset) {
504 Vec r(empty);
505 memcpy(&r.elements[0], ((const ElementType*)base + offset), sizeof(r.elements));
506 return r;
507 }
508
509 // gather
510 static Vec load(const void *base, const CppVector<int32_t, Lanes> &offset) {
511 Vec r(empty);
512 for (size_t i = 0; i < Lanes; i++) {
513 r.elements[i] = ((const ElementType*)base)[offset[i]];
514 }
515 return r;
516 }
517
518 void store(void *base, int32_t offset) const {
519 memcpy(((ElementType*)base + offset), &this->elements[0], sizeof(this->elements));
520 }
521
522 // scatter
523 void store(void *base, const CppVector<int32_t, Lanes> &offset) const {
524 for (size_t i = 0; i < Lanes; i++) {
525 ((ElementType*)base)[offset[i]] = elements[i];
526 }
527 }
528
529 static Vec shuffle(const Vec &a, const int32_t indices[Lanes]) {
530 Vec r(empty);
531 for (size_t i = 0; i < Lanes; i++) {
532 if (indices[i] < 0) {
533 continue;
534 }
535 r.elements[i] = a[indices[i]];
536 }
537 return r;
538 }
539
540 template<size_t InputLanes>
541 static Vec concat(size_t count, const CppVector<ElementType, InputLanes> vecs[]) {
542 Vec r(empty);
543 for (size_t i = 0; i < Lanes; i++) {
544 r.elements[i] = vecs[i / InputLanes][i % InputLanes];
545 }
546 return r;
547 }
548
549 Vec replace(size_t i, const ElementType &b) const {
550 Vec r = *this;
551 r.elements[i] = b;
552 return r;
553 }
554
555 ElementType operator[](size_t i) const {
556 return elements[i];
557 }
558
559 Vec operator~() const {
560 Vec r(empty);
561 for (size_t i = 0; i < Lanes; i++) {
562 r.elements[i] = ~elements[i];
563 }
564 return r;
565 }
566 Vec operator!() const {
567 Vec r(empty);
568 for (size_t i = 0; i < Lanes; i++) {
569 r.elements[i] = !elements[i];
570 }
571 return r;
572 }
573
574 friend Vec operator+(const Vec &a, const Vec &b) {
575 Vec r(empty);
576 for (size_t i = 0; i < Lanes; i++) {
577 r.elements[i] = a[i] + b[i];
578 }
579 return r;
580 }
581 friend Vec operator-(const Vec &a, const Vec &b) {
582 Vec r(empty);
583 for (size_t i = 0; i < Lanes; i++) {
584 r.elements[i] = a[i] - b[i];
585 }
586 return r;
587 }
588 friend Vec operator*(const Vec &a, const Vec &b) {
589 Vec r(empty);
590 for (size_t i = 0; i < Lanes; i++) {
591 r.elements[i] = a[i] * b[i];
592 }
593 return r;
594 }
595 friend Vec operator/(const Vec &a, const Vec &b) {
596 Vec r(empty);
597 for (size_t i = 0; i < Lanes; i++) {
598 r.elements[i] = a[i] / b[i];
599 }
600 return r;
601 }
602 friend Vec operator%(const Vec &a, const Vec &b) {
603 Vec r(empty);
604 for (size_t i = 0; i < Lanes; i++) {
605 r.elements[i] = a[i] % b[i];
606 }
607 return r;
608 }
609 template <typename OtherElementType>
610 friend Vec operator<<(const Vec &a, const CppVector<OtherElementType, Lanes> &b) {
611 Vec r(empty);
612 for (size_t i = 0; i < Lanes; i++) {
613 r.elements[i] = a[i] << b[i];
614 }
615 return r;
616 }
617 template <typename OtherElementType>
618 friend Vec operator>>(const Vec &a, const CppVector<OtherElementType, Lanes> &b) {
619 Vec r(empty);
620 for (size_t i = 0; i < Lanes; i++) {
621 r.elements[i] = a[i] >> b[i];
622 }
623 return r;
624 }
625 friend Vec operator&(const Vec &a, const Vec &b) {
626 Vec r(empty);
627 for (size_t i = 0; i < Lanes; i++) {
628 r.elements[i] = a[i] & b[i];
629 }
630 return r;
631 }
632 friend Vec operator|(const Vec &a, const Vec &b) {
633 Vec r(empty);
634 for (size_t i = 0; i < Lanes; i++) {
635 r.elements[i] = a[i] | b[i];
636 }
637 return r;
638 }
639
640 friend Vec operator&&(const Vec &a, const Vec &b) {
641 Vec r(empty);
642 for (size_t i = 0; i < Lanes; i++) {
643 r.elements[i] = a[i] && b[i];
644 }
645 return r;
646 }
647 friend Vec operator||(const Vec &a, const Vec &b) {
648 Vec r(empty);
649 for (size_t i = 0; i < Lanes; i++) {
650 r.elements[i] = a[i] || b[i];
651 }
652 return r;
653 }
654
655 friend Vec operator+(const Vec &a, const ElementType &b) {
656 Vec r(empty);
657 for (size_t i = 0; i < Lanes; i++) {
658 r.elements[i] = a[i] + b;
659 }
660 return r;
661 }
662 friend Vec operator-(const Vec &a, const ElementType &b) {
663 Vec r(empty);
664 for (size_t i = 0; i < Lanes; i++) {
665 r.elements[i] = a[i] - b;
666 }
667 return r;
668 }
669 friend Vec operator*(const Vec &a, const ElementType &b) {
670 Vec r(empty);
671 for (size_t i = 0; i < Lanes; i++) {
672 r.elements[i] = a[i] * b;
673 }
674 return r;
675 }
676 friend Vec operator/(const Vec &a, const ElementType &b) {
677 Vec r(empty);
678 for (size_t i = 0; i < Lanes; i++) {
679 r.elements[i] = a[i] / b;
680 }
681 return r;
682 }
683 friend Vec operator%(const Vec &a, const ElementType &b) {
684 Vec r(empty);
685 for (size_t i = 0; i < Lanes; i++) {
686 r.elements[i] = a[i] % b;
687 }
688 return r;
689 }
690 friend Vec operator>>(const Vec &a, const ElementType &b) {
691 Vec r(empty);
692 for (size_t i = 0; i < Lanes; i++) {
693 r.elements[i] = a[i] >> b;
694 }
695 return r;
696 }
697 friend Vec operator<<(const Vec &a, const ElementType &b) {
698 Vec r(empty);
699 for (size_t i = 0; i < Lanes; i++) {
700 r.elements[i] = a[i] << b;
701 }
702 return r;
703 }
704 friend Vec operator&(const Vec &a, const ElementType &b) {
705 Vec r(empty);
706 for (size_t i = 0; i < Lanes; i++) {
707 r.elements[i] = a[i] & b;
708 }
709 return r;
710 }
711 friend Vec operator|(const Vec &a, const ElementType &b) {
712 Vec r(empty);
713 for (size_t i = 0; i < Lanes; i++) {
714 r.elements[i] = a[i] | b;
715 }
716 return r;
717 }
718 friend Vec operator&&(const Vec &a, const ElementType &b) {
719 Vec r(empty);
720 for (size_t i = 0; i < Lanes; i++) {
721 r.elements[i] = a[i] && b;
722 }
723 return r;
724 }
725 friend Vec operator||(const Vec &a, const ElementType &b) {
726 Vec r(empty);
727 for (size_t i = 0; i < Lanes; i++) {
728 r.elements[i] = a[i] || b;
729 }
730 return r;
731 }
732
733 friend Vec operator+(const ElementType &a, const Vec &b) {
734 Vec r(empty);
735 for (size_t i = 0; i < Lanes; i++) {
736 r.elements[i] = a + b[i];
737 }
738 return r;
739 }
740 friend Vec operator-(const ElementType &a, const Vec &b) {
741 Vec r(empty);
742 for (size_t i = 0; i < Lanes; i++) {
743 r.elements[i] = a - b[i];
744 }
745 return r;
746 }
747 friend Vec operator*(const ElementType &a, const Vec &b) {
748 Vec r(empty);
749 for (size_t i = 0; i < Lanes; i++) {
750 r.elements[i] = a * b[i];
751 }
752 return r;
753 }
754 friend Vec operator/(const ElementType &a, const Vec &b) {
755 Vec r(empty);
756 for (size_t i = 0; i < Lanes; i++) {
757 r.elements[i] = a / b[i];
758 }
759 return r;
760 }
761 friend Vec operator%(const ElementType &a, const Vec &b) {
762 Vec r(empty);
763 for (size_t i = 0; i < Lanes; i++) {
764 r.elements[i] = a % b[i];
765 }
766 return r;
767 }
768 friend Vec operator>>(const ElementType &a, const Vec &b) {
769 Vec r(empty);
770 for (size_t i = 0; i < Lanes; i++) {
771 r.elements[i] = a >> b[i];
772 }
773 return r;
774 }
775 friend Vec operator<<(const ElementType &a, const Vec &b) {
776 Vec r(empty);
777 for (size_t i = 0; i < Lanes; i++) {
778 r.elements[i] = a << b[i];
779 }
780 return r;
781 }
782 friend Vec operator&(const ElementType &a, const Vec &b) {
783 Vec r(empty);
784 for (size_t i = 0; i < Lanes; i++) {
785 r.elements[i] = a & b[i];
786 }
787 return r;
788 }
789 friend Vec operator|(const ElementType &a, const Vec &b) {
790 Vec r(empty);
791 for (size_t i = 0; i < Lanes; i++) {
792 r.elements[i] = a | b[i];
793 }
794 return r;
795 }
796 friend Vec operator&&(const ElementType &a, const Vec &b) {
797 Vec r(empty);
798 for (size_t i = 0; i < Lanes; i++) {
799 r.elements[i] = a && b[i];
800 }
801 return r;
802 }
803 friend Vec operator||(const ElementType &a, const Vec &b) {
804 Vec r(empty);
805 for (size_t i = 0; i < Lanes; i++) {
806 r.elements[i] = a || b[i];
807 }
808 return r;
809 }
810
811 friend Mask operator<(const Vec &a, const Vec &b) {
812 Mask r;
813 for (size_t i = 0; i < Lanes; i++) {
814 r.elements[i] = a[i] < b[i] ? 0xff : 0x00;
815 }
816 return r;
817 }
818
819 friend Mask operator<=(const Vec &a, const Vec &b) {
820 Mask r;
821 for (size_t i = 0; i < Lanes; i++) {
822 r.elements[i] = a[i] <= b[i] ? 0xff : 0x00;
823 }
824 return r;
825 }
826
827 friend Mask operator>(const Vec &a, const Vec &b) {
828 Mask r;
829 for (size_t i = 0; i < Lanes; i++) {
830 r.elements[i] = a[i] > b[i] ? 0xff : 0x00;
831 }
832 return r;
833 }
834
835 friend Mask operator>=(const Vec &a, const Vec &b) {
836 Mask r;
837 for (size_t i = 0; i < Lanes; i++) {
838 r.elements[i] = a[i] >= b[i] ? 0xff : 0x00;
839 }
840 return r;
841 }
842
843 friend Mask operator==(const Vec &a, const Vec &b) {
844 Mask r;
845 for (size_t i = 0; i < Lanes; i++) {
846 r.elements[i] = a[i] == b[i] ? 0xff : 0x00;
847 }
848 return r;
849 }
850
851 friend Mask operator!=(const Vec &a, const Vec &b) {
852 Mask r;
853 for (size_t i = 0; i < Lanes; i++) {
854 r.elements[i] = a[i] != b[i] ? 0xff : 0x00;
855 }
856 return r;
857 }
858
859 static Vec select(const Mask &cond, const Vec &true_value, const Vec &false_value) {
860 Vec r(empty);
861 for (size_t i = 0; i < Lanes; i++) {
862 r.elements[i] = cond[i] ? true_value[i] : false_value[i];
863 }
864 return r;
865 }
866
867 template <typename OtherVec>
868 static Vec convert_from(const OtherVec &src) {
869 #if __cplusplus >= 201103L
870 static_assert(Vec::Lanes == OtherVec::Lanes, "Lanes mismatch");
871 #endif
872 Vec r(empty);
873 for (size_t i = 0; i < Lanes; i++) {
874 r.elements[i] = static_cast<typename Vec::ElementType>(src[i]);
875 }
876 return r;
877 }
878
879 static Vec max(const Vec &a, const Vec &b) {
880 Vec r(empty);
881 for (size_t i = 0; i < Lanes; i++) {
882 r.elements[i] = ::halide_cpp_max(a[i], b[i]);
883 }
884 return r;
885 }
886
887 static Vec min(const Vec &a, const Vec &b) {
888 Vec r(empty);
889 for (size_t i = 0; i < Lanes; i++) {
890 r.elements[i] = ::halide_cpp_min(a[i], b[i]);
891 }
892 return r;
893 }
894
895 private:
896 template <typename, size_t> friend class CppVector;
897 ElementType elements[Lanes];
898
899 // Leave vector uninitialized for cases where we overwrite every entry
900 enum Empty { empty };
901 CppVector(Empty) {}
902 };
903
904 )INLINE_CODE";
905
906 const char *native_vector_decl = R"INLINE_CODE(
907 #if __has_attribute(ext_vector_type) || __has_attribute(vector_size)
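// Wraps a compiler vector extension (clang's ext_vector_type or GCC's
// vector_size) so that most arithmetic maps onto native SIMD operations;
// element-wise loops are used only where no direct operator exists.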
908 template <typename ElementType_, size_t Lanes_>
909 class NativeVector {
910 public:
911 typedef ElementType_ ElementType;
912 static const size_t Lanes = Lanes_;
913 typedef NativeVector<ElementType, Lanes> Vec;
914 typedef NativeVector<uint8_t, Lanes> Mask;
915
916 #if __has_attribute(ext_vector_type)
917 typedef ElementType_ NativeVectorType __attribute__((ext_vector_type(Lanes), aligned(sizeof(ElementType))));
918 #elif __has_attribute(vector_size) || __GNUC__
919 typedef ElementType_ NativeVectorType __attribute__((vector_size(Lanes * sizeof(ElementType)), aligned(sizeof(ElementType))));
920 #endif
921
922 NativeVector &operator=(const Vec &src) {
923 if (this != &src) {
924 native_vector = src.native_vector;
925 }
926 return *this;
927 }
928
929 /* not-explicit */ NativeVector(const Vec &src) {
930 native_vector = src.native_vector;
931 }
932
933 NativeVector() {
934 native_vector = (NativeVectorType){};
935 }
936
937 static Vec broadcast(const ElementType &v) {
938 Vec zero; // Zero-initialized native vector.
939 return zero + v;
940 }
941
942 // TODO: this should be improved by taking advantage of native operator support.
943 static Vec ramp(const ElementType &base, const ElementType &stride) {
944 Vec r(empty);
945 for (size_t i = 0; i < Lanes; i++) {
946 r.native_vector[i] = base + stride * i;
947 }
948 return r;
949 }
950
951 // TODO: could this be improved by taking advantage of native operator support?
952 static Vec load(const void *base, int32_t offset) {
953 Vec r(empty);
954 // Note: do not use sizeof(NativeVectorType) here; if it's an unusual type
955 // (e.g. uint8x48, which could be produced by concat()), the actual implementation
956 // might be larger (e.g. it might really be a uint8x64). Only copy the amount
957 // that is in the logical type, to avoid possible overreads.
958 memcpy(&r.native_vector, ((const ElementType*)base + offset), sizeof(ElementType) * Lanes);
959 return r;
960 }
961
962 // gather
963 // TODO: could this be improved by taking advantage of native operator support?
964 static Vec load(const void *base, const NativeVector<int32_t, Lanes> &offset) {
965 Vec r(empty);
966 for (size_t i = 0; i < Lanes; i++) {
967 r.native_vector[i] = ((const ElementType*)base)[offset[i]];
968 }
969 return r;
970 }
971
972 // TODO: could this be improved by taking advantage of native operator support?
973 void store(void *base, int32_t offset) const {
974 // Note: do not use sizeof(NativeVectorType) here; if it's an unusual type
975 // (e.g. uint8x48, which could be produced by concat()), the actual implementation
976 // might be larger (e.g. it might really be a uint8x64). Only copy the amount
977 // that is in the logical type, to avoid possible overwrites.
978 memcpy(((ElementType*)base + offset), &native_vector, sizeof(ElementType) * Lanes);
979 }
980
981 // scatter
982 // TODO: could this be improved by taking advantage of native operator support?
983 void store(void *base, const NativeVector<int32_t, Lanes> &offset) const {
984 for (size_t i = 0; i < Lanes; i++) {
985 ((ElementType*)base)[offset[i]] = native_vector[i];
986 }
987 }
988
989 // TODO: this should be improved by taking advantage of native operator support.
990 static Vec shuffle(const Vec &a, const int32_t indices[Lanes]) {
991 Vec r(empty);
992 for (size_t i = 0; i < Lanes; i++) {
993 if (indices[i] < 0) {
994 continue;
995 }
996 r.native_vector[i] = a[indices[i]];
997 }
998 return r;
999 }
1000
1001 // TODO: this should be improved by taking advantage of native operator support.
1002 template<size_t InputLanes>
1003 static Vec concat(size_t count, const NativeVector<ElementType, InputLanes> vecs[]) {
1004 Vec r(empty);
1005 for (size_t i = 0; i < Lanes; i++) {
1006 r.native_vector[i] = vecs[i / InputLanes][i % InputLanes];
1007 }
1008 return r;
1009 }
1010
1011 // TODO: this should be improved by taking advantage of native operator support.
1012 Vec replace(size_t i, const ElementType &b) const {
1013 Vec r = *this;
1014 r.native_vector[i] = b;
1015 return r;
1016 }
1017
1018 ElementType operator[](size_t i) const {
1019 return native_vector[i];
1020 }
1021
1022 Vec operator~() const {
1023 return Vec(from_native_vector, ~native_vector);
1024 }
1025 Vec operator!() const {
1026 Vec r(empty);
1027 for (size_t i = 0; i < Lanes; i++) {
1028 r.native_vector[i] = !(*this)[i];
1029 }
1030 return r;
1031 }
1032
1033 friend Vec operator+(const Vec &a, const Vec &b) {
1034 return Vec(from_native_vector, a.native_vector + b.native_vector);
1035 }
1036 friend Vec operator-(const Vec &a, const Vec &b) {
1037 return Vec(from_native_vector, a.native_vector - b.native_vector);
1038 }
1039 friend Vec operator*(const Vec &a, const Vec &b) {
1040 return Vec(from_native_vector, a.native_vector * b.native_vector);
1041 }
1042 friend Vec operator/(const Vec &a, const Vec &b) {
1043 return Vec(from_native_vector, a.native_vector / b.native_vector);
1044 }
1045 friend Vec operator%(const Vec &a, const Vec &b) {
1046 return Vec(from_native_vector, a.native_vector % b.native_vector);
1047 }
1048 friend Vec operator&(const Vec &a, const Vec &b) {
1049 return Vec(from_native_vector, a.native_vector & b.native_vector);
1050 }
1051 friend Vec operator|(const Vec &a, const Vec &b) {
1052 return Vec(from_native_vector, a.native_vector | b.native_vector);
1053 }
1054 friend Vec operator&&(const Vec &a, const Vec &b) {
1055 Vec r(empty);
1056 for (size_t i = 0; i < Lanes; i++) {
1057 r.native_vector[i] = a.native_vector[i] && b.native_vector[i];
1058 }
1059 return r;
1060 }
1061 friend Vec operator||(const Vec &a, const Vec &b) {
1062 Vec r(empty);
1063 for (size_t i = 0; i < Lanes; i++) {
1064 r.native_vector[i] = a.native_vector[i] || b.native_vector[i];
1065 }
1066 return r;
1067 }
1068
1069 friend Vec operator+(const Vec &a, const ElementType &b) {
1070 return Vec(from_native_vector, a.native_vector + b);
1071 }
1072 friend Vec operator-(const Vec &a, const ElementType &b) {
1073 return Vec(from_native_vector, a.native_vector - b);
1074 }
1075 friend Vec operator*(const Vec &a, const ElementType &b) {
1076 return Vec(from_native_vector, a.native_vector * b);
1077 }
1078 friend Vec operator/(const Vec &a, const ElementType &b) {
1079 return Vec(from_native_vector, a.native_vector / b);
1080 }
1081 friend Vec operator%(const Vec &a, const ElementType &b) {
1082 return Vec(from_native_vector, a.native_vector % b);
1083 }
1084 friend Vec operator<<(const Vec &a, const ElementType &b) {
1085 return Vec(from_native_vector, a.native_vector << b);
1086 }
1087 friend Vec operator>>(const Vec &a, const ElementType &b) {
1088 return Vec(from_native_vector, a.native_vector >> b);
1089 }
1090 friend Vec operator&(const Vec &a, const ElementType &b) {
1091 return Vec(from_native_vector, a.native_vector & b);
1092 }
1093 friend Vec operator|(const Vec &a, const ElementType &b) {
1094 return Vec(from_native_vector, a.native_vector | b);
1095 }
1096 friend Vec operator&&(const Vec &a, const ElementType &b) {
1097 Vec r(empty);
1098 for (size_t i = 0; i < Lanes; i++) {
1099 r.native_vector[i] = a.native_vector[i] && b;
1100 }
1101 return r;
1102 }
1103 friend Vec operator||(const Vec &a, const ElementType &b) {
1104 Vec r(empty);
1105 for (size_t i = 0; i < Lanes; i++) {
1106 r.native_vector[i] = a.native_vector[i] || b;
1107 }
1108 return r;
1109 }
1110
1111 friend Vec operator+(const ElementType &a, const Vec &b) {
1112 return Vec(from_native_vector, a + b.native_vector);
1113 }
1114 friend Vec operator-(const ElementType &a, const Vec &b) {
1115 return Vec(from_native_vector, a - b.native_vector);
1116 }
1117 friend Vec operator*(const ElementType &a, const Vec &b) {
1118 return Vec(from_native_vector, a * b.native_vector);
1119 }
1120 friend Vec operator/(const ElementType &a, const Vec &b) {
1121 return Vec(from_native_vector, a / b.native_vector);
1122 }
1123 friend Vec operator%(const ElementType &a, const Vec &b) {
1124 return Vec(from_native_vector, a % b.native_vector);
1125 }
1126 friend Vec operator<<(const ElementType &a, const Vec &b) {
1127 return Vec(from_native_vector, a << b.native_vector);
1128 }
1129 friend Vec operator>>(const ElementType &a, const Vec &b) {
1130 return Vec(from_native_vector, a >> b.native_vector);
1131 }
1132 friend Vec operator&(const ElementType &a, const Vec &b) {
1133 return Vec(from_native_vector, a & b.native_vector);
1134 }
1135 friend Vec operator|(const ElementType &a, const Vec &b) {
1136 return Vec(from_native_vector, a | b.native_vector);
1137 }
1138 friend Vec operator&&(const ElementType &a, const Vec &b) {
1139 Vec r(empty);
1140 for (size_t i = 0; i < Lanes; i++) {
1141 r.native_vector[i] = a && b.native_vector[i];
1142 }
1143 return r;
1144 }
1145 friend Vec operator||(const ElementType &a, const Vec &b) {
1146 Vec r(empty);
1147 for (size_t i = 0; i < Lanes; i++) {
1148 r.native_vector[i] = a || b.native_vector[i];
1149 }
1150 return r;
1151 }
1152
1153 // TODO: this should be improved by taking advantage of native operator support.
1154 friend Mask operator<(const Vec &a, const Vec &b) {
1155 Mask r;
1156 for (size_t i = 0; i < Lanes; i++) {
1157 r.native_vector[i] = a[i] < b[i] ? 0xff : 0x00;
1158 }
1159 return r;
1160 }
1161
1162 // TODO: this should be improved by taking advantage of native operator support.
1163 friend Mask operator<=(const Vec &a, const Vec &b) {
1164 Mask r;
1165 for (size_t i = 0; i < Lanes; i++) {
1166 r.native_vector[i] = a[i] <= b[i] ? 0xff : 0x00;
1167 }
1168 return r;
1169 }
1170
1171 // TODO: this should be improved by taking advantage of native operator support.
1172 friend Mask operator>(const Vec &a, const Vec &b) {
1173 Mask r;
1174 for (size_t i = 0; i < Lanes; i++) {
1175 r.native_vector[i] = a[i] > b[i] ? 0xff : 0x00;
1176 }
1177 return r;
1178 }
1179
1180 // TODO: this should be improved by taking advantage of native operator support.
1181 friend Mask operator>=(const Vec &a, const Vec &b) {
1182 Mask r;
1183 for (size_t i = 0; i < Lanes; i++) {
1184 r.native_vector[i] = a[i] >= b[i] ? 0xff : 0x00;
1185 }
1186 return r;
1187 }
1188
1189 // TODO: this should be improved by taking advantage of native operator support.
1190 friend Mask operator==(const Vec &a, const Vec &b) {
1191 Mask r;
1192 for (size_t i = 0; i < Lanes; i++) {
1193 r.native_vector[i] = a[i] == b[i] ? 0xff : 0x00;
1194 }
1195 return r;
1196 }
1197
1198 // TODO: this should be improved by taking advantage of native operator support.
1199 friend Mask operator!=(const Vec &a, const Vec &b) {
1200 Mask r;
1201 for (size_t i = 0; i < Lanes; i++) {
1202 r.native_vector[i] = a[i] != b[i] ? 0xff : 0x00;
1203 }
1204 return r;
1205 }
1206
1207 // TODO: this should be improved by taking advantage of native operator support.
1208 static Vec select(const Mask &cond, const Vec &true_value, const Vec &false_value) {
1209 Vec r(empty);
1210 for (size_t i = 0; i < Lanes; i++) {
1211 r.native_vector[i] = cond[i] ? true_value[i] : false_value[i];
1212 }
1213 return r;
1214 }
1215
1216 template <typename OtherVec>
1217 static Vec convert_from(const OtherVec &src) {
1218 #if __cplusplus >= 201103L
1219 static_assert(Vec::Lanes == OtherVec::Lanes, "Lanes mismatch");
1220 #endif
1221 #if 0 // __has_builtin(__builtin_convertvector)
1222 // Disabled (for now) because __builtin_convertvector appears to have
1223 // different float->int rounding behavior in at least some situations;
1224 // for now we'll use the much-slower-but-correct explicit C++ code.
1225 // (https://github.com/halide/Halide/issues/2080)
1226 return Vec(from_native_vector, __builtin_convertvector(src.native_vector, NativeVectorType));
1227 #else
1228 Vec r(empty);
1229 for (size_t i = 0; i < Lanes; i++) {
1230 r.native_vector[i] = static_cast<typename Vec::ElementType>(src.native_vector[i]);
1231 }
1232 return r;
1233 #endif
1234 }
1235
1236 // TODO: this should be improved by taking advantage of native operator support.
1237 static Vec max(const Vec &a, const Vec &b) {
1238 Vec r(empty);
1239 for (size_t i = 0; i < Lanes; i++) {
1240 r.native_vector[i] = ::halide_cpp_max(a[i], b[i]);
1241 }
1242 return r;
1243 }
1244
1245 // TODO: this should be improved by taking advantage of native operator support.
1246 static Vec min(const Vec &a, const Vec &b) {
1247 Vec r(empty);
1248 for (size_t i = 0; i < Lanes; i++) {
1249 r.native_vector[i] = ::halide_cpp_min(a[i], b[i]);
1250 }
1251 return r;
1252 }
1253
1254 private:
1255 template<typename, size_t> friend class NativeVector;
1256
1257 template <typename ElementType, typename OtherElementType, size_t Lanes>
1258 friend NativeVector<ElementType, Lanes> operator<<(
1259 const NativeVector<ElementType, Lanes> &a,
1260 const NativeVector<OtherElementType, Lanes> &b);
1261
1262 template <typename ElementType, typename OtherElementType, size_t Lanes>
1263 friend NativeVector<ElementType, Lanes> operator>>(
1264 const NativeVector<ElementType, Lanes> &a,
1265 const NativeVector<OtherElementType, Lanes> &b);
1266
1267 NativeVectorType native_vector;
1268
1269 // Leave vector uninitialized for cases where we overwrite every entry
1270 enum Empty { empty };
1271 inline NativeVector(Empty) {}
1272
1273 // Syntactic sugar to avoid ctor overloading issues
1274 enum FromNativeVector { from_native_vector };
1275 inline NativeVector(FromNativeVector, const NativeVectorType &src) {
1276 native_vector = src;
1277 }
1278 };
1279
1280 template <typename ElementType, typename OtherElementType, size_t Lanes>
1281 NativeVector<ElementType, Lanes> operator<<(const NativeVector<ElementType, Lanes> &a,
1282 const NativeVector<OtherElementType, Lanes> &b) {
1283 return NativeVector<ElementType, Lanes>(
1284 NativeVector<ElementType, Lanes>::from_native_vector,
1285 a.native_vector << b.native_vector);
1286 }
1287
1288 template <typename ElementType, typename OtherElementType, size_t Lanes>
1289 NativeVector<ElementType, Lanes> operator>>(const NativeVector<ElementType, Lanes> &a,
1290 const NativeVector<OtherElementType, Lanes> &b) {
1291 return NativeVector<ElementType, Lanes>(
1292 NativeVector<ElementType, Lanes>::from_native_vector,
1293 a.native_vector >> b.native_vector);
1294 }
1295 #endif // __has_attribute(ext_vector_type) || __has_attribute(vector_size)
1296
1297 )INLINE_CODE";
1298
1299 const char *vector_selection_decl = R"INLINE_CODE(
1300 // Dec. 1, 2018: Apparently emscripten compilation reports __has_attribute(ext_vector_type)
1301 // as true, but then fails to handle the vector intrinsics later.
1302 #if !defined(__EMSCRIPTEN__) && (__has_attribute(ext_vector_type) || __has_attribute(vector_size))
1303 #if __GNUC__ && !__clang__
1304 // GCC only allows powers-of-two; fall back to CppVector for other widths
1305 #define halide_cpp_use_native_vector(type, lanes) ((lanes & (lanes - 1)) == 0)
1306 #else
1307 #define halide_cpp_use_native_vector(type, lanes) (true)
1308 #endif
1309 #else
1310 // No NativeVector available
1311 #define halide_cpp_use_native_vector(type, lanes) (false)
1312 #endif // __has_attribute(ext_vector_type) || __has_attribute(vector_size)
1313
1314 // Failsafe to allow forcing non-native vectors in case of unruly compilers
1315 #if HALIDE_CPP_ALWAYS_USE_CPP_VECTORS
1316 #undef halide_cpp_use_native_vector
1317 #define halide_cpp_use_native_vector(type, lanes) (false)
1318 #endif
1319
1320 )INLINE_CODE";
1321
1322 // Voodoo fix: on at least one config (our arm32 buildbot running gcc 5.4),
1323 // emitting this long text string was regularly garbled in a predictable pattern;
1324 // flushing the stream before or after heals it. Since C++ codegen is rarely
1325 // on a compilation critical path, we'll just band-aid it in this way.
1326 stream << std::flush;
1327 stream << cpp_vector_decl << native_vector_decl << vector_selection_decl;
1328 stream << std::flush;
1329
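    // For each vector type used we emit a guarded typedef; illustrative output
    // (the typedef name shown here is illustrative):
    //   #if halide_cpp_use_native_vector(uint8_t, 16)
    //   typedef NativeVector<uint8_t, 16> uint8x16_t;
    //   #else
    //   typedef CppVector<uint8_t, 16> uint8x16_t;
    //   #endif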
1330 for (const auto &t : vector_types) {
1331 string name = type_to_c_type(t, false, false);
1332 string scalar_name = type_to_c_type(t.element_of(), false, false);
1333 stream << "#if halide_cpp_use_native_vector(" << scalar_name << ", " << t.lanes() << ")\n";
1334 stream << "typedef NativeVector<" << scalar_name << ", " << t.lanes() << "> " << name << ";\n";
1335 // Useful for debugging which Vector implementation is being selected
1336 // stream << "#pragma message \"using NativeVector for " << t << "\"\n";
1337 stream << "#else\n";
1338 stream << "typedef CppVector<" << scalar_name << ", " << t.lanes() << "> " << name << ";\n";
1339 // Useful for debugging which Vector implementation is being selected
1340 // stream << "#pragma message \"using CppVector for " << t << "\"\n";
1341 stream << "#endif\n";
1342 }
1343 }
1344 }
1345
1346 void CodeGen_C::set_name_mangling_mode(NameMangling mode) {
1347 if (extern_c_open && mode != NameMangling::C) {
1348 stream << "\n#ifdef __cplusplus\n";
1349 stream << "} // extern \"C\"\n";
1350 stream << "#endif\n\n";
1351 extern_c_open = false;
1352 } else if (!extern_c_open && mode == NameMangling::C) {
1353 stream << "\n#ifdef __cplusplus\n";
1354 stream << "extern \"C\" {\n";
1355 stream << "#endif\n\n";
1356 extern_c_open = true;
1357 }
1358 }
1359
1360 string CodeGen_C::print_type(Type type, AppendSpaceIfNeeded space_option) {
1361 return type_to_c_type(type, space_option == AppendSpace);
1362 }
1363
1364 string CodeGen_C::print_reinterpret(Type type, const Expr &e) {
1365 ostringstream oss;
1366 if (type.is_handle() || e.type().is_handle()) {
1367 // Use a c-style cast if either src or dest is a handle --
1368 // note that although Halide declares a "Handle" to always be 64 bits,
1369 // the source "handle" might actually be a 32-bit pointer (from
1370 // a function parameter), so calling reinterpret<> (which just memcpy's)
1371 // would be garbage-producing.
1372 oss << "(" << print_type(type) << ")";
1373 } else {
1374 oss << "reinterpret<" << print_type(type) << ">";
1375 }
1376 oss << "(" << print_expr(e) << ")";
1377 return oss.str();
1378 }
1379
1380 string CodeGen_C::print_name(const string &name) {
1381 return c_print_name(name);
1382 }
1383
1384 namespace {
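// Collects the extern "C" and extern C++ calls (plus destructor/free functions)
// referenced by the IR so that prototypes can be emitted ahead of the function
// bodies; names already declared in the preamble are skipped.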
1385 class ExternCallPrototypes : public IRGraphVisitor {
1386 struct NamespaceOrCall {
1387 const Call *call; // nullptr if this is a subnamespace
1388 std::map<string, NamespaceOrCall> names;
1389 NamespaceOrCall(const Call *call = nullptr)
1390 : call(call) {
1391 }
1392 };
1393 std::map<string, NamespaceOrCall> c_plus_plus_externs;
1394 std::map<string, const Call *> c_externs;
1395 std::set<std::string> processed;
1396 std::set<std::string> internal_linkage;
1397 std::set<std::string> destructors;
1398
1399 using IRGraphVisitor::visit;
1400
1401 void visit(const Call *op) override {
1402 IRGraphVisitor::visit(op);
1403
1404 if (!processed.count(op->name)) {
1405 if (op->call_type == Call::Extern || op->call_type == Call::PureExtern) {
1406 c_externs.insert({op->name, op});
1407 } else if (op->call_type == Call::ExternCPlusPlus) {
1408 std::vector<std::string> namespaces;
1409 std::string name = extract_namespaces(op->name, namespaces);
1410 std::map<string, NamespaceOrCall> *namespace_map = &c_plus_plus_externs;
1411 for (const auto &ns : namespaces) {
1412 auto insertion = namespace_map->insert({ns, NamespaceOrCall()});
1413 namespace_map = &insertion.first->second.names;
1414 }
1415 namespace_map->insert({name, NamespaceOrCall(op)});
1416 }
1417 processed.insert(op->name);
1418 }
1419
1420 if (op->is_intrinsic(Call::register_destructor)) {
1421 internal_assert(op->args.size() == 2);
1422 const StringImm *fn = op->args[0].as<StringImm>();
1423 internal_assert(fn);
1424 destructors.insert(fn->value);
1425 }
1426 }
1427
1428 void visit(const Allocate *op) override {
1429 IRGraphVisitor::visit(op);
1430 if (!op->free_function.empty()) {
1431 destructors.insert(op->free_function);
1432 }
1433 }
1434
1435 void emit_function_decl(ostream &stream, const Call *op, const std::string &name) const {
1436 // op->name (rather than the name arg) since we need the fully-qualified C++ name
1437 if (internal_linkage.count(op->name)) {
1438 stream << "static ";
1439 }
1440 stream << type_to_c_type(op->type, /* append_space */ true) << name << "(";
1441 if (function_takes_user_context(name)) {
1442 stream << "void *";
1443 if (!op->args.empty()) {
1444 stream << ", ";
1445 }
1446 }
1447 for (size_t i = 0; i < op->args.size(); i++) {
1448 if (i > 0) {
1449 stream << ", ";
1450 }
1451 if (op->args[i].as<StringImm>()) {
1452 stream << "const char *";
1453 } else {
1454 stream << type_to_c_type(op->args[i].type(), true);
1455 }
1456 }
1457 stream << ");\n";
1458 }
1459
1460 void emit_namespace_or_call(ostream &stream, const NamespaceOrCall &ns_or_call, const std::string &name) const {
1461 if (ns_or_call.call == nullptr) {
1462 stream << "namespace " << name << " {\n";
1463 for (const auto &ns_or_call_inner : ns_or_call.names) {
1464 emit_namespace_or_call(stream, ns_or_call_inner.second, ns_or_call_inner.first);
1465 }
1466 stream << "} // namespace " << name << "\n";
1467 } else {
1468 emit_function_decl(stream, ns_or_call.call, name);
1469 }
1470 }
1471
1472 public:
1473 ExternCallPrototypes() {
1474 // Make sure we don't catch calls that are already in the global declarations
1475 const char *strs[] = {globals.c_str(),
1476 (const char *)halide_internal_runtime_header_HalideRuntime_h,
1477 (const char *)halide_internal_initmod_inlined_c};
1478 for (const char *str : strs) {
1479 size_t j = 0;
1480 for (size_t i = 0; str[i]; i++) {
1481 char c = str[i];
1482 if (c == '(' && i > j + 1) {
1483 // Could be the end of a function_name.
1484 string name(str + j + 1, i - j - 1);
1485 processed.insert(name);
1486 }
1487
1488 if (('A' <= c && c <= 'Z') ||
1489 ('a' <= c && c <= 'z') ||
1490 c == '_' ||
1491 ('0' <= c && c <= '9')) {
1492 // Could be part of a function name.
1493 } else {
1494 j = i;
1495 }
1496 }
1497 }
1498 }
1499
1500 void set_internal_linkage(const std::string &name) {
1501 internal_linkage.insert(name);
1502 }
1503
1504 bool has_c_declarations() const {
1505 return !c_externs.empty();
1506 }
1507
1508 bool has_c_plus_plus_declarations() const {
1509 return !c_plus_plus_externs.empty();
1510 }
1511
1512 void emit_c_declarations(ostream &stream) const {
1513 for (const auto &call : c_externs) {
1514 emit_function_decl(stream, call.second, call.first);
1515 }
1516 for (const auto &d : destructors) {
1517 stream << "void " << d << "(void *, void *);\n";
1518 }
1519 stream << "\n";
1520 }
1521
1522 void emit_c_plus_plus_declarations(ostream &stream) const {
1523 for (const auto &ns_or_call : c_plus_plus_externs) {
1524 emit_namespace_or_call(stream, ns_or_call.second, ns_or_call.first);
1525 }
1526 stream << "\n";
1527 }
1528 };
1529 } // namespace
1530
1531 void CodeGen_C::forward_declare_type_if_needed(const Type &t) {
1532 if (!t.handle_type ||
1533 forward_declared.count(t.handle_type) ||
1534 t.handle_type->inner_name.cpp_type_type == halide_cplusplus_type_name::Simple) {
1535 return;
1536 }
1537 for (auto &ns : t.handle_type->namespaces) {
1538 stream << "namespace " << ns << " { ";
1539 }
1540 switch (t.handle_type->inner_name.cpp_type_type) {
1541 case halide_cplusplus_type_name::Simple:
1542 // nothing
1543 break;
1544 case halide_cplusplus_type_name::Struct:
1545 stream << "struct " << t.handle_type->inner_name.name << ";";
1546 break;
1547 case halide_cplusplus_type_name::Class:
1548 stream << "class " << t.handle_type->inner_name.name << ";";
1549 break;
1550 case halide_cplusplus_type_name::Union:
1551 stream << "union " << t.handle_type->inner_name.name << ";";
1552 break;
1553 case halide_cplusplus_type_name::Enum:
1554 internal_error << "Passing pointers to enums is unsupported\n";
1555 break;
1556 }
1557 for (auto &ns : t.handle_type->namespaces) {
1558 (void)ns;
1559 stream << " }";
1560 }
1561 stream << "\n";
1562 forward_declared.insert(t.handle_type);
1563 }
1564
1565 void CodeGen_C::compile(const Module &input) {
1566 TypeInfoGatherer type_info;
1567 for (const auto &f : input.functions()) {
1568 if (f.body.defined()) {
1569 f.body.accept(&type_info);
1570 }
1571 }
1572 uses_gpu_for_loops = (type_info.for_types_used.count(ForType::GPUBlock) ||
1573 type_info.for_types_used.count(ForType::GPUThread) ||
1574 type_info.for_types_used.count(ForType::GPULane));
1575
1576 // Forward-declare all the types we need; this needs to happen before
1577 // we emit function prototypes, since those may need the types.
1578 stream << "\n";
1579 for (const auto &f : input.functions()) {
1580 for (auto &arg : f.args) {
1581 forward_declare_type_if_needed(arg.type);
1582 }
1583 }
1584 stream << "\n";
1585
1586 if (!is_header_or_extern_decl()) {
1587 // Emit any external-code blobs that are C++.
1588 for (const ExternalCode &code_blob : input.external_code()) {
1589 if (code_blob.is_c_plus_plus_source()) {
1590 stream << "\n";
1591 stream << "// Begin External Code: " << code_blob.name() << "\n";
1592 stream.write((const char *)code_blob.contents().data(), code_blob.contents().size());
1593 stream << "\n";
1594 stream << "// End External Code: " << code_blob.name() << "\n";
1595 stream << "\n";
1596 }
1597 }
1598
1599 add_vector_typedefs(type_info.vector_types_used);
1600
1601 // Emit prototypes for all external and internal-only functions.
1602 // Gather them up and do them all up front, to reduce duplicates,
1603 // and to make it simpler to get internal-linkage functions correct.
1604 ExternCallPrototypes e;
1605 for (const auto &f : input.functions()) {
1606 f.body.accept(&e);
1607 if (f.linkage == LinkageType::Internal) {
1608 // We can't tell at the call site if a LoweredFunc is intended to be internal
1609 // or not, so mark them explicitly.
1610 e.set_internal_linkage(f.name);
1611 }
1612 }
1613
1614 if (e.has_c_plus_plus_declarations()) {
1615 set_name_mangling_mode(NameMangling::CPlusPlus);
1616 e.emit_c_plus_plus_declarations(stream);
1617 }
1618
1619 if (e.has_c_declarations()) {
1620 set_name_mangling_mode(NameMangling::C);
1621 e.emit_c_declarations(stream);
1622 }
1623 }
1624
1625 for (const auto &b : input.buffers()) {
1626 compile(b);
1627 }
1628 for (const auto &f : input.functions()) {
1629 compile(f);
1630 }
1631 }
1632
1633 void CodeGen_C::compile(const LoweredFunc &f) {
1634 // Don't put non-external function declarations in headers.
1635 if (is_header_or_extern_decl() && f.linkage == LinkageType::Internal) {
1636 return;
1637 }
1638
1639 const std::vector<LoweredArgument> &args = f.args;
1640
1641 have_user_context = false;
1642 for (size_t i = 0; i < args.size(); i++) {
1643 // TODO: check that its type is void *?
1644 have_user_context |= (args[i].name == "__user_context");
1645 }
1646
1647 NameMangling name_mangling = f.name_mangling;
1648 if (name_mangling == NameMangling::Default) {
1649 name_mangling = (target.has_feature(Target::CPlusPlusMangling) ? NameMangling::CPlusPlus : NameMangling::C);
1650 }
1651
1652 set_name_mangling_mode(name_mangling);
1653
1654 std::vector<std::string> namespaces;
1655 std::string simple_name = extract_namespaces(f.name, namespaces);
1656 if (!is_c_plus_plus_interface()) {
1657 user_assert(namespaces.empty()) << "Namespace qualifiers not allowed on function name if not compiling with Target::CPlusPlusMangling.\n";
1658 }
1659
1660 if (!namespaces.empty()) {
1661 for (const auto &ns : namespaces) {
1662 stream << "namespace " << ns << " {\n";
1663 }
1664 stream << "\n";
1665 }
1666
1667 // Emit the function prototype
1668 if (f.linkage == LinkageType::Internal) {
1669 // If the function isn't public, mark it static.
1670 stream << "static ";
1671 }
1672 stream << "HALIDE_FUNCTION_ATTRS\n";
1673 stream << "int " << simple_name << "(";
1674 for (size_t i = 0; i < args.size(); i++) {
1675 if (args[i].is_buffer()) {
1676 stream << "struct halide_buffer_t *"
1677 << print_name(args[i].name)
1678 << "_buffer";
1679 } else {
1680 stream << print_type(args[i].type, AppendSpace)
1681 << print_name(args[i].name);
1682 }
1683
1684 if (i < args.size() - 1) stream << ", ";
1685 }
1686
1687 if (is_header_or_extern_decl()) {
1688 stream << ");\n";
1689 } else {
1690 stream << ") {\n";
1691 indent += 1;
1692
1693 if (uses_gpu_for_loops) {
1694 stream << get_indent() << "halide_error("
1695 << (have_user_context ? "__user_context_" : "nullptr")
1696 << ", \"C++ Backend does not support gpu_blocks() or gpu_threads() yet, "
1697 << "this function will always fail at runtime\");\n";
1698 stream << get_indent() << "return halide_error_code_device_malloc_failed;\n";
1699 } else {
1700 // Emit a local user_context we can pass in all cases, either
1701 // aliasing __user_context or nullptr.
1702 stream << get_indent() << "void * const _ucon = "
1703 << (have_user_context ? "const_cast<void *>(__user_context)" : "nullptr")
1704 << ";\n";
1705
1706 if (target.has_feature(Target::NoAsserts)) {
1707 stream << get_indent() << "halide_unused(_ucon);";
1708 }
1709
1710 // Emit the body
1711 print(f.body);
1712
1713 // Return success.
1714 stream << get_indent() << "return 0;\n";
1715 }
1716
1717 indent -= 1;
1718 stream << "}\n";
1719 }
1720
1721 if (is_header_or_extern_decl() && f.linkage == LinkageType::ExternalPlusMetadata) {
1722 // Emit the argv version
1723 stream << "\nHALIDE_FUNCTION_ATTRS\nint " << simple_name << "_argv(void **args);\n";
1724
1725 // And also the metadata.
1726 stream << "\nHALIDE_FUNCTION_ATTRS\nconst struct halide_filter_metadata_t *" << simple_name << "_metadata();\n";
1727 }
1728
1729 if (!namespaces.empty()) {
1730 stream << "\n";
1731 for (size_t i = namespaces.size(); i > 0; i--) {
1732 stream << "} // namespace " << namespaces[i - 1] << "\n";
1733 }
1734 stream << "\n";
1735 }
1736 }
1737
1738 void CodeGen_C::compile(const Buffer<> &buffer) {
1739 // Don't define buffers in headers or extern decls.
1740 if (is_header_or_extern_decl()) {
1741 return;
1742 }
1743
1744 string name = print_name(buffer.name());
1745 halide_buffer_t b = *(buffer.raw_buffer());
1746
1747 user_assert(b.host) << "Can't embed image: " << buffer.name() << " because it has a null host pointer\n";
1748 user_assert(!b.device_dirty()) << "Can't embed image: " << buffer.name() << " because it has a dirty device pointer\n";
1749
1750 // Figure out the offset of the last pixel.
1751 size_t num_elems = 1;
1752 for (int d = 0; d < b.dimensions; d++) {
1753 num_elems += b.dim[d].stride * (b.dim[d].extent - 1);
1754 }
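// A worked example (illustrative values, not from any particular pipeline):
// for a 2-D buffer with extents {4, 3} and strides {1, 4}, this computes
// 1 + 1*(4 - 1) + 4*(3 - 1) = 12, i.e. one past the offset of the last pixel,
// which is the number of elements whose bytes get emitted below.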
1755
1756 // For now, we assume buffers that aren't scalar are constant,
1757 // while scalars can be mutated. This accommodates all our existing
1758 // use cases: all buffers are constant, except those used to store
1759 // stateful module information in offloading runtimes.
1760 bool is_constant = buffer.dimensions() != 0;
1761
1762 // Emit the data
1763 stream << "static " << (is_constant ? "const" : "") << " uint8_t " << name << "_data[] HALIDE_ATTRIBUTE_ALIGN(32) = {\n";
1764 stream << get_indent();
1765 for (size_t i = 0; i < num_elems * b.type.bytes(); i++) {
1766 if (i > 0) {
1767 stream << ",";
1768 if (i % 16 == 0) {
1769 stream << "\n";
1770 stream << get_indent();
1771 } else {
1772 stream << " ";
1773 }
1774 }
1775 stream << (int)(b.host[i]);
1776 }
1777 stream << "\n};\n";
1778
1779 // Emit the shape (constant even for scalar buffers)
1780 stream << "static const halide_dimension_t " << name << "_buffer_shape[] = {";
1781 for (int i = 0; i < buffer.dimensions(); i++) {
1782 stream << "halide_dimension_t(" << buffer.dim(i).min()
1783 << ", " << buffer.dim(i).extent()
1784 << ", " << buffer.dim(i).stride() << ")";
1785 if (i < buffer.dimensions() - 1) {
1786 stream << ", ";
1787 }
1788 }
1789 stream << "};\n";
1790
1791 Type t = buffer.type();
1792
1793 // Emit the buffer struct. Note that although our shape and (usually) our host
1794 // data is const, the buffer itself isn't: embedded buffers in one pipeline
1795 // can be passed to another pipeline (e.g. for an extern stage), in which
1796 // case the buffer objects need to be non-const, because the constness
1797 // (from the POV of the extern stage) is a runtime property.
1798 stream << "static halide_buffer_t " << name << "_buffer_ = {"
1799 << "0, " // device
1800 << "nullptr, " // device_interface
1801 << "const_cast<uint8_t*>(&" << name << "_data[0]), " // host
1802 << "0, " // flags
1803 << "halide_type_t((halide_type_code_t)(" << (int)t.code() << "), " << t.bits() << ", " << t.lanes() << "), "
1804 << buffer.dimensions() << ", "
1805 << "const_cast<halide_dimension_t*>(" << name << "_buffer_shape)};\n";
1806
1807 // Make a global pointer to it.
1808 stream << "static halide_buffer_t * const " << name << "_buffer = &" << name << "_buffer_;\n";
1809 }
1810
1811 string CodeGen_C::print_expr(const Expr &e) {
1812 id = "$$ BAD ID $$";
1813 e.accept(this);
1814 return id;
1815 }
1816
1817 string CodeGen_C::print_cast_expr(const Type &t, const Expr &e) {
1818 string value = print_expr(e);
1819 string type = print_type(t);
1820 if (t.is_vector() &&
1821 t.lanes() == e.type().lanes() &&
1822 t != e.type()) {
1823 return print_assignment(t, type + "::convert_from<" + print_type(e.type()) + ">(" + value + ")");
1824 } else {
1825 return print_assignment(t, "(" + type + ")(" + value + ")");
1826 }
1827 }
1828
1829 void CodeGen_C::print_stmt(const Stmt &s) {
1830 s.accept(this);
1831 }
1832
1833 string CodeGen_C::print_assignment(Type t, const std::string &rhs) {
1834 auto cached = cache.find(rhs);
1835 if (cached == cache.end()) {
1836 id = unique_name('_');
1837 stream << get_indent() << print_type(t, AppendSpace) << (output_kind == CPlusPlusImplementation ? "const " : "") << id << " = " << rhs << ";\n";
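// For illustration, in CImplementation mode a request for the rhs "_a + _b"
// with type Int(32) emits roughly:
//   int32_t _7 = _a + _b;
// (the id "_7" is hypothetical); an identical rhs string later in the same
// scope reuses the cached id instead of emitting a new line.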
1838 cache[rhs] = id;
1839 } else {
1840 id = cached->second;
1841 }
1842 return id;
1843 }
1844
1845 void CodeGen_C::open_scope() {
1846 cache.clear();
1847 stream << get_indent();
1848 indent++;
1849 stream << "{\n";
1850 }
1851
1852 void CodeGen_C::close_scope(const std::string &comment) {
1853 cache.clear();
1854 indent--;
1855 stream << get_indent();
1856 if (!comment.empty()) {
1857 stream << "} // " << comment << "\n";
1858 } else {
1859 stream << "}\n";
1860 }
1861 }
1862
1863 void CodeGen_C::visit(const Variable *op) {
1864 id = print_name(op->name);
1865 }
1866
1867 void CodeGen_C::visit(const Cast *op) {
1868 id = print_cast_expr(op->type, op->value);
1869 }
1870
1871 void CodeGen_C::visit_binop(Type t, const Expr &a, const Expr &b, const char *op) {
1872 string sa = print_expr(a);
1873 string sb = print_expr(b);
1874 print_assignment(t, sa + " " + op + " " + sb);
1875 }
1876
1877 void CodeGen_C::visit(const Add *op) {
1878 visit_binop(op->type, op->a, op->b, "+");
1879 }
1880
1881 void CodeGen_C::visit(const Sub *op) {
1882 visit_binop(op->type, op->a, op->b, "-");
1883 }
1884
1885 void CodeGen_C::visit(const Mul *op) {
1886 visit_binop(op->type, op->a, op->b, "*");
1887 }
1888
1889 void CodeGen_C::visit(const Div *op) {
1890 int bits;
1891 if (is_const_power_of_two_integer(op->b, &bits)) {
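// e.g. x / 8, with 8 a constant power of two, lowers to x >> 3; for signed
// ints the arithmetic shift matches Halide's round-toward-negative-infinity
// division semantics for a positive divisor.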
1892 visit_binop(op->type, op->a, make_const(op->a.type(), bits), ">>");
1893 } else if (op->type.is_int()) {
1894 print_expr(lower_euclidean_div(op->a, op->b));
1895 } else {
1896 visit_binop(op->type, op->a, op->b, "/");
1897 }
1898 }
1899
1900 void CodeGen_C::visit(const Mod *op) {
1901 int bits;
1902 if (is_const_power_of_two_integer(op->b, &bits)) {
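// e.g. x % 8, with 8 a constant power of two, lowers to x & 7, which gives
// the non-negative Euclidean remainder for two's-complement ints.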
1903 visit_binop(op->type, op->a, make_const(op->a.type(), (1 << bits) - 1), "&");
1904 } else if (op->type.is_int()) {
1905 print_expr(lower_euclidean_mod(op->a, op->b));
1906 } else if (op->type.is_float()) {
1907 string arg0 = print_expr(op->a);
1908 string arg1 = print_expr(op->b);
1909 ostringstream rhs;
1910 rhs << "fmod(" << arg0 << ", " << arg1 << ")";
1911 print_assignment(op->type, rhs.str());
1912 } else {
1913 visit_binop(op->type, op->a, op->b, "%");
1914 }
1915 }
1916
1917 void CodeGen_C::visit(const Max *op) {
1918 // clang doesn't support the ternary operator on OpenCL style vectors.
1919 // See: https://bugs.llvm.org/show_bug.cgi?id=33103
1920 if (op->type.is_scalar()) {
1921 print_expr(Call::make(op->type, "::halide_cpp_max", {op->a, op->b}, Call::Extern));
1922 } else {
1923 ostringstream rhs;
1924 rhs << print_type(op->type) << "::max(" << print_expr(op->a) << ", " << print_expr(op->b) << ")";
1925 print_assignment(op->type, rhs.str());
1926 }
1927 }
1928
1929 void CodeGen_C::visit(const Min *op) {
1930 // clang doesn't support the ternary operator on OpenCL style vectors.
1931 // See: https://bugs.llvm.org/show_bug.cgi?id=33103
1932 if (op->type.is_scalar()) {
1933 print_expr(Call::make(op->type, "::halide_cpp_min", {op->a, op->b}, Call::Extern));
1934 } else {
1935 ostringstream rhs;
1936 rhs << print_type(op->type) << "::min(" << print_expr(op->a) << ", " << print_expr(op->b) << ")";
1937 print_assignment(op->type, rhs.str());
1938 }
1939 }
1940
1941 void CodeGen_C::visit(const EQ *op) {
1942 visit_binop(op->type, op->a, op->b, "==");
1943 }
1944
1945 void CodeGen_C::visit(const NE *op) {
1946 visit_binop(op->type, op->a, op->b, "!=");
1947 }
1948
1949 void CodeGen_C::visit(const LT *op) {
1950 visit_binop(op->type, op->a, op->b, "<");
1951 }
1952
1953 void CodeGen_C::visit(const LE *op) {
1954 visit_binop(op->type, op->a, op->b, "<=");
1955 }
1956
1957 void CodeGen_C::visit(const GT *op) {
1958 visit_binop(op->type, op->a, op->b, ">");
1959 }
1960
1961 void CodeGen_C::visit(const GE *op) {
1962 visit_binop(op->type, op->a, op->b, ">=");
1963 }
1964
1965 void CodeGen_C::visit(const Or *op) {
1966 visit_binop(op->type, op->a, op->b, "||");
1967 }
1968
1969 void CodeGen_C::visit(const And *op) {
1970 visit_binop(op->type, op->a, op->b, "&&");
1971 }
1972
1973 void CodeGen_C::visit(const Not *op) {
1974 print_assignment(op->type, "!(" + print_expr(op->a) + ")");
1975 }
1976
1977 void CodeGen_C::visit(const IntImm *op) {
1978 if (op->type == Int(32)) {
1979 id = std::to_string(op->value);
1980 } else {
1981 print_assignment(op->type, "(" + print_type(op->type) + ")(ADD_INT64_T_SUFFIX(" + std::to_string(op->value) + "))");
1982 }
1983 }
1984
1985 void CodeGen_C::visit(const UIntImm *op) {
1986 print_assignment(op->type, "(" + print_type(op->type) + ")(ADD_UINT64_T_SUFFIX(" + std::to_string(op->value) + "))");
1987 }
1988
1989 void CodeGen_C::visit(const StringImm *op) {
1990 ostringstream oss;
1991 oss << Expr(op);
1992 id = oss.str();
1993 }
1994
1995 // NaN is the only float/double for which this is true... and
1996 // surprisingly, there doesn't seem to be a portable isnan function
1997 // (dsharlet).
1998 template<typename T>
1999 static bool isnan(T x) {
2000 return x != x;
2001 }
2002
2003 template<typename T>
2004 static bool isinf(T x) {
2005 return std::numeric_limits<T>::has_infinity && (x == std::numeric_limits<T>::infinity() ||
2006 x == -std::numeric_limits<T>::infinity());
2007 }
2008
2009 void CodeGen_C::visit(const FloatImm *op) {
2010 if (isnan(op->value)) {
2011 id = "nan_f32()";
2012 } else if (isinf(op->value)) {
2013 if (op->value > 0) {
2014 id = "inf_f32()";
2015 } else {
2016 id = "neg_inf_f32()";
2017 }
2018 } else {
2019 // Write the constant as reinterpreted uint to avoid any bits lost in conversion.
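// For illustration, 4.0f has bit pattern 0x40800000 = 1082130432, so it is
// emitted as float_from_bits(1082130432 /* 4 */) (see the golden output in
// test() below).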
2020 union {
2021 uint32_t as_uint;
2022 float as_float;
2023 } u;
2024 u.as_float = op->value;
2025
2026 ostringstream oss;
2027 if (op->type.bits() == 64) {
2028 oss << "(double) ";
2029 }
2030 oss << "float_from_bits(" << u.as_uint << " /* " << u.as_float << " */)";
2031 print_assignment(op->type, oss.str());
2032 }
2033 }
2034
2035 void CodeGen_C::visit(const Call *op) {
2036
2037 internal_assert(op->is_extern() || op->is_intrinsic())
2038 << "Can only codegen extern calls and intrinsics\n";
2039
2040 ostringstream rhs;
2041
2042 // Handle intrinsics first
2043 if (op->is_intrinsic(Call::debug_to_file)) {
2044 internal_assert(op->args.size() == 3);
2045 const StringImm *string_imm = op->args[0].as<StringImm>();
2046 internal_assert(string_imm);
2047 string filename = string_imm->value;
2048 string typecode = print_expr(op->args[1]);
2049 string buffer = print_name(print_expr(op->args[2]));
2050
2051 rhs << "halide_debug_to_file(_ucon, "
2052 << "\"" << filename << "\", "
2053 << typecode
2054 << ", (struct halide_buffer_t *)" << buffer << ")";
2055 } else if (op->is_intrinsic(Call::bitwise_and)) {
2056 internal_assert(op->args.size() == 2);
2057 string a0 = print_expr(op->args[0]);
2058 string a1 = print_expr(op->args[1]);
2059 rhs << a0 << " & " << a1;
2060 } else if (op->is_intrinsic(Call::bitwise_xor)) {
2061 internal_assert(op->args.size() == 2);
2062 string a0 = print_expr(op->args[0]);
2063 string a1 = print_expr(op->args[1]);
2064 rhs << a0 << " ^ " << a1;
2065 } else if (op->is_intrinsic(Call::bitwise_or)) {
2066 internal_assert(op->args.size() == 2);
2067 string a0 = print_expr(op->args[0]);
2068 string a1 = print_expr(op->args[1]);
2069 rhs << a0 << " | " << a1;
2070 } else if (op->is_intrinsic(Call::bitwise_not)) {
2071 internal_assert(op->args.size() == 1);
2072 rhs << "~" << print_expr(op->args[0]);
2073 } else if (op->is_intrinsic(Call::reinterpret)) {
2074 internal_assert(op->args.size() == 1);
2075 rhs << print_reinterpret(op->type, op->args[0]);
2076 } else if (op->is_intrinsic(Call::shift_left)) {
2077 internal_assert(op->args.size() == 2);
2078 string a0 = print_expr(op->args[0]);
2079 string a1 = print_expr(op->args[1]);
2080 rhs << a0 << " << " << a1;
2081 } else if (op->is_intrinsic(Call::shift_right)) {
2082 internal_assert(op->args.size() == 2);
2083 string a0 = print_expr(op->args[0]);
2084 string a1 = print_expr(op->args[1]);
2085 rhs << a0 << " >> " << a1;
2086 } else if (op->is_intrinsic(Call::count_leading_zeros) ||
2087 op->is_intrinsic(Call::count_trailing_zeros) ||
2088 op->is_intrinsic(Call::popcount)) {
2089 internal_assert(op->args.size() == 1);
2090 if (op->args[0].type().is_vector()) {
2091 rhs << print_scalarized_expr(op);
2092 } else {
2093 string a0 = print_expr(op->args[0]);
2094 rhs << "halide_" << op->name << "(" << a0 << ")";
2095 }
2096 } else if (op->is_intrinsic(Call::lerp)) {
2097 internal_assert(op->args.size() == 3);
2098 Expr e = lower_lerp(op->args[0], op->args[1], op->args[2]);
2099 rhs << print_expr(e);
2100 } else if (op->is_intrinsic(Call::absd)) {
2101 internal_assert(op->args.size() == 2);
2102 Expr a = op->args[0];
2103 Expr b = op->args[1];
2104 Expr e = cast(op->type, select(a < b, b - a, a - b));
2105 rhs << print_expr(e);
2106 } else if (op->is_intrinsic(Call::return_second)) {
2107 internal_assert(op->args.size() == 2);
2108 string arg0 = print_expr(op->args[0]);
2109 string arg1 = print_expr(op->args[1]);
2110 rhs << "return_second(" << arg0 << ", " << arg1 << ")";
2111 } else if (op->is_intrinsic(Call::if_then_else)) {
2112 internal_assert(op->args.size() == 3);
2113
2114 string result_id = unique_name('_');
2115
2116 stream << get_indent() << print_type(op->args[1].type(), AppendSpace)
2117 << result_id << ";\n";
2118
2119 string cond_id = print_expr(op->args[0]);
2120
2121 stream << get_indent() << "if (" << cond_id << ")\n";
2122 open_scope();
2123 string true_case = print_expr(op->args[1]);
2124 stream << get_indent() << result_id << " = " << true_case << ";\n";
2125 close_scope("if " + cond_id);
2126 stream << get_indent() << "else\n";
2127 open_scope();
2128 string false_case = print_expr(op->args[2]);
2129 stream << get_indent() << result_id << " = " << false_case << ";\n";
2130 close_scope("if " + cond_id + " else");
2131
2132 rhs << result_id;
2133 } else if (op->is_intrinsic(Call::require)) {
2134 internal_assert(op->args.size() == 3);
2135 if (op->args[0].type().is_vector()) {
2136 rhs << print_scalarized_expr(op);
2137 } else {
2138 create_assertion(op->args[0], op->args[2]);
2139 rhs << print_expr(op->args[1]);
2140 }
2141 } else if (op->is_intrinsic(Call::abs)) {
2142 internal_assert(op->args.size() == 1);
2143 Expr a0 = op->args[0];
2144 rhs << print_expr(cast(op->type, select(a0 > 0, a0, -a0)));
2145 } else if (op->is_intrinsic(Call::memoize_expr)) {
2146 internal_assert(!op->args.empty());
2147 string arg = print_expr(op->args[0]);
2148 rhs << "(" << arg << ")";
2149 } else if (op->is_intrinsic(Call::alloca)) {
2150 internal_assert(op->args.size() == 1);
2151 internal_assert(op->type.is_handle());
2152 const Call *call = op->args[0].as<Call>();
2153 if (op->type == type_of<struct halide_buffer_t *>() &&
2154 call && call->is_intrinsic(Call::size_of_halide_buffer_t)) {
2155 stream << get_indent();
2156 string buf_name = unique_name('b');
2157 stream << "halide_buffer_t " << buf_name << ";\n";
2158 rhs << "&" << buf_name;
2159 } else {
2160 // Make a stack of uint64_ts
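// e.g. a hypothetical alloca of 20 bytes rounds up to (20 + 7) / 8 == 3
// uint64_t elements for the stack array emitted below.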
2161 string size = print_expr(simplify((op->args[0] + 7) / 8));
2162 stream << get_indent();
2163 string array_name = unique_name('a');
2164 stream << "uint64_t " << array_name << "[" << size << "];";
2165 rhs << "(" << print_type(op->type) << ")(&" << array_name << ")";
2166 }
2167 } else if (op->is_intrinsic(Call::make_struct)) {
2168 if (op->args.empty()) {
2169 internal_assert(op->type.handle_type);
2170 // Add explicit cast so that different structs can't cache to the same value
2171 rhs << "(" << print_type(op->type) << ")(NULL)";
2172 } else if (op->type == type_of<halide_dimension_t *>()) {
2173 // Emit a shape
2174
2175 // Get the args
2176 vector<string> values;
2177 for (size_t i = 0; i < op->args.size(); i++) {
2178 values.push_back(print_expr(op->args[i]));
2179 }
2180
2181 static_assert(sizeof(halide_dimension_t) == 4 * sizeof(int32_t),
2182 "CodeGen_C assumes a halide_dimension_t is four densely-packed int32_ts");
2183
2184 internal_assert(values.size() % 4 == 0);
2185 int dimension = values.size() / 4;
2186
2187 string shape_name = unique_name('s');
2188 stream
2189 << get_indent() << "struct halide_dimension_t " << shape_name
2190 << "[" << dimension << "] = {\n";
2191 indent++;
2192 for (int i = 0; i < dimension; i++) {
2193 stream
2194 << get_indent() << "{"
2195 << values[i * 4 + 0] << ", "
2196 << values[i * 4 + 1] << ", "
2197 << values[i * 4 + 2] << ", "
2198 << values[i * 4 + 3] << "},\n";
2199 }
2200 indent--;
2201 stream << get_indent() << "};\n";
2202
2203 rhs << shape_name;
2204 } else {
2205 // Emit a declaration like:
2206 // struct {const int f_0; const char f_1; const int f_2;} foo = {3, 'c', 4};
2207
2208 // Get the args
2209 vector<string> values;
2210 for (size_t i = 0; i < op->args.size(); i++) {
2211 values.push_back(print_expr(op->args[i]));
2212 }
2213 stream << get_indent() << "struct {\n";
2214 // List the types.
2215 indent++;
2216 for (size_t i = 0; i < op->args.size(); i++) {
2217 stream << get_indent() << "const " << print_type(op->args[i].type()) << " f_" << i << ";\n";
2218 }
2219 indent--;
2220 string struct_name = unique_name('s');
2221 stream << get_indent() << "} " << struct_name << " = {\n";
2222 // List the values.
2223 indent++;
2224 for (size_t i = 0; i < op->args.size(); i++) {
2225 stream << get_indent() << values[i];
2226 if (i < op->args.size() - 1) stream << ",";
2227 stream << "\n";
2228 }
2229 indent--;
2230 stream << get_indent() << "};\n";
2231
2232 // Return a pointer to it of the appropriate type
2233
2234 // TODO: This is dubious type-punning. We really need to
2235 // find a better way to do this. We dodge the problem for
2236 // the specific case of buffer shapes in the case above.
2237 if (op->type.handle_type) {
2238 rhs << "(" << print_type(op->type) << ")";
2239 }
2240 rhs << "(&" << struct_name << ")";
2241 }
2242 } else if (op->is_intrinsic(Call::stringify)) {
2243 // Rewrite to an snprintf
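// For illustration (cf. the golden output in test() below), stringify(3, "\n")
// lowers to roughly:
//   char b0[1024];
//   snprintf(b0, 1024, "%lld%s", (long long)(3), "\n");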
2244 vector<string> printf_args;
2245 string format_string = "";
2246 for (size_t i = 0; i < op->args.size(); i++) {
2247 Type t = op->args[i].type();
2248 printf_args.push_back(print_expr(op->args[i]));
2249 if (t.is_int()) {
2250 format_string += "%lld";
2251 printf_args[i] = "(long long)(" + printf_args[i] + ")";
2252 } else if (t.is_uint()) {
2253 format_string += "%llu";
2254 printf_args[i] = "(long long unsigned)(" + printf_args[i] + ")";
2255 } else if (t.is_float()) {
2256 if (t.bits() == 32) {
2257 format_string += "%f";
2258 } else {
2259 format_string += "%e";
2260 }
2261 } else if (op->args[i].as<StringImm>()) {
2262 format_string += "%s";
2263 } else {
2264 internal_assert(t.is_handle());
2265 format_string += "%p";
2266 }
2267 }
2268 string buf_name = unique_name('b');
2269 stream << get_indent() << "char " << buf_name << "[1024];\n";
2270 stream << get_indent() << "snprintf(" << buf_name << ", 1024, \"" << format_string << "\", " << with_commas(printf_args) << ");\n";
2271 rhs << buf_name;
2272
2273 } else if (op->is_intrinsic(Call::register_destructor)) {
2274 internal_assert(op->args.size() == 2);
2275 const StringImm *fn = op->args[0].as<StringImm>();
2276 internal_assert(fn);
2277 string arg = print_expr(op->args[1]);
2278
2279 stream << get_indent();
2280 // Make a struct on the stack that calls the given function as a destructor
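// For illustration, with a hypothetical fn "halide_some_cleanup" and argument
// id _3, this emits roughly:
//   struct s0 { void * const ucon; void * const arg;
//     s0(void *ucon, void *a) : ucon(ucon), arg((void *)a) {}
//     ~s0() { halide_some_cleanup(ucon, arg); } } d0(_ucon, _3);
// so the cleanup call runs whenever the enclosing scope exits.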
2281 string struct_name = unique_name('s');
2282 string instance_name = unique_name('d');
2283 stream << "struct " << struct_name << " { "
2284 << "void * const ucon; "
2285 << "void * const arg; "
2286 << "" << struct_name << "(void *ucon, void *a) : ucon(ucon), arg((void *)a) {} "
2287 << "~" << struct_name << "() { " << fn->value + "(ucon, arg); } "
2288 << "} " << instance_name << "(_ucon, " << arg << ");\n";
2289 rhs << print_expr(0);
2290 } else if (op->is_intrinsic(Call::div_round_to_zero)) {
2291 rhs << print_expr(op->args[0]) << " / " << print_expr(op->args[1]);
2292 } else if (op->is_intrinsic(Call::mod_round_to_zero)) {
2293 rhs << print_expr(op->args[0]) << " % " << print_expr(op->args[1]);
2294 } else if (op->is_intrinsic(Call::signed_integer_overflow)) {
2295 user_error << "Signed integer overflow occurred during constant-folding. Signed"
2296 " integer overflow for int32 and int64 is undefined behavior in"
2297 " Halide.\n";
2298 } else if (op->is_intrinsic(Call::prefetch)) {
2299 user_assert((op->args.size() == 4) && is_one(op->args[2]))
2300 << "Only prefetch of 1 cache line is supported in C backend.\n";
2301 const Variable *base = op->args[0].as<Variable>();
2302 internal_assert(base && base->type.is_handle());
2303 rhs << "__builtin_prefetch("
2304 << "((" << print_type(op->type) << " *)" << print_name(base->name)
2305 << " + " << print_expr(op->args[1]) << "), 1)";
2306 } else if (op->is_intrinsic(Call::size_of_halide_buffer_t)) {
2307 rhs << "(sizeof(halide_buffer_t))";
2308 } else if (op->is_intrinsic(Call::strict_float)) {
2309 internal_assert(op->args.size() == 1);
2310 string arg0 = print_expr(op->args[0]);
2311 rhs << "(" << arg0 << ")";
2312 } else if (op->is_intrinsic()) {
2313 // TODO: other intrinsics
2314 internal_error << "Unhandled intrinsic in C backend: " << op->name << "\n";
2315 } else {
2316 // Generic extern calls
2317 rhs << print_extern_call(op);
2318 }
2319
2320 // Special-case halide_print, which has IR that returns int, but really returns void.
2321 // The clean thing to do would be to change the definition of halide_print() to return
2322 // an ignored int, but as halide_print() has many overrides downstream (and in third-party
2323 // consumers), this is arguably a simpler fix for allowing halide_print() to work in the C++ backend.
2324 if (op->name == "halide_print") {
2325 stream << get_indent() << rhs.str() << ";\n";
2326 // Make an innocuous assignment value for our caller (probably an Evaluate node) to ignore.
2327 print_assignment(op->type, "0");
2328 } else {
2329 print_assignment(op->type, rhs.str());
2330 }
2331 }
2332
2333 string CodeGen_C::print_scalarized_expr(const Expr &e) {
2334 Type t = e.type();
2335 internal_assert(t.is_vector());
2336 string v = unique_name('_');
2337 stream << get_indent() << print_type(t, AppendSpace) << v << ";\n";
2338 for (int lane = 0; lane < t.lanes(); lane++) {
2339 Expr e2 = extract_lane(e, lane);
2340 string elem = print_expr(e2);
2341 ostringstream rhs;
2342 rhs << v << ".replace(" << lane << ", " << elem << ")";
2343 v = print_assignment(t, rhs.str());
2344 }
2345 return v;
2346 }
2347
2348 string CodeGen_C::print_extern_call(const Call *op) {
2349 if (op->type.is_vector()) {
2350 // Need to split into multiple scalar calls.
2351 return print_scalarized_expr(op);
2352 }
2353 ostringstream rhs;
2354 vector<string> args(op->args.size());
2355 for (size_t i = 0; i < op->args.size(); i++) {
2356 args[i] = print_expr(op->args[i]);
2357 // This substitution ensures const correctness for all calls
2358 if (args[i] == "__user_context") {
2359 args[i] = "_ucon";
2360 }
2361 }
2362 if (function_takes_user_context(op->name)) {
2363 args.insert(args.begin(), "_ucon");
2364 }
2365 rhs << op->name << "(" << with_commas(args) << ")";
2366 return rhs.str();
2367 }
2368
2369 void CodeGen_C::visit(const Load *op) {
2370 user_assert(is_one(op->predicate)) << "Predicated load is not supported by C backend.\n";
2371
2372 // TODO: We could replicate the logic in the llvm codegen which decides whether
2373 // the vector access can be aligned. Doing so would also require introducing
2374 // aligned type equivalents for all the vector types.
2375 ostringstream rhs;
2376
2377 Type t = op->type;
2378 string name = print_name(op->name);
2379
2380 // If we're loading a contiguous ramp into a vector, just load the vector
2381 Expr dense_ramp_base = strided_ramp_base(op->index, 1);
2382 if (dense_ramp_base.defined()) {
2383 internal_assert(t.is_vector());
2384 string id_ramp_base = print_expr(dense_ramp_base);
2385 rhs << print_type(t) + "::load(" << name << ", " << id_ramp_base << ")";
2386 } else if (op->index.type().is_vector()) {
2387 // If index is a vector, gather vector elements.
2388 internal_assert(t.is_vector());
2389 string id_index = print_expr(op->index);
2390 rhs << print_type(t) + "::load(" << name << ", " << id_index << ")";
2391 } else {
2392 string id_index = print_expr(op->index);
2393 bool type_cast_needed = !(allocations.contains(op->name) &&
2394 allocations.get(op->name).type.element_of() == t.element_of());
2395 if (type_cast_needed) {
2396 rhs << "((const " << print_type(t.element_of()) << " *)" << name << ")";
2397 } else {
2398 rhs << name;
2399 }
2400 rhs << "[" << id_index << "]";
2401 }
2402 print_assignment(t, rhs.str());
2403 }
2404
2405 void CodeGen_C::visit(const Store *op) {
2406 user_assert(is_one(op->predicate)) << "Predicated store is not supported by C backend.\n";
2407
2408 Type t = op->value.type();
2409
2410 if (inside_atomic_mutex_node) {
2411 user_assert(t.is_scalar())
2412 << "The vectorized atomic operation for the store" << op->name
2413 << " is lowered into a mutex lock, which does not support vectorization.\n";
2414 }
2415
2416 // Issue atomic store if we are in the designated producer.
2417 if (emit_atomic_stores) {
2418 stream << "#if defined(_OPENMP)\n";
2419 stream << "#pragma omp atomic\n";
2420 stream << "#else\n";
2421 stream << "#error \"Atomic stores in the C backend are only supported in compilers that support OpenMP.\"\n";
2422 stream << "#endif\n";
2423 }
2424
2425 string id_value = print_expr(op->value);
2426 string name = print_name(op->name);
2427
2428 // TODO: We could replicate the logic in the llvm codegen which decides whether
2429 // the vector access can be aligned. Doing so would also require introducing
2430 // aligned type equivalents for all the vector types.
2431
2432 // If we're writing a contiguous ramp, just store the vector.
2433 Expr dense_ramp_base = strided_ramp_base(op->index, 1);
2434 if (dense_ramp_base.defined()) {
2435 internal_assert(op->value.type().is_vector());
2436 string id_ramp_base = print_expr(dense_ramp_base);
2437 stream << get_indent() << id_value + ".store(" << name << ", " << id_ramp_base << ");\n";
2438 } else if (op->index.type().is_vector()) {
2439 // If index is a vector, scatter vector elements.
2440 internal_assert(t.is_vector());
2441 string id_index = print_expr(op->index);
2442 stream << get_indent() << id_value + ".store(" << name << ", " << id_index << ");\n";
2443 } else {
2444 bool type_cast_needed =
2445 t.is_handle() ||
2446 !allocations.contains(op->name) ||
2447 allocations.get(op->name).type != t;
2448
2449 string id_index = print_expr(op->index);
2450 stream << get_indent();
2451 if (type_cast_needed) {
2452 stream << "((" << print_type(t) << " *)" << name << ")";
2453 } else {
2454 stream << name;
2455 }
2456 stream << "[" << id_index << "] = " << id_value << ";\n";
2457 }
2458 cache.clear();
2459 }
2460
2461 void CodeGen_C::visit(const Let *op) {
2462 string id_value = print_expr(op->value);
2463 Expr body = op->body;
2464 if (op->value.type().is_handle()) {
2465 // The body might contain a Load that references this directly
2466 // by name, so we can't rewrite the name.
2467 stream << get_indent() << print_type(op->value.type())
2468 << " " << print_name(op->name)
2469 << " = " << id_value << ";\n";
2470 } else {
2471 Expr new_var = Variable::make(op->value.type(), id_value);
2472 body = substitute(op->name, new_var, body);
2473 }
2474 print_expr(body);
2475 }
2476
2477 void CodeGen_C::visit(const Select *op) {
2478 ostringstream rhs;
2479 string type = print_type(op->type);
2480 string true_val = print_expr(op->true_value);
2481 string false_val = print_expr(op->false_value);
2482 string cond = print_expr(op->condition);
2483
2484 // clang doesn't support the ternary operator on OpenCL style vectors.
2485 // See: https://bugs.llvm.org/show_bug.cgi?id=33103
2486 if (op->condition.type().is_scalar()) {
2487 rhs << "(" << type << ")"
2488 << "(" << cond
2489 << " ? " << true_val
2490 << " : " << false_val
2491 << ")";
2492 } else {
2493 rhs << type << "::select(" << cond << ", " << true_val << ", " << false_val << ")";
2494 }
2495 print_assignment(op->type, rhs.str());
2496 }
2497
2498 void CodeGen_C::visit(const LetStmt *op) {
2499 string id_value = print_expr(op->value);
2500 Stmt body = op->body;
2501 if (op->value.type().is_handle()) {
2502 // The body might contain a Load or Store that references this
2503 // directly by name, so we can't rewrite the name.
2504 stream << get_indent() << print_type(op->value.type())
2505 << " " << print_name(op->name)
2506 << " = " << id_value << ";\n";
2507 } else {
2508 Expr new_var = Variable::make(op->value.type(), id_value);
2509 body = substitute(op->name, new_var, body);
2510 }
2511 body.accept(this);
2512 }
2513
2514 // Halide asserts have different semantics to C asserts. They're
2515 // supposed to clean up and make the containing function return
2516 // -1, so we can't use the C version of assert. Instead we convert
2517 // to an if statement.
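// For illustration (cf. the golden output in test() below), the heap
// out-of-memory check becomes roughly:
//   if (!_tmp_heap)
//   {
//    int32_t _4 = halide_error_out_of_memory(_ucon);
//    return _4;
//   }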
2518 void CodeGen_C::create_assertion(const string &id_cond, const Expr &message) {
2519 internal_assert(!message.defined() || message.type() == Int(32))
2520 << "Assertion result is not an int: " << message;
2521
2522 if (target.has_feature(Target::NoAsserts)) {
2523 stream << get_indent() << "halide_unused(" << id_cond << ");\n";
2524 return;
2525 }
2526
2527 stream << get_indent() << "if (!" << id_cond << ")\n";
2528 open_scope();
2529 string id_msg = print_expr(message);
2530 stream << get_indent() << "return " << id_msg << ";\n";
2531 close_scope("");
2532 }
2533
2534 void CodeGen_C::create_assertion(const Expr &cond, const Expr &message) {
2535 create_assertion(print_expr(cond), message);
2536 }
2537
2538 void CodeGen_C::visit(const AssertStmt *op) {
2539 create_assertion(op->condition, op->message);
2540 }
2541
2542 void CodeGen_C::visit(const ProducerConsumer *op) {
2543 stream << get_indent();
2544 if (op->is_producer) {
2545 stream << "// produce " << op->name << "\n";
2546 } else {
2547 stream << "// consume " << op->name << "\n";
2548 }
2549 print_stmt(op->body);
2550 }
2551
2552 void CodeGen_C::visit(const Fork *op) {
2553 // TODO: This doesn't actually work with nested tasks
2554 stream << get_indent() << "#pragma omp parallel\n";
2555 open_scope();
2556 stream << get_indent() << "#pragma omp single\n";
2557 open_scope();
2558 stream << get_indent() << "#pragma omp task\n";
2559 open_scope();
2560 print_stmt(op->first);
2561 close_scope("");
2562 stream << get_indent() << "#pragma omp task\n";
2563 open_scope();
2564 print_stmt(op->rest);
2565 close_scope("");
2566 stream << get_indent() << "#pragma omp taskwait\n";
2567 close_scope("");
2568 close_scope("");
2569 }
2570
2571 void CodeGen_C::visit(const Acquire *op) {
2572 string id_sem = print_expr(op->semaphore);
2573 string id_count = print_expr(op->count);
2574 open_scope();
2575 stream << get_indent() << "while (!halide_semaphore_try_acquire(" << id_sem << ", " << id_count << "))\n";
2576 open_scope();
2577 stream << get_indent() << "#pragma omp taskyield\n";
2578 close_scope("");
2579 op->body.accept(this);
2580 close_scope("");
2581 }
2582
2583 void CodeGen_C::visit(const Atomic *op) {
2584 if (!op->mutex_name.empty()) {
2585 internal_assert(!inside_atomic_mutex_node)
2586 << "Nested atomic mutex locks detected. This might causes a deadlock.\n";
2587 ScopedValue<bool> old_inside_atomic_mutex_node(inside_atomic_mutex_node, true);
2588 op->body.accept(this);
2589 } else {
2590 // Issue atomic stores.
2591 ScopedValue<bool> old_emit_atomic_stores(emit_atomic_stores, true);
2592 op->body.accept(this);
2593 }
2594 }
2595
2596 void CodeGen_C::visit(const For *op) {
2597 string id_min = print_expr(op->min);
2598 string id_extent = print_expr(op->extent);
2599
2600 if (op->for_type == ForType::Parallel) {
2601 stream << get_indent() << "#pragma omp parallel for\n";
2602 } else {
2603 internal_assert(op->for_type == ForType::Serial)
2604 << "Can only emit serial or parallel for loops to C\n";
2605 }
2606
2607 stream << get_indent() << "for (int "
2608 << print_name(op->name)
2609 << " = " << id_min
2610 << "; "
2611 << print_name(op->name)
2612 << " < " << id_min
2613 << " + " << id_extent
2614 << "; "
2615 << print_name(op->name)
2616 << "++)\n";
2617
2618 open_scope();
2619 op->body.accept(this);
2620 close_scope("for " + print_name(op->name));
2621 }
2622
2623 void CodeGen_C::visit(const Ramp *op) {
2624 Type vector_type = op->type.with_lanes(op->lanes);
2625 string id_base = print_expr(op->base);
2626 string id_stride = print_expr(op->stride);
2627 print_assignment(vector_type, print_type(vector_type) + "::ramp(" + id_base + ", " + id_stride + ")");
2628 }
2629
2630 void CodeGen_C::visit(const Broadcast *op) {
2631 Type vector_type = op->type.with_lanes(op->lanes);
2632 string id_value = print_expr(op->value);
2633 string rhs;
2634 if (op->lanes > 1) {
2635 rhs = print_type(vector_type) + "::broadcast(" + id_value + ")";
2636 } else {
2637 rhs = id_value;
2638 }
2639
2640 print_assignment(vector_type, rhs);
2641 }
2642
2643 void CodeGen_C::visit(const Provide *op) {
2644 internal_error << "Cannot emit Provide statements as C\n";
2645 }
2646
2647 void CodeGen_C::visit(const Allocate *op) {
2648 open_scope();
2649
2650 string op_name = print_name(op->name);
2651 string op_type = print_type(op->type, AppendSpace);
2652
2653 // For sizes less than 8k, do a stack allocation
2654 bool on_stack = false;
2655 int32_t constant_size;
2656 string size_id;
2657 Type size_id_type;
2658
2659 if (op->new_expr.defined()) {
2660 Allocation alloc;
2661 alloc.type = op->type;
2662 allocations.push(op->name, alloc);
2663 heap_allocations.push(op->name);
2664 stream << op_type << "*" << op_name << " = (" << print_expr(op->new_expr) << ");\n";
2665 } else {
2666 constant_size = op->constant_allocation_size();
2667 if (constant_size > 0) {
2668 int64_t stack_bytes = constant_size * op->type.bytes();
2669
2670 if (stack_bytes > ((int64_t(1) << 31) - 1)) {
2671 user_error << "Total size for allocation "
2672 << op->name << " is constant but exceeds 2^31 - 1.\n";
2673 } else {
2674 size_id_type = Int(32);
2675 size_id = print_expr(make_const(size_id_type, constant_size));
2676
2677 if (op->memory_type == MemoryType::Stack ||
2678 (op->memory_type == MemoryType::Auto &&
2679 can_allocation_fit_on_stack(stack_bytes))) {
2680 on_stack = true;
2681 }
2682 }
2683 } else {
2684 // Check that the allocation is not scalar (if it were scalar
2685 // it would have constant size).
2686 internal_assert(!op->extents.empty());
2687
2688 size_id = print_assignment(Int(64), print_expr(op->extents[0]));
2689 size_id_type = Int(64);
2690
2691 for (size_t i = 1; i < op->extents.size(); i++) {
2692 // Make the code a little less cluttered for two-dimensional case
2693 string new_size_id_rhs;
2694 string next_extent = print_expr(op->extents[i]);
2695 if (i > 1) {
2696 new_size_id_rhs = "(" + size_id + " > ((int64_t(1) << 31) - 1)) ? " + size_id + " : (" + size_id + " * " + next_extent + ")";
2697 } else {
2698 new_size_id_rhs = size_id + " * " + next_extent;
2699 }
2700 size_id = print_assignment(Int(64), new_size_id_rhs);
2701 }
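// For illustration (cf. the golden output in test() below), tmp.heap with
// extents {43, beta} produces:
//   int64_t _1 = 43;
//   int64_t _2 = _1 * _beta;
// before the 2^31 - 1 overflow check emitted next.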
2702 stream << get_indent() << "if (("
2703 << size_id << " > ((int64_t(1) << 31) - 1)) || (("
2704 << size_id << " * sizeof("
2705 << op_type << ")) > ((int64_t(1) << 31) - 1)))\n";
2706 open_scope();
2707 stream << get_indent();
2708 // TODO: call halide_error_buffer_allocation_too_large() here instead
2709 // TODO: call create_assertion() so that NoAssertions works
2710 stream << "halide_error(_ucon, "
2711 << "\"32-bit signed overflow computing size of allocation " << op->name << "\\n\");\n";
2712 stream << get_indent() << "return -1;\n";
2713 close_scope("overflow test " + op->name);
2714 }
2715
2716 // Check the condition to see if this allocation should actually be created.
2717 // If the allocation is on the stack, the only condition we can respect is
2718 // unconditional false (otherwise a non-constant-sized array declaration
2719 // will be generated).
2720 if (!on_stack || is_zero(op->condition)) {
2721 Expr conditional_size = Select::make(op->condition,
2722 Variable::make(size_id_type, size_id),
2723 make_const(size_id_type, 0));
2724 conditional_size = simplify(conditional_size);
2725 size_id = print_assignment(Int(64), print_expr(conditional_size));
2726 }
2727
2728 Allocation alloc;
2729 alloc.type = op->type;
2730 allocations.push(op->name, alloc);
2731
2732 stream << get_indent() << op_type;
2733
2734 if (on_stack) {
2735 stream << op_name
2736 << "[" << size_id << "];\n";
2737 } else {
2738 stream << "*"
2739 << op_name
2740 << " = ("
2741 << op_type
2742 << " *)halide_malloc(_ucon, sizeof("
2743 << op_type
2744 << ")*" << size_id << ");\n";
2745 heap_allocations.push(op->name);
2746 }
2747 }
2748
2749 if (!on_stack) {
2750 create_assertion(op_name, Call::make(Int(32), "halide_error_out_of_memory", {}, Call::Extern));
2751
2752 stream << get_indent();
2753 string free_function = op->free_function.empty() ? "halide_free" : op->free_function;
2754 stream << "HalideFreeHelper " << op_name << "_free(_ucon, "
2755 << op_name << ", " << free_function << ");\n";
2756 }
2757
2758 op->body.accept(this);
2759
2760 // Free the memory if it was allocated on the heap and there is no matching
2761 // Free node.
2762 print_heap_free(op->name);
2763 if (allocations.contains(op->name)) {
2764 allocations.pop(op->name);
2765 }
2766
2767 close_scope("alloc " + print_name(op->name));
2768 }
2769
2770 void CodeGen_C::print_heap_free(const std::string &alloc_name) {
2771 if (heap_allocations.contains(alloc_name)) {
2772 stream << get_indent() << print_name(alloc_name) << "_free.free();\n";
2773 heap_allocations.pop(alloc_name);
2774 }
2775 }
2776
2777 void CodeGen_C::visit(const Free *op) {
2778 print_heap_free(op->name);
2779 allocations.pop(op->name);
2780 }
2781
2782 void CodeGen_C::visit(const Realize *op) {
2783 internal_error << "Cannot emit realize statements to C\n";
2784 }
2785
2786 void CodeGen_C::visit(const Prefetch *op) {
2787 internal_error << "Cannot emit prefetch statements to C\n";
2788 }
2789
2790 void CodeGen_C::visit(const IfThenElse *op) {
2791 string cond_id = print_expr(op->condition);
2792
2793 stream << get_indent() << "if (" << cond_id << ")\n";
2794 open_scope();
2795 op->then_case.accept(this);
2796 close_scope("if " + cond_id);
2797
2798 if (op->else_case.defined()) {
2799 stream << get_indent() << "else\n";
2800 open_scope();
2801 op->else_case.accept(this);
2802 close_scope("if " + cond_id + " else");
2803 }
2804 }
2805
2806 void CodeGen_C::visit(const Evaluate *op) {
2807 if (is_const(op->value)) return;
2808 string id = print_expr(op->value);
2809 stream << get_indent() << "halide_unused(" << id << ");\n";
2810 }
2811
2812 void CodeGen_C::visit(const Shuffle *op) {
2813 internal_assert(!op->vectors.empty());
2814 internal_assert(op->vectors[0].type().is_vector());
2815 for (size_t i = 1; i < op->vectors.size(); i++) {
2816 internal_assert(op->vectors[0].type() == op->vectors[i].type());
2817 }
2818 internal_assert(op->type.lanes() == (int)op->indices.size());
2819 const int max_index = (int)(op->vectors[0].type().lanes() * op->vectors.size());
2820 for (int i : op->indices) {
2821 internal_assert(i >= -1 && i < max_index);
2822 }
2823
2824 std::vector<string> vecs;
2825 for (Expr v : op->vectors) {
2826 vecs.push_back(print_expr(v));
2827 }
2828 string src = vecs[0];
2829 if (op->vectors.size() > 1) {
2830 ostringstream rhs;
2831 string storage_name = unique_name('_');
2832 stream << get_indent() << "const " << print_type(op->vectors[0].type()) << " " << storage_name << "[] = { " << with_commas(vecs) << " };\n";
2833
2834 rhs << print_type(op->type) << "::concat(" << op->vectors.size() << ", " << storage_name << ")";
2835 src = print_assignment(op->type, rhs.str());
2836 }
2837 ostringstream rhs;
2838 if (op->type.is_scalar()) {
2839 rhs << src << "[" << op->indices[0] << "]";
2840 } else {
2841 string indices_name = unique_name('_');
2842 stream << get_indent() << "const int32_t " << indices_name << "[" << op->indices.size() << "] = { " << with_commas(op->indices) << " };\n";
2843 rhs << print_type(op->type) << "::shuffle(" << src << ", " << indices_name << ")";
2844 }
2845 print_assignment(op->type, rhs.str());
2846 }
2847
2848 void CodeGen_C::test() {
2849 LoweredArgument buffer_arg("buf", Argument::OutputBuffer, Int(32), 3, ArgumentEstimates{});
2850 LoweredArgument float_arg("alpha", Argument::InputScalar, Float(32), 0, ArgumentEstimates{});
2851 LoweredArgument int_arg("beta", Argument::InputScalar, Int(32), 0, ArgumentEstimates{});
2852 LoweredArgument user_context_arg("__user_context", Argument::InputScalar, type_of<const void *>(), 0, ArgumentEstimates{});
2853 vector<LoweredArgument> args = {buffer_arg, float_arg, int_arg, user_context_arg};
2854 Var x("x");
2855 Param<float> alpha("alpha");
2856 Param<int> beta("beta");
2857 Expr e = Select::make(alpha > 4.0f, print_when(x < 1, 3), 2);
2858 Stmt s = Store::make("buf", e, x, Parameter(), const_true(), ModulusRemainder());
2859 s = LetStmt::make("x", beta + 1, s);
2860 s = Block::make(s, Free::make("tmp.stack"));
2861 s = Allocate::make("tmp.stack", Int(32), MemoryType::Stack, {127}, const_true(), s);
2862 s = Allocate::make("tmp.heap", Int(32), MemoryType::Heap, {43, beta}, const_true(), s);
2863 Expr buf = Variable::make(Handle(), "buf.buffer");
2864 s = LetStmt::make("buf", Call::make(Handle(), Call::buffer_get_host, {buf}, Call::Extern), s);
2865
2866 Module m("", get_host_target());
2867 m.append(LoweredFunc("test1", args, s, LinkageType::External));
2868
2869 ostringstream source;
2870 ostringstream macros;
2871 {
2872 CodeGen_C cg(source, Target("host"), CodeGen_C::CImplementation);
2873 cg.compile(m);
2874 cg.add_common_macros(macros);
2875 }
2876
2877 string src = source.str();
2878 string correct_source =
2879 headers +
2880 globals +
2881 string((const char *)halide_internal_runtime_header_HalideRuntime_h) + '\n' +
2882 string((const char *)halide_internal_initmod_inlined_c) + '\n' +
2883 macros.str() + '\n' + kDefineMustUseResult + R"GOLDEN_CODE(
2884 #ifndef HALIDE_FUNCTION_ATTRS
2885 #define HALIDE_FUNCTION_ATTRS
2886 #endif
2887
2888
2889
2890 #ifdef __cplusplus
2891 extern "C" {
2892 #endif
2893
2894 HALIDE_FUNCTION_ATTRS
2895 int test1(struct halide_buffer_t *_buf_buffer, float _alpha, int32_t _beta, void const *__user_context) {
2896 void * const _ucon = const_cast<void *>(__user_context);
2897 void *_0 = _halide_buffer_get_host(_buf_buffer);
2898 void * _buf = _0;
2899 {
2900 int64_t _1 = 43;
2901 int64_t _2 = _1 * _beta;
2902 if ((_2 > ((int64_t(1) << 31) - 1)) || ((_2 * sizeof(int32_t )) > ((int64_t(1) << 31) - 1)))
2903 {
2904 halide_error(_ucon, "32-bit signed overflow computing size of allocation tmp.heap\n");
2905 return -1;
2906 } // overflow test tmp.heap
2907 int64_t _3 = _2;
2908 int32_t *_tmp_heap = (int32_t *)halide_malloc(_ucon, sizeof(int32_t )*_3);
2909 if (!_tmp_heap)
2910 {
2911 int32_t _4 = halide_error_out_of_memory(_ucon);
2912 return _4;
2913 }
2914 HalideFreeHelper _tmp_heap_free(_ucon, _tmp_heap, halide_free);
2915 {
2916 int32_t _tmp_stack[127];
2917 int32_t _5 = _beta + 1;
2918 int32_t _6;
2919 bool _7 = _5 < 1;
2920 if (_7)
2921 {
2922 char b0[1024];
2923 snprintf(b0, 1024, "%lld%s", (long long)(3), "\n");
2924 char const *_8 = b0;
2925 halide_print(_ucon, _8);
2926 int32_t _9 = 0;
2927 int32_t _10 = return_second(_9, 3);
2928 _6 = _10;
2929 } // if _7
2930 else
2931 {
2932 _6 = 3;
2933 } // if _7 else
2934 int32_t _11 = _6;
2935 float _12 = float_from_bits(1082130432 /* 4 */);
2936 bool _13 = _alpha > _12;
2937 int32_t _14 = (int32_t)(_13 ? _11 : 2);
2938 ((int32_t *)_buf)[_5] = _14;
2939 } // alloc _tmp_stack
2940 _tmp_heap_free.free();
2941 } // alloc _tmp_heap
2942 return 0;
2943 }
2944
2945 #ifdef __cplusplus
2946 } // extern "C"
2947 #endif
2948
2949 )GOLDEN_CODE";
2950
2951 if (src != correct_source) {
2952 int diff = 0;
2953 while (src[diff] == correct_source[diff]) {
2954 diff++;
2955 }
2956 int diff_end = diff + 1;
2957 while (diff > 0 && src[diff] != '\n') {
2958 diff--;
2959 }
2960 while (diff_end < (int)src.size() && src[diff_end] != '\n') {
2961 diff_end++;
2962 }
2963
2964 internal_error
2965 << "Correct source code:\n"
2966 << correct_source
2967 << "Actual source code:\n"
2968 << src
2969 << "Difference starts at:\n"
2970 << "Correct: " << correct_source.substr(diff, diff_end - diff) << "\n"
2971 << "Actual: " << src.substr(diff, diff_end - diff) << "\n";
2972 }
2973
2974 std::cout << "CodeGen_C test passed\n";
2975 }
2976
2977 } // namespace Internal
2978 } // namespace Halide
2979