1 #include <iostream>
2 #include <limits>
3 
4 #include "CodeGen_C.h"
5 #include "CodeGen_Internal.h"
6 #include "Deinterleave.h"
7 #include "IROperator.h"
8 #include "Lerp.h"
9 #include "Param.h"
10 #include "Simplify.h"
11 #include "Substitute.h"
12 #include "Type.h"
13 #include "Util.h"
14 #include "Var.h"
15 
16 namespace Halide {
17 namespace Internal {
18 
19 using std::map;
20 using std::ostream;
21 using std::ostringstream;
22 using std::string;
23 using std::vector;
24 
25 extern "C" unsigned char halide_internal_initmod_inlined_c[];
26 extern "C" unsigned char halide_internal_runtime_header_HalideRuntime_h[];
27 extern "C" unsigned char halide_internal_runtime_header_HalideRuntimeCuda_h[];
28 extern "C" unsigned char halide_internal_runtime_header_HalideRuntimeHexagonHost_h[];
29 extern "C" unsigned char halide_internal_runtime_header_HalideRuntimeMetal_h[];
30 extern "C" unsigned char halide_internal_runtime_header_HalideRuntimeOpenCL_h[];
31 extern "C" unsigned char halide_internal_runtime_header_HalideRuntimeOpenGLCompute_h[];
32 extern "C" unsigned char halide_internal_runtime_header_HalideRuntimeOpenGL_h[];
33 extern "C" unsigned char halide_internal_runtime_header_HalideRuntimeQurt_h[];
34 extern "C" unsigned char halide_internal_runtime_header_HalideRuntimeD3D12Compute_h[];
35 
36 namespace {
37 
// HALIDE_MUST_USE_RESULT defined here is intended to exactly
// duplicate the definition in HalideRuntime.h (so that either or
// both can be present, in any order). It prefers [[nodiscard]] when
// available, falls back to __attribute__((warn_unused_result)), and
// otherwise expands to nothing. Emitted verbatim into generated code.
const char *const kDefineMustUseResult = R"INLINE_CODE(#ifndef HALIDE_MUST_USE_RESULT
#ifdef __has_attribute
#if __has_attribute(nodiscard)
#define HALIDE_MUST_USE_RESULT [[nodiscard]]
#elif __has_attribute(warn_unused_result)
#define HALIDE_MUST_USE_RESULT __attribute__((warn_unused_result))
#else
#define HALIDE_MUST_USE_RESULT
#endif
#else
#define HALIDE_MUST_USE_RESULT
#endif
#endif
)INLINE_CODE";
55 
// Standard headers prepended to every generated C++ implementation
// file (not headers/extern decls). These supply the math, string, and
// fixed-width-integer names used by the inlined runtime below.
const string headers =
    "#include <iostream>\n"
    "#include <math.h>\n"
    "#include <float.h>\n"
    "#include <assert.h>\n"
    "#include <limits.h>\n"
    "#include <string.h>\n"
    "#include <stdio.h>\n"
    "#include <stdint.h>\n";
65 
// We now add definitions of things in the runtime which are
// intended to be inlined into every module but are only expressed
// in .ll. The redundancy is regrettable (FIXME).
//
// Contents: math wrappers with Halide-suffixed names (sqrt_f32,
// round_f64, ...), bit-twiddling helpers (popcount, clz, ctz),
// reinterpret/float_from_bits, min/max, and the RAII HalideFreeHelper
// used by generated allocations.
//
// NOTE(review): on _WIN32 only roundf/round are declared and the
// inverse-hyperbolic wrappers (asinh_f32 etc.) are omitted entirely --
// presumably intentional for MSVC, but worth confirming that Windows
// codegen never emits calls to them.
const string globals = R"INLINE_CODE(
extern "C" {
int64_t halide_current_time_ns(void *ctx);
void halide_profiler_pipeline_end(void *, void *);
}

#ifdef _WIN32
__declspec(dllimport) float __cdecl roundf(float);
__declspec(dllimport) double __cdecl round(double);
#else
inline float asinh_f32(float x) {return asinhf(x);}
inline float acosh_f32(float x) {return acoshf(x);}
inline float atanh_f32(float x) {return atanhf(x);}
inline double asinh_f64(double x) {return asinh(x);}
inline double acosh_f64(double x) {return acosh(x);}
inline double atanh_f64(double x) {return atanh(x);}
#endif
inline float sqrt_f32(float x) {return sqrtf(x);}
inline float sin_f32(float x) {return sinf(x);}
inline float asin_f32(float x) {return asinf(x);}
inline float cos_f32(float x) {return cosf(x);}
inline float acos_f32(float x) {return acosf(x);}
inline float tan_f32(float x) {return tanf(x);}
inline float atan_f32(float x) {return atanf(x);}
inline float atan2_f32(float x, float y) {return atan2f(x, y);}
inline float sinh_f32(float x) {return sinhf(x);}
inline float cosh_f32(float x) {return coshf(x);}
inline float tanh_f32(float x) {return tanhf(x);}
inline float hypot_f32(float x, float y) {return hypotf(x, y);}
inline float exp_f32(float x) {return expf(x);}
inline float log_f32(float x) {return logf(x);}
inline float pow_f32(float x, float y) {return powf(x, y);}
inline float floor_f32(float x) {return floorf(x);}
inline float ceil_f32(float x) {return ceilf(x);}
inline float round_f32(float x) {return roundf(x);}

inline double sqrt_f64(double x) {return sqrt(x);}
inline double sin_f64(double x) {return sin(x);}
inline double asin_f64(double x) {return asin(x);}
inline double cos_f64(double x) {return cos(x);}
inline double acos_f64(double x) {return acos(x);}
inline double tan_f64(double x) {return tan(x);}
inline double atan_f64(double x) {return atan(x);}
inline double atan2_f64(double x, double y) {return atan2(x, y);}
inline double sinh_f64(double x) {return sinh(x);}
inline double cosh_f64(double x) {return cosh(x);}
inline double tanh_f64(double x) {return tanh(x);}
inline double hypot_f64(double x, double y) {return hypot(x, y);}
inline double exp_f64(double x) {return exp(x);}
inline double log_f64(double x) {return log(x);}
inline double pow_f64(double x, double y) {return pow(x, y);}
inline double floor_f64(double x) {return floor(x);}
inline double ceil_f64(double x) {return ceil(x);}
inline double round_f64(double x) {return round(x);}

inline float nan_f32() {return NAN;}
inline float neg_inf_f32() {return -INFINITY;}
inline float inf_f32() {return INFINITY;}
inline bool is_nan_f32(float x) {return isnan(x);}
inline bool is_nan_f64(double x) {return isnan(x);}
inline bool is_inf_f32(float x) {return isinf(x);}
inline bool is_inf_f64(double x) {return isinf(x);}
inline bool is_finite_f32(float x) {return isfinite(x);}
inline bool is_finite_f64(double x) {return isfinite(x);}

template<typename A, typename B>
inline A reinterpret(const B &b) {
    #if __cplusplus >= 201103L
    static_assert(sizeof(A) == sizeof(B), "type size mismatch");
    #endif
    A a;
    memcpy(&a, &b, sizeof(a));
    return a;
}
inline float float_from_bits(uint32_t bits) {
    return reinterpret<float, uint32_t>(bits);
}

template<typename T>
inline int halide_popcount(T a) {
    int bits_set = 0;
    while (a != 0) {
        bits_set += a & 1;
        a >>= 1;
    }
    return bits_set;
}

template<typename T>
inline int halide_count_leading_zeros(T a) {
    int leading_zeros = 0;
    int bit = sizeof(a) * 8 - 1;
    while (bit >= 0 && (a & (((T)1) << bit)) == 0) {
        leading_zeros++;
        bit--;
    }
    return leading_zeros;
}

template<typename T>
inline int halide_count_trailing_zeros(T a) {
    int trailing_zeros = 0;
    constexpr int bits = sizeof(a) * 8;
    int bit = 0;
    while (bit < bits && (a & (((T)1) << bit)) == 0) {
        trailing_zeros++;
        bit++;
    }
    return trailing_zeros;
}

template<typename T>
inline T halide_cpp_max(const T &a, const T &b) {return (a > b) ? a : b;}

template<typename T>
inline T halide_cpp_min(const T &a, const T &b) {return (a < b) ? a : b;}

template<typename T>
inline void halide_unused(const T&) {}

template<typename A, typename B>
const B &return_second(const A &a, const B &b) {
    halide_unused(a);
    return b;
}

namespace {
class HalideFreeHelper {
    typedef void (*FreeFunction)(void *user_context, void *p);
    void * user_context;
    void *p;
    FreeFunction free_function;
public:
    HalideFreeHelper(void *user_context, void *p, FreeFunction free_function)
        : user_context(user_context), p(p), free_function(free_function) {}
    ~HalideFreeHelper() { free(); }
    void free() {
        if (p) {
            // TODO: do all free_functions guarantee to ignore a nullptr?
            free_function(user_context, p);
            p = nullptr;
        }
    }
};
} // namespace
)INLINE_CODE";
215 }  // namespace
216 
217 class TypeInfoGatherer : public IRGraphVisitor {
218 private:
219     using IRGraphVisitor::include;
220     using IRGraphVisitor::visit;
221 
include_type(const Type & t)222     void include_type(const Type &t) {
223         if (t.is_vector()) {
224             if (t.is_bool()) {
225                 // bool vectors are always emitted as uint8 in the C++ backend
226                 // TODO: on some architectures, we could do better by choosing
227                 // a bitwidth that matches the other vectors in use; EliminateBoolVectors
228                 // could be used for this with a bit of work.
229                 vector_types_used.insert(UInt(8).with_lanes(t.lanes()));
230             } else if (!t.is_handle()) {
231                 // Vector-handle types can be seen when processing (e.g.)
232                 // require() statements that are vectorized, but they
233                 // will all be scalarized away prior to use, so don't emit
234                 // them.
235                 vector_types_used.insert(t);
236                 if (t.is_int()) {
237                     // If we are including an int-vector type, also include
238                     // the same-width uint-vector type; there are various operations
239                     // that can use uint vectors for intermediate results (e.g. lerp(),
240                     // but also Mod, which can generate a call to abs() for int types,
241                     // which always produces uint results for int inputs in Halide);
242                     // it's easier to just err on the side of extra vectors we don't
243                     // use since they are just type declarations.
244                     vector_types_used.insert(t.with_code(halide_type_uint));
245                 }
246             }
247         }
248     }
249 
include_lerp_types(const Type & t)250     void include_lerp_types(const Type &t) {
251         if (t.is_vector() && t.is_int_or_uint() && (t.bits() >= 8 && t.bits() <= 32)) {
252             Type doubled = t.with_bits(t.bits() * 2);
253             include_type(doubled);
254         }
255     }
256 
257 protected:
include(const Expr & e)258     void include(const Expr &e) override {
259         include_type(e.type());
260         IRGraphVisitor::include(e);
261     }
262 
263     // GCC's __builtin_shuffle takes an integer vector of
264     // the size of its input vector. Make sure this type exists.
visit(const Shuffle * op)265     void visit(const Shuffle *op) override {
266         vector_types_used.insert(Int(32, op->vectors[0].type().lanes()));
267         IRGraphVisitor::visit(op);
268     }
269 
visit(const For * op)270     void visit(const For *op) override {
271         for_types_used.insert(op->for_type);
272         IRGraphVisitor::visit(op);
273     }
274 
visit(const Ramp * op)275     void visit(const Ramp *op) override {
276         include_type(op->type.with_lanes(op->lanes));
277         IRGraphVisitor::visit(op);
278     }
279 
visit(const Broadcast * op)280     void visit(const Broadcast *op) override {
281         include_type(op->type.with_lanes(op->lanes));
282         IRGraphVisitor::visit(op);
283     }
284 
visit(const Cast * op)285     void visit(const Cast *op) override {
286         include_type(op->type);
287         IRGraphVisitor::visit(op);
288     }
289 
visit(const Call * op)290     void visit(const Call *op) override {
291         include_type(op->type);
292         if (op->is_intrinsic(Call::lerp)) {
293             // lower_lerp() can synthesize wider vector types.
294             for (auto &a : op->args) {
295                 include_lerp_types(a.type());
296             }
297         }
298 
299         IRGraphVisitor::visit(op);
300     }
301 
302 public:
303     std::set<ForType> for_types_used;
304     std::set<Type> vector_types_used;
305 };
306 
CodeGen_C(ostream & s,Target t,OutputKind output_kind,const std::string & guard)307 CodeGen_C::CodeGen_C(ostream &s, Target t, OutputKind output_kind, const std::string &guard)
308     : IRPrinter(s), id("$$ BAD ID $$"), target(t), output_kind(output_kind),
309       extern_c_open(false), inside_atomic_mutex_node(false), emit_atomic_stores(false) {
310 
311     if (is_header()) {
312         // If it's a header, emit an include guard.
313         stream << "#ifndef HALIDE_" << print_name(guard) << "\n"
314                << "#define HALIDE_" << print_name(guard) << "\n"
315                << "#include <stdint.h>\n"
316                << "\n"
317                << "// Forward declarations of the types used in the interface\n"
318                << "// to the Halide pipeline.\n"
319                << "//\n";
320         if (target.has_feature(Target::NoRuntime)) {
321             stream << "// For the definitions of these structs, include HalideRuntime.h\n";
322         } else {
323             stream << "// Definitions for these structs are below.\n";
324         }
325         stream << "\n"
326                << "// Halide's representation of a multi-dimensional array.\n"
327                << "// Halide::Runtime::Buffer is a more user-friendly wrapper\n"
328                << "// around this. Its declaration is in HalideBuffer.h\n"
329                << "struct halide_buffer_t;\n"
330                << "\n"
331                << "// Metadata describing the arguments to the generated function.\n"
332                << "// Used to construct calls to the _argv version of the function.\n"
333                << "struct halide_filter_metadata_t;\n"
334                << "\n";
335         // We just forward declared the following types:
336         forward_declared.insert(type_of<halide_buffer_t *>().handle_type);
337         forward_declared.insert(type_of<halide_filter_metadata_t *>().handle_type);
338     } else if (is_extern_decl()) {
339         // Extern decls to be wrapped inside other code (eg python extensions);
340         // emit the forward decls with a minimum of noise. Note that we never
341         // mess with legacy buffer types in this case.
342         stream << "struct halide_buffer_t;\n"
343                << "struct halide_filter_metadata_t;\n"
344                << "\n";
345         forward_declared.insert(type_of<halide_buffer_t *>().handle_type);
346         forward_declared.insert(type_of<halide_filter_metadata_t *>().handle_type);
347     } else {
348         // Include declarations of everything generated C source might want
349         stream
350             << headers
351             << globals
352             << halide_internal_runtime_header_HalideRuntime_h << "\n"
353             << halide_internal_initmod_inlined_c << "\n";
354         add_common_macros(stream);
355         stream << "\n";
356     }
357 
358     stream << kDefineMustUseResult << "\n";
359 
360     // Throw in a default (empty) definition of HALIDE_FUNCTION_ATTRS
361     // (some hosts may define this to e.g. __attribute__((warn_unused_result)))
362     stream << "#ifndef HALIDE_FUNCTION_ATTRS\n";
363     stream << "#define HALIDE_FUNCTION_ATTRS\n";
364     stream << "#endif\n";
365 }
366 
~CodeGen_C()367 CodeGen_C::~CodeGen_C() {
368     set_name_mangling_mode(NameMangling::Default);
369 
370     if (is_header()) {
371         if (!target.has_feature(Target::NoRuntime)) {
372             stream << "\n"
373                    << "// The generated object file that goes with this header\n"
374                    << "// includes a full copy of the Halide runtime so that it\n"
375                    << "// can be used standalone. Declarations for the functions\n"
376                    << "// in the Halide runtime are below.\n";
377             if (target.os == Target::Windows) {
378                 stream
379                     << "//\n"
380                     << "// The inclusion of this runtime means that it is not legal\n"
381                     << "// to link multiple Halide-generated object files together.\n"
382                     << "// This problem is Windows-specific. On other platforms, we\n"
383                     << "// use weak linkage.\n";
384             } else {
385                 stream
386                     << "//\n"
387                     << "// The runtime is defined using weak linkage, so it is legal\n"
388                     << "// to link multiple Halide-generated object files together,\n"
389                     << "// or to clobber any of these functions with your own\n"
390                     << "// definition.\n";
391             }
392             stream << "//\n"
393                    << "// To generate an object file without a full copy of the\n"
394                    << "// runtime, use the -no_runtime target flag. To generate a\n"
395                    << "// standalone Halide runtime to use with such object files\n"
396                    << "// use the -r flag with any Halide generator binary, e.g.:\n"
397                    << "// $ ./my_generator -r halide_runtime -o . target=host\n"
398                    << "\n"
399                    << halide_internal_runtime_header_HalideRuntime_h << "\n";
400             if (target.has_feature(Target::CUDA)) {
401                 stream << halide_internal_runtime_header_HalideRuntimeCuda_h << "\n";
402             }
403             if (target.has_feature(Target::HVX_128) ||
404                 target.has_feature(Target::HVX_64)) {
405                 stream << halide_internal_runtime_header_HalideRuntimeHexagonHost_h << "\n";
406             }
407             if (target.has_feature(Target::Metal)) {
408                 stream << halide_internal_runtime_header_HalideRuntimeMetal_h << "\n";
409             }
410             if (target.has_feature(Target::OpenCL)) {
411                 stream << halide_internal_runtime_header_HalideRuntimeOpenCL_h << "\n";
412             }
413             if (target.has_feature(Target::OpenGLCompute)) {
414                 stream << halide_internal_runtime_header_HalideRuntimeOpenGLCompute_h << "\n";
415             }
416             if (target.has_feature(Target::OpenGL)) {
417                 stream << halide_internal_runtime_header_HalideRuntimeOpenGL_h << "\n";
418             }
419             if (target.has_feature(Target::D3D12Compute)) {
420                 stream << halide_internal_runtime_header_HalideRuntimeD3D12Compute_h << "\n";
421             }
422         }
423         stream << "#endif\n";
424     }
425 }
426 
add_common_macros(std::ostream & dest)427 void CodeGen_C::add_common_macros(std::ostream &dest) {
428     const char *macros = R"INLINE_CODE(
429 // ll suffix in OpenCL is reserved for 128-bit integers.
430 #if defined __OPENCL_VERSION__
431 #define ADD_INT64_T_SUFFIX(x) x##l
432 #define ADD_UINT64_T_SUFFIX(x) x##ul
433 // HLSL doesn't have any suffixes.
434 #elif defined HLSL_VERSION
435 #define ADD_INT64_T_SUFFIX(x) x
436 #define ADD_UINT64_T_SUFFIX(x) x
437 #else
438 #define ADD_INT64_T_SUFFIX(x) x##ll
439 #define ADD_UINT64_T_SUFFIX(x) x##ull
440 #endif
441 )INLINE_CODE";
442     dest << macros;
443 }
444 
add_vector_typedefs(const std::set<Type> & vector_types)445 void CodeGen_C::add_vector_typedefs(const std::set<Type> &vector_types) {
446     if (!vector_types.empty()) {
447         // MSVC has a limit of ~16k for string literals, so split
448         // up these declarations accordingly
449         const char *cpp_vector_decl = R"INLINE_CODE(
450 #if !defined(__has_attribute)
451     #define __has_attribute(x) 0
452 #endif
453 
454 #if !defined(__has_builtin)
455     #define __has_builtin(x) 0
456 #endif
457 
458 template <typename ElementType_, size_t Lanes_>
459 class CppVector {
460 public:
461     typedef ElementType_ ElementType;
462     static const size_t Lanes = Lanes_;
463     typedef CppVector<ElementType, Lanes> Vec;
464     typedef CppVector<uint8_t, Lanes> Mask;
465 
466     CppVector &operator=(const Vec &src) {
467         if (this != &src) {
468             for (size_t i = 0; i < Lanes; i++) {
469                 elements[i] = src[i];
470             }
471         }
472         return *this;
473     }
474 
475     /* not-explicit */ CppVector(const Vec &src) {
476         for (size_t i = 0; i < Lanes; i++) {
477             elements[i] = src[i];
478         }
479     }
480 
481     CppVector() {
482         for (size_t i = 0; i < Lanes; i++) {
483             elements[i] = 0;
484         }
485     }
486 
487     static Vec broadcast(const ElementType &v) {
488         Vec r(empty);
489         for (size_t i = 0; i < Lanes; i++) {
490             r.elements[i] = v;
491         }
492         return r;
493     }
494 
495     static Vec ramp(const ElementType &base, const ElementType &stride) {
496         Vec r(empty);
497         for (size_t i = 0; i < Lanes; i++) {
498             r.elements[i] = base + stride * i;
499         }
500         return r;
501     }
502 
503     static Vec load(const void *base, int32_t offset) {
504         Vec r(empty);
505         memcpy(&r.elements[0], ((const ElementType*)base + offset), sizeof(r.elements));
506         return r;
507     }
508 
509     // gather
510     static Vec load(const void *base, const CppVector<int32_t, Lanes> &offset) {
511         Vec r(empty);
512         for (size_t i = 0; i < Lanes; i++) {
513             r.elements[i] = ((const ElementType*)base)[offset[i]];
514         }
515         return r;
516     }
517 
518     void store(void *base, int32_t offset) const {
519         memcpy(((ElementType*)base + offset), &this->elements[0], sizeof(this->elements));
520     }
521 
522     // scatter
523     void store(void *base, const CppVector<int32_t, Lanes> &offset) const {
524         for (size_t i = 0; i < Lanes; i++) {
525             ((ElementType*)base)[offset[i]] = elements[i];
526         }
527     }
528 
529     static Vec shuffle(const Vec &a, const int32_t indices[Lanes]) {
530         Vec r(empty);
531         for (size_t i = 0; i < Lanes; i++) {
532             if (indices[i] < 0) {
533                 continue;
534             }
535             r.elements[i] = a[indices[i]];
536         }
537         return r;
538     }
539 
540     template<size_t InputLanes>
541     static Vec concat(size_t count, const CppVector<ElementType, InputLanes> vecs[]) {
542         Vec r(empty);
543         for (size_t i = 0; i < Lanes; i++) {
544             r.elements[i] = vecs[i / InputLanes][i % InputLanes];
545         }
546         return r;
547     }
548 
549     Vec replace(size_t i, const ElementType &b) const {
550         Vec r = *this;
551         r.elements[i] = b;
552         return r;
553     }
554 
555     ElementType operator[](size_t i) const {
556         return elements[i];
557     }
558 
559     Vec operator~() const {
560         Vec r(empty);
561         for (size_t i = 0; i < Lanes; i++) {
562             r.elements[i] = ~elements[i];
563         }
564         return r;
565     }
566     Vec operator!() const {
567         Vec r(empty);
568         for (size_t i = 0; i < Lanes; i++) {
569             r.elements[i] = !r.elements[i];
570         }
571         return r;
572     }
573 
574     friend Vec operator+(const Vec &a, const Vec &b) {
575         Vec r(empty);
576         for (size_t i = 0; i < Lanes; i++) {
577             r.elements[i] = a[i] + b[i];
578         }
579         return r;
580     }
581     friend Vec operator-(const Vec &a, const Vec &b) {
582         Vec r(empty);
583         for (size_t i = 0; i < Lanes; i++) {
584             r.elements[i] = a[i] - b[i];
585         }
586         return r;
587     }
588     friend Vec operator*(const Vec &a, const Vec &b) {
589         Vec r(empty);
590         for (size_t i = 0; i < Lanes; i++) {
591             r.elements[i] = a[i] * b[i];
592         }
593         return r;
594     }
595     friend Vec operator/(const Vec &a, const Vec &b) {
596         Vec r(empty);
597         for (size_t i = 0; i < Lanes; i++) {
598             r.elements[i] = a[i] / b[i];
599         }
600         return r;
601     }
602     friend Vec operator%(const Vec &a, const Vec &b) {
603         Vec r(empty);
604         for (size_t i = 0; i < Lanes; i++) {
605             r.elements[i] = a[i] % b[i];
606         }
607         return r;
608     }
609     template <typename OtherElementType>
610     friend Vec operator<<(const Vec &a, const CppVector<OtherElementType, Lanes> &b) {
611         Vec r(empty);
612         for (size_t i = 0; i < Lanes; i++) {
613             r.elements[i] = a[i] << b[i];
614         }
615         return r;
616     }
617     template <typename OtherElementType>
618     friend Vec operator>>(const Vec &a, const CppVector<OtherElementType, Lanes> &b) {
619         Vec r(empty);
620         for (size_t i = 0; i < Lanes; i++) {
621             r.elements[i] = a[i] >> b[i];
622         }
623         return r;
624     }
625     friend Vec operator&(const Vec &a, const Vec &b) {
626         Vec r(empty);
627         for (size_t i = 0; i < Lanes; i++) {
628             r.elements[i] = a[i] & b[i];
629         }
630         return r;
631     }
632     friend Vec operator|(const Vec &a, const Vec &b) {
633         Vec r(empty);
634         for (size_t i = 0; i < Lanes; i++) {
635             r.elements[i] = a[i] | b[i];
636         }
637         return r;
638     }
639 
640     friend Vec operator&&(const Vec &a, const Vec &b) {
641         Vec r(empty);
642         for (size_t i = 0; i < Lanes; i++) {
643             r.elements[i] = a[i] && b[i];
644         }
645         return r;
646     }
647     friend Vec operator||(const Vec &a, const Vec &b) {
648         Vec r(empty);
649         for (size_t i = 0; i < Lanes; i++) {
650             r.elements[i] = a[i] || b[i];
651         }
652         return r;
653     }
654 
655     friend Vec operator+(const Vec &a, const ElementType &b) {
656         Vec r(empty);
657         for (size_t i = 0; i < Lanes; i++) {
658             r.elements[i] = a[i] + b;
659         }
660         return r;
661     }
662     friend Vec operator-(const Vec &a, const ElementType &b) {
663         Vec r(empty);
664         for (size_t i = 0; i < Lanes; i++) {
665             r.elements[i] = a[i] - b;
666         }
667         return r;
668     }
669     friend Vec operator*(const Vec &a, const ElementType &b) {
670         Vec r(empty);
671         for (size_t i = 0; i < Lanes; i++) {
672             r.elements[i] = a[i] * b;
673         }
674         return r;
675     }
676     friend Vec operator/(const Vec &a, const ElementType &b) {
677         Vec r(empty);
678         for (size_t i = 0; i < Lanes; i++) {
679             r.elements[i] = a[i] / b;
680         }
681         return r;
682     }
683     friend Vec operator%(const Vec &a, const ElementType &b) {
684         Vec r(empty);
685         for (size_t i = 0; i < Lanes; i++) {
686             r.elements[i] = a[i] % b;
687         }
688         return r;
689     }
690     friend Vec operator>>(const Vec &a, const ElementType &b) {
691         Vec r(empty);
692         for (size_t i = 0; i < Lanes; i++) {
693             r.elements[i] = a[i] >> b;
694         }
695         return r;
696     }
697     friend Vec operator<<(const Vec &a, const ElementType &b) {
698         Vec r(empty);
699         for (size_t i = 0; i < Lanes; i++) {
700             r.elements[i] = a[i] << b;
701         }
702         return r;
703     }
704     friend Vec operator&(const Vec &a, const ElementType &b) {
705         Vec r(empty);
706         for (size_t i = 0; i < Lanes; i++) {
707             r.elements[i] = a[i] & b;
708         }
709         return r;
710     }
711     friend Vec operator|(const Vec &a, const ElementType &b) {
712         Vec r(empty);
713         for (size_t i = 0; i < Lanes; i++) {
714             r.elements[i] = a[i] | b;
715         }
716         return r;
717     }
718     friend Vec operator&&(const Vec &a, const ElementType &b) {
719         Vec r(empty);
720         for (size_t i = 0; i < Lanes; i++) {
721             r.elements[i] = a[i] && b;
722         }
723         return r;
724     }
725     friend Vec operator||(const Vec &a, const ElementType &b) {
726         Vec r(empty);
727         for (size_t i = 0; i < Lanes; i++) {
728             r.elements[i] = a[i] || b;
729         }
730         return r;
731     }
732 
733     friend Vec operator+(const ElementType &a, const Vec &b) {
734         Vec r(empty);
735         for (size_t i = 0; i < Lanes; i++) {
736             r.elements[i] = a + b[i];
737         }
738         return r;
739     }
740     friend Vec operator-(const ElementType &a, const Vec &b) {
741         Vec r(empty);
742         for (size_t i = 0; i < Lanes; i++) {
743             r.elements[i] = a - b[i];
744         }
745         return r;
746     }
747     friend Vec operator*(const ElementType &a, const Vec &b) {
748         Vec r(empty);
749         for (size_t i = 0; i < Lanes; i++) {
750             r.elements[i] = a * b[i];
751         }
752         return r;
753     }
754     friend Vec operator/(const ElementType &a, const Vec &b) {
755         Vec r(empty);
756         for (size_t i = 0; i < Lanes; i++) {
757             r.elements[i] = a / b[i];
758         }
759         return r;
760     }
761     friend Vec operator%(const ElementType &a, const Vec &b) {
762         Vec r(empty);
763         for (size_t i = 0; i < Lanes; i++) {
764             r.elements[i] = a % b[i];
765         }
766         return r;
767     }
768     friend Vec operator>>(const ElementType &a, const Vec &b) {
769         Vec r(empty);
770         for (size_t i = 0; i < Lanes; i++) {
771             r.elements[i] = a >> b[i];
772         }
773         return r;
774     }
775     friend Vec operator<<(const ElementType &a, const Vec &b) {
776         Vec r(empty);
777         for (size_t i = 0; i < Lanes; i++) {
778             r.elements[i] = a << b[i];
779         }
780         return r;
781     }
782     friend Vec operator&(const ElementType &a, const Vec &b) {
783         Vec r(empty);
784         for (size_t i = 0; i < Lanes; i++) {
785             r.elements[i] = a & b[i];
786         }
787         return r;
788     }
789     friend Vec operator|(const ElementType &a, const Vec &b) {
790         Vec r(empty);
791         for (size_t i = 0; i < Lanes; i++) {
792             r.elements[i] = a | b[i];
793         }
794         return r;
795     }
796     friend Vec operator&&(const ElementType &a, const Vec &b) {
797         Vec r(empty);
798         for (size_t i = 0; i < Lanes; i++) {
799             r.elements[i] = a && b[i];
800         }
801         return r;
802     }
803     friend Vec operator||(const ElementType &a, const Vec &b) {
804         Vec r(empty);
805         for (size_t i = 0; i < Lanes; i++) {
806             r.elements[i] = a || b[i];
807         }
808         return r;
809     }
810 
811     friend Mask operator<(const Vec &a, const Vec &b) {
812         Mask r;
813         for (size_t i = 0; i < Lanes; i++) {
814             r.elements[i] = a[i] < b[i] ? 0xff : 0x00;
815         }
816         return r;
817     }
818 
819     friend Mask operator<=(const Vec &a, const Vec &b) {
820         Mask r;
821         for (size_t i = 0; i < Lanes; i++) {
822             r.elements[i] = a[i] <= b[i] ? 0xff : 0x00;
823         }
824         return r;
825     }
826 
827     friend Mask operator>(const Vec &a, const Vec &b) {
828         Mask r;
829         for (size_t i = 0; i < Lanes; i++) {
830             r.elements[i] = a[i] > b[i] ? 0xff : 0x00;
831         }
832         return r;
833     }
834 
835     friend Mask operator>=(const Vec &a, const Vec &b) {
836         Mask r;
837         for (size_t i = 0; i < Lanes; i++) {
838             r.elements[i] = a[i] >= b[i] ? 0xff : 0x00;
839         }
840         return r;
841     }
842 
843     friend Mask operator==(const Vec &a, const Vec &b) {
844         Mask r;
845         for (size_t i = 0; i < Lanes; i++) {
846             r.elements[i] = a[i] == b[i] ? 0xff : 0x00;
847         }
848         return r;
849     }
850 
851     friend Mask operator!=(const Vec &a, const Vec &b) {
852         Mask r;
853         for (size_t i = 0; i < Lanes; i++) {
854             r.elements[i] = a[i] != b[i] ? 0xff : 0x00;
855         }
856         return r;
857     }
858 
859     static Vec select(const Mask &cond, const Vec &true_value, const Vec &false_value) {
860         Vec r(empty);
861         for (size_t i = 0; i < Lanes; i++) {
862             r.elements[i] = cond[i] ? true_value[i] : false_value[i];
863         }
864         return r;
865     }
866 
867     template <typename OtherVec>
868     static Vec convert_from(const OtherVec &src) {
869         #if __cplusplus >= 201103L
870         static_assert(Vec::Lanes == OtherVec::Lanes, "Lanes mismatch");
871         #endif
872         Vec r(empty);
873         for (size_t i = 0; i < Lanes; i++) {
874             r.elements[i] = static_cast<typename Vec::ElementType>(src[i]);
875         }
876         return r;
877     }
878 
879     static Vec max(const Vec &a, const Vec &b) {
880         Vec r(empty);
881         for (size_t i = 0; i < Lanes; i++) {
882             r.elements[i] = ::halide_cpp_max(a[i], b[i]);
883         }
884         return r;
885     }
886 
887     static Vec min(const Vec &a, const Vec &b) {
888         Vec r(empty);
889         for (size_t i = 0; i < Lanes; i++) {
890             r.elements[i] = ::halide_cpp_min(a[i], b[i]);
891         }
892         return r;
893     }
894 
895 private:
896     template <typename, size_t> friend class CppVector;
897     ElementType elements[Lanes];
898 
899     // Leave vector uninitialized for cases where we overwrite every entry
900     enum Empty { empty };
901     CppVector(Empty) {}
902 };
903 
904 )INLINE_CODE";
905 
906         const char *native_vector_decl = R"INLINE_CODE(
907 #if __has_attribute(ext_vector_type) || __has_attribute(vector_size)
908 template <typename ElementType_, size_t Lanes_>
909 class NativeVector {
910 public:
911     typedef ElementType_ ElementType;
912     static const size_t Lanes = Lanes_;
913     typedef NativeVector<ElementType, Lanes> Vec;
914     typedef NativeVector<uint8_t, Lanes> Mask;
915 
916 #if __has_attribute(ext_vector_type)
917     typedef ElementType_ NativeVectorType __attribute__((ext_vector_type(Lanes), aligned(sizeof(ElementType))));
918 #elif __has_attribute(vector_size) || __GNUC__
919     typedef ElementType_ NativeVectorType __attribute__((vector_size(Lanes * sizeof(ElementType)), aligned(sizeof(ElementType))));
920 #endif
921 
922     NativeVector &operator=(const Vec &src) {
923         if (this != &src) {
924             native_vector = src.native_vector;
925         }
926         return *this;
927     }
928 
929     /* not-explicit */ NativeVector(const Vec &src) {
930         native_vector = src.native_vector;
931     }
932 
933     NativeVector() {
934         native_vector = (NativeVectorType){};
935     }
936 
937     static Vec broadcast(const ElementType &v) {
938         Vec zero; // Zero-initialized native vector.
939         return zero + v;
940     }
941 
942     // TODO: this should be improved by taking advantage of native operator support.
943     static Vec ramp(const ElementType &base, const ElementType &stride) {
944         Vec r(empty);
945         for (size_t i = 0; i < Lanes; i++) {
946             r.native_vector[i] = base + stride * i;
947         }
948         return r;
949     }
950 
951     // TODO: could this be improved by taking advantage of native operator support?
952     static Vec load(const void *base, int32_t offset) {
953         Vec r(empty);
954         // Note: do not use sizeof(NativeVectorType) here; if it's an unusual type
955         // (e.g. uint8x48, which could be produced by concat()), the actual implementation
956         // might be larger (e.g. it might really be a uint8x64). Only copy the amount
957         // that is in the logical type, to avoid possible overreads.
958         memcpy(&r.native_vector, ((const ElementType*)base + offset), sizeof(ElementType) * Lanes);
959         return r;
960     }
961 
962     // gather
963     // TODO: could this be improved by taking advantage of native operator support?
964     static Vec load(const void *base, const NativeVector<int32_t, Lanes> &offset) {
965         Vec r(empty);
966         for (size_t i = 0; i < Lanes; i++) {
967             r.native_vector[i] = ((const ElementType*)base)[offset[i]];
968         }
969         return r;
970     }
971 
972     // TODO: could this be improved by taking advantage of native operator support?
973     void store(void *base, int32_t offset) const {
974         // Note: do not use sizeof(NativeVectorType) here; if it's an unusual type
975         // (e.g. uint8x48, which could be produced by concat()), the actual implementation
976         // might be larger (e.g. it might really be a uint8x64). Only copy the amount
977         // that is in the logical type, to avoid possible overwrites.
978         memcpy(((ElementType*)base + offset), &native_vector, sizeof(ElementType) * Lanes);
979     }
980 
981     // scatter
982     // TODO: could this be improved by taking advantage of native operator support?
983     void store(void *base, const NativeVector<int32_t, Lanes> &offset) const {
984         for (size_t i = 0; i < Lanes; i++) {
985             ((ElementType*)base)[offset[i]] = native_vector[i];
986         }
987     }
988 
989     // TODO: this should be improved by taking advantage of native operator support.
990     static Vec shuffle(const Vec &a, const int32_t indices[Lanes]) {
991         Vec r(empty);
992         for (size_t i = 0; i < Lanes; i++) {
993             if (indices[i] < 0) {
994                 continue;
995             }
996             r.native_vector[i] = a[indices[i]];
997         }
998         return r;
999     }
1000 
1001     // TODO: this should be improved by taking advantage of native operator support.
1002     template<size_t InputLanes>
1003     static Vec concat(size_t count, const NativeVector<ElementType, InputLanes> vecs[]) {
1004         Vec r(empty);
1005         for (size_t i = 0; i < Lanes; i++) {
1006             r.native_vector[i] = vecs[i / InputLanes][i % InputLanes];
1007         }
1008         return r;
1009     }
1010 
1011     // TODO: this should be improved by taking advantage of native operator support.
1012     Vec replace(size_t i, const ElementType &b) const {
1013         Vec r = *this;
1014         r.native_vector[i] = b;
1015         return r;
1016     }
1017 
1018     ElementType operator[](size_t i) const {
1019         return native_vector[i];
1020     }
1021 
1022     Vec operator~() const {
1023         return Vec(from_native_vector, ~native_vector);
1024     }
1025     Vec operator!() const {
1026         Vec r(empty);
1027         for (size_t i = 0; i < Lanes; i++) {
1028             r.native_vector[i] = !(*this)[i];
1029         }
1030         return r;
1031     }
1032 
1033     friend Vec operator+(const Vec &a, const Vec &b) {
1034         return Vec(from_native_vector, a.native_vector + b.native_vector);
1035     }
1036     friend Vec operator-(const Vec &a, const Vec &b) {
1037         return Vec(from_native_vector, a.native_vector - b.native_vector);
1038     }
1039     friend Vec operator*(const Vec &a, const Vec &b) {
1040         return Vec(from_native_vector, a.native_vector * b.native_vector);
1041     }
1042     friend Vec operator/(const Vec &a, const Vec &b) {
1043         return Vec(from_native_vector, a.native_vector / b.native_vector);
1044     }
1045     friend Vec operator%(const Vec &a, const Vec &b) {
1046         return Vec(from_native_vector, a.native_vector % b.native_vector);
1047     }
1048     friend Vec operator&(const Vec &a, const Vec &b) {
1049         return Vec(from_native_vector, a.native_vector & b.native_vector);
1050     }
1051     friend Vec operator|(const Vec &a, const Vec &b) {
1052         return Vec(from_native_vector, a.native_vector | b.native_vector);
1053     }
1054     friend Vec operator&&(const Vec &a, const Vec &b) {
1055         Vec r(empty);
1056         for (size_t i = 0; i < Lanes; i++) {
1057             r.native_vector[i] = a.native_vector[i] && b.native_vector[i];
1058         }
1059         return r;
1060     }
1061     friend Vec operator||(const Vec &a, const Vec &b) {
1062         Vec r(empty);
1063         for (size_t i = 0; i < Lanes; i++) {
1064             r.native_vector[i] = a.native_vector[i] || b.native_vector[i];
1065         }
1066         return r;
1067     }
1068 
1069     friend Vec operator+(const Vec &a, const ElementType &b) {
1070         return Vec(from_native_vector, a.native_vector + b);
1071     }
1072     friend Vec operator-(const Vec &a, const ElementType &b) {
1073         return Vec(from_native_vector, a.native_vector - b);
1074     }
1075     friend Vec operator*(const Vec &a, const ElementType &b) {
1076         return Vec(from_native_vector, a.native_vector * b);
1077     }
1078     friend Vec operator/(const Vec &a, const ElementType &b) {
1079         return Vec(from_native_vector, a.native_vector / b);
1080     }
1081     friend Vec operator%(const Vec &a, const ElementType &b) {
1082         return Vec(from_native_vector, a.native_vector % b);
1083     }
1084     friend Vec operator<<(const Vec &a, const ElementType &b) {
1085         return Vec(from_native_vector, a.native_vector << b);
1086     }
1087     friend Vec operator>>(const Vec &a, const ElementType &b) {
1088         return Vec(from_native_vector, a.native_vector >> b);
1089     }
1090     friend Vec operator&(const Vec &a, const ElementType &b) {
1091         return Vec(from_native_vector, a.native_vector & b);
1092     }
1093     friend Vec operator|(const Vec &a, const ElementType &b) {
1094         return Vec(from_native_vector, a.native_vector | b);
1095     }
1096     friend Vec operator&&(const Vec &a, const ElementType &b) {
1097         Vec r(empty);
1098         for (size_t i = 0; i < Lanes; i++) {
1099             r.native_vector[i] = a.native_vector[i] && b;
1100         }
1101         return r;
1102     }
1103     friend Vec operator||(const Vec &a, const ElementType &b) {
1104         Vec r(empty);
1105         for (size_t i = 0; i < Lanes; i++) {
1106             r.native_vector[i] = a.native_vector[i] || b;
1107         }
1108         return r;
1109     }
1110 
1111     friend Vec operator+(const ElementType &a, const Vec &b) {
1112         return Vec(from_native_vector, a + b.native_vector);
1113     }
1114     friend Vec operator-(const ElementType &a, const Vec &b) {
1115         return Vec(from_native_vector, a - b.native_vector);
1116     }
1117     friend Vec operator*(const ElementType &a, const Vec &b) {
1118         return Vec(from_native_vector, a * b.native_vector);
1119     }
1120     friend Vec operator/(const ElementType &a, const Vec &b) {
1121         return Vec(from_native_vector, a / b.native_vector);
1122     }
1123     friend Vec operator%(const ElementType &a, const Vec &b) {
1124         return Vec(from_native_vector, a % b.native_vector);
1125     }
1126     friend Vec operator<<(const ElementType &a, const Vec &b) {
1127         return Vec(from_native_vector, a << b.native_vector);
1128     }
1129     friend Vec operator>>(const ElementType &a, const Vec &b) {
1130         return Vec(from_native_vector, a >> b.native_vector);
1131     }
1132     friend Vec operator&(const ElementType &a, const Vec &b) {
1133         return Vec(from_native_vector, a & b.native_vector);
1134     }
1135     friend Vec operator|(const ElementType &a, const Vec &b) {
1136         return Vec(from_native_vector, a | b.native_vector);
1137     }
1138     friend Vec operator&&(const ElementType &a, const Vec &b) {
1139         Vec r(empty);
1140         for (size_t i = 0; i < Lanes; i++) {
1141             r.native_vector[i] = a && b.native_vector[i];
1142         }
1143         return r;
1144     }
1145     friend Vec operator||(const ElementType &a, const Vec &b) {
1146         Vec r(empty);
1147         for (size_t i = 0; i < Lanes; i++) {
1148             r.native_vector[i] = a || b.native_vector[i];
1149         }
1150         return r;
1151     }
1152 
1153     // TODO: this should be improved by taking advantage of native operator support.
1154     friend Mask operator<(const Vec &a, const Vec &b) {
1155         Mask r;
1156         for (size_t i = 0; i < Lanes; i++) {
1157             r.native_vector[i] = a[i] < b[i] ? 0xff : 0x00;
1158         }
1159         return r;
1160     }
1161 
1162     // TODO: this should be improved by taking advantage of native operator support.
1163     friend Mask operator<=(const Vec &a, const Vec &b) {
1164         Mask r;
1165         for (size_t i = 0; i < Lanes; i++) {
1166             r.native_vector[i] = a[i] <= b[i] ? 0xff : 0x00;
1167         }
1168         return r;
1169     }
1170 
1171     // TODO: this should be improved by taking advantage of native operator support.
1172     friend Mask operator>(const Vec &a, const Vec &b) {
1173         Mask r;
1174         for (size_t i = 0; i < Lanes; i++) {
1175             r.native_vector[i] = a[i] > b[i] ? 0xff : 0x00;
1176         }
1177         return r;
1178     }
1179 
1180     // TODO: this should be improved by taking advantage of native operator support.
1181     friend Mask operator>=(const Vec &a, const Vec &b) {
1182         Mask r;
1183         for (size_t i = 0; i < Lanes; i++) {
1184             r.native_vector[i] = a[i] >= b[i] ? 0xff : 0x00;
1185         }
1186         return r;
1187     }
1188 
1189     // TODO: this should be improved by taking advantage of native operator support.
1190     friend Mask operator==(const Vec &a, const Vec &b) {
1191         Mask r;
1192         for (size_t i = 0; i < Lanes; i++) {
1193             r.native_vector[i] = a[i] == b[i] ? 0xff : 0x00;
1194         }
1195         return r;
1196     }
1197 
1198     // TODO: this should be improved by taking advantage of native operator support.
1199     friend Mask operator!=(const Vec &a, const Vec &b) {
1200         Mask r;
1201         for (size_t i = 0; i < Lanes; i++) {
1202             r.native_vector[i] = a[i] != b[i] ? 0xff : 0x00;
1203         }
1204         return r;
1205     }
1206 
1207     // TODO: this should be improved by taking advantage of native operator support.
1208     static Vec select(const Mask &cond, const Vec &true_value, const Vec &false_value) {
1209         Vec r(empty);
1210         for (size_t i = 0; i < Lanes; i++) {
1211             r.native_vector[i] = cond[i] ? true_value[i] : false_value[i];
1212         }
1213         return r;
1214     }
1215 
1216     template <typename OtherVec>
1217     static Vec convert_from(const OtherVec &src) {
1218         #if __cplusplus >= 201103L
1219         static_assert(Vec::Lanes == OtherVec::Lanes, "Lanes mismatch");
1220         #endif
1221 #if 0 // __has_builtin(__builtin_convertvector)
1222         // Disabled (for now) because __builtin_convertvector appears to have
1223         // different float->int rounding behavior in at least some situations;
1224         // for now we'll use the much-slower-but-correct explicit C++ code.
1225         // (https://github.com/halide/Halide/issues/2080)
1226         return Vec(from_native_vector, __builtin_convertvector(src.native_vector, NativeVectorType));
1227 #else
1228         Vec r(empty);
1229         for (size_t i = 0; i < Lanes; i++) {
1230             r.native_vector[i] = static_cast<typename Vec::ElementType>(src.native_vector[i]);
1231         }
1232         return r;
1233 #endif
1234     }
1235 
1236     // TODO: this should be improved by taking advantage of native operator support.
1237     static Vec max(const Vec &a, const Vec &b) {
1238         Vec r(empty);
1239         for (size_t i = 0; i < Lanes; i++) {
1240             r.native_vector[i] = ::halide_cpp_max(a[i], b[i]);
1241         }
1242         return r;
1243     }
1244 
1245     // TODO: this should be improved by taking advantage of native operator support.
1246     static Vec min(const Vec &a, const Vec &b) {
1247         Vec r(empty);
1248         for (size_t i = 0; i < Lanes; i++) {
1249             r.native_vector[i] = ::halide_cpp_min(a[i], b[i]);
1250         }
1251         return r;
1252     }
1253 
1254 private:
1255     template<typename, size_t> friend class NativeVector;
1256 
1257     template <typename ElementType, typename OtherElementType, size_t Lanes>
1258     friend NativeVector<ElementType, Lanes> operator<<(
1259                     const NativeVector<ElementType, Lanes> &a,
1260                     const NativeVector<OtherElementType, Lanes> &b);
1261 
1262     template <typename ElementType, typename OtherElementType, size_t Lanes>
1263     friend NativeVector<ElementType, Lanes> operator>>(
1264                     const NativeVector<ElementType, Lanes> &a,
1265                     const NativeVector<OtherElementType, Lanes> &b);
1266 
1267     NativeVectorType native_vector;
1268 
1269     // Leave vector uninitialized for cases where we overwrite every entry
1270     enum Empty { empty };
1271     inline NativeVector(Empty) {}
1272 
1273     // Syntactic sugar to avoid ctor overloading issues
1274     enum FromNativeVector { from_native_vector };
1275     inline NativeVector(FromNativeVector, const NativeVectorType &src) {
1276         native_vector = src;
1277     }
1278 };
1279 
1280 template <typename ElementType, typename OtherElementType, size_t Lanes>
1281 NativeVector<ElementType, Lanes> operator<<(const NativeVector<ElementType, Lanes> &a,
1282                     const NativeVector<OtherElementType, Lanes> &b) {
1283     return NativeVector<ElementType, Lanes>(
1284                   NativeVector<ElementType, Lanes>::from_native_vector,
1285                   a.native_vector << b.native_vector);
1286 }
1287 
1288 template <typename ElementType, typename OtherElementType, size_t Lanes>
1289 NativeVector<ElementType, Lanes> operator>>(const NativeVector<ElementType, Lanes> &a,
1290                     const NativeVector<OtherElementType, Lanes> &b) {
1291     return NativeVector<ElementType, Lanes>(
1292                   NativeVector<ElementType, Lanes>::from_native_vector,
1293                   a.native_vector >> b.native_vector);
1294 }
1295 #endif  // __has_attribute(ext_vector_type) || __has_attribute(vector_size)
1296 
1297 )INLINE_CODE";
1298 
        // Runtime preamble that picks the vector implementation: defines the
        // halide_cpp_use_native_vector(type, lanes) macro used by the typedefs
        // emitted below it. NOTE: this text is emitted verbatim into generated
        // C++ output — it is data, not code compiled into Halide itself.
        const char *vector_selection_decl = R"INLINE_CODE(
// Dec. 1, 2018: Apparently emscripten compilation runs with the __has_attribute true,
// then fails to handle the vector intrinsics later.
#if !defined(__EMSCRIPTEN__) && (__has_attribute(ext_vector_type) || __has_attribute(vector_size))
    #if __GNUC__ && !__clang__
        // GCC only allows powers-of-two; fall back to CppVector for other widths
        #define halide_cpp_use_native_vector(type, lanes) ((lanes & (lanes - 1)) == 0)
    #else
        #define halide_cpp_use_native_vector(type, lanes) (true)
    #endif
#else
    // No NativeVector available
    #define halide_cpp_use_native_vector(type, lanes) (false)
#endif  // __has_attribute(ext_vector_type) || __has_attribute(vector_size)

// Failsafe to allow forcing non-native vectors in case of unruly compilers
#if HALIDE_CPP_ALWAYS_USE_CPP_VECTORS
    #undef halide_cpp_use_native_vector
    #define halide_cpp_use_native_vector(type, lanes) (false)
#endif

)INLINE_CODE";
1321 
1322         // Vodoo fix: on at least one config (our arm32 buildbot running gcc 5.4),
1323         // emitting this long text string was regularly garbled in a predictable pattern;
1324         // flushing the stream before or after heals it. Since C++ codegen is rarely
1325         // on a compilation critical path, we'll just band-aid it in this way.
1326         stream << std::flush;
1327         stream << cpp_vector_decl << native_vector_decl << vector_selection_decl;
1328         stream << std::flush;
1329 
1330         for (const auto &t : vector_types) {
1331             string name = type_to_c_type(t, false, false);
1332             string scalar_name = type_to_c_type(t.element_of(), false, false);
1333             stream << "#if halide_cpp_use_native_vector(" << scalar_name << ", " << t.lanes() << ")\n";
1334             stream << "typedef NativeVector<" << scalar_name << ", " << t.lanes() << "> " << name << ";\n";
1335             // Useful for debugging which Vector implementation is being selected
1336             // stream << "#pragma message \"using NativeVector for " << t << "\"\n";
1337             stream << "#else\n";
1338             stream << "typedef CppVector<" << scalar_name << ", " << t.lanes() << "> " << name << ";\n";
1339             // Useful for debugging which Vector implementation is being selected
1340             // stream << "#pragma message \"using CppVector for " << t << "\"\n";
1341             stream << "#endif\n";
1342         }
1343     }
1344 }
1345 
set_name_mangling_mode(NameMangling mode)1346 void CodeGen_C::set_name_mangling_mode(NameMangling mode) {
1347     if (extern_c_open && mode != NameMangling::C) {
1348         stream << "\n#ifdef __cplusplus\n";
1349         stream << "}  // extern \"C\"\n";
1350         stream << "#endif\n\n";
1351         extern_c_open = false;
1352     } else if (!extern_c_open && mode == NameMangling::C) {
1353         stream << "\n#ifdef __cplusplus\n";
1354         stream << "extern \"C\" {\n";
1355         stream << "#endif\n\n";
1356         extern_c_open = true;
1357     }
1358 }
1359 
print_type(Type type,AppendSpaceIfNeeded space_option)1360 string CodeGen_C::print_type(Type type, AppendSpaceIfNeeded space_option) {
1361     return type_to_c_type(type, space_option == AppendSpace);
1362 }
1363 
print_reinterpret(Type type,const Expr & e)1364 string CodeGen_C::print_reinterpret(Type type, const Expr &e) {
1365     ostringstream oss;
1366     if (type.is_handle() || e.type().is_handle()) {
1367         // Use a c-style cast if either src or dest is a handle --
1368         // note that although Halide declares a "Handle" to always be 64 bits,
1369         // the source "handle" might actually be a 32-bit pointer (from
1370         // a function parameter), so calling reinterpret<> (which just memcpy's)
1371         // would be garbage-producing.
1372         oss << "(" << print_type(type) << ")";
1373     } else {
1374         oss << "reinterpret<" << print_type(type) << ">";
1375     }
1376     oss << "(" << print_expr(e) << ")";
1377     return oss.str();
1378 }
1379 
// Sanitize a Halide symbol name into a valid C identifier by delegating to
// the shared c_print_name helper.
string CodeGen_C::print_name(const string &name) {
    return c_print_name(name);
}
1383 
namespace {
// IR visitor that collects every extern function a module references, so
// prototypes for them can be emitted before the generated function bodies.
// C-linkage and C++-mangled (namespace-qualified) externs are tracked
// separately; functions used as destructors (via register_destructor or an
// Allocate's free_function) are recorded too, since they need
// `void(void *, void *)` prototypes.
class ExternCallPrototypes : public IRGraphVisitor {
    // Node in a trie keyed by C++ namespace components: a subnamespace when
    // `call` is null (children live in `names`), otherwise a leaf holding
    // the extern Call itself.
    struct NamespaceOrCall {
        const Call *call;  // nullptr if this is a subnamespace
        std::map<string, NamespaceOrCall> names;
        NamespaceOrCall(const Call *call = nullptr)
            : call(call) {
        }
    };
    // C++-mangled externs, organized by namespace path.
    std::map<string, NamespaceOrCall> c_plus_plus_externs;
    // C-linkage externs, by name.
    std::map<string, const Call *> c_externs;
    // Names already recorded, or pre-declared by the runtime headers.
    std::set<std::string> processed;
    // Functions that should be declared `static` (internal linkage).
    std::set<std::string> internal_linkage;
    // Functions invoked as destructors.
    std::set<std::string> destructors;

    using IRGraphVisitor::visit;

    void visit(const Call *op) override {
        IRGraphVisitor::visit(op);

        if (!processed.count(op->name)) {
            if (op->call_type == Call::Extern || op->call_type == Call::PureExtern) {
                c_externs.insert({op->name, op});
            } else if (op->call_type == Call::ExternCPlusPlus) {
                // Split the fully-qualified name into namespace components
                // and walk (building as needed) the trie down to the leaf.
                std::vector<std::string> namespaces;
                std::string name = extract_namespaces(op->name, namespaces);
                std::map<string, NamespaceOrCall> *namespace_map = &c_plus_plus_externs;
                for (const auto &ns : namespaces) {
                    auto insertion = namespace_map->insert({ns, NamespaceOrCall()});
                    namespace_map = &insertion.first->second.names;
                }
                namespace_map->insert({name, NamespaceOrCall(op)});
            }
            processed.insert(op->name);
        }

        if (op->is_intrinsic(Call::register_destructor)) {
            // First argument names the destructor function as a string.
            internal_assert(op->args.size() == 2);
            const StringImm *fn = op->args[0].as<StringImm>();
            internal_assert(fn);
            destructors.insert(fn->value);
        }
    }

    void visit(const Allocate *op) override {
        IRGraphVisitor::visit(op);
        if (!op->free_function.empty()) {
            // A custom free function is invoked like a destructor.
            destructors.insert(op->free_function);
        }
    }

    // Emit a single prototype for `op` under the (possibly namespace-local)
    // name `name`, prefixing `static` for internal-linkage functions.
    void emit_function_decl(ostream &stream, const Call *op, const std::string &name) const {
        // op->name (rather than the name arg) since we need the fully-qualified C++ name
        if (internal_linkage.count(op->name)) {
            stream << "static ";
        }
        stream << type_to_c_type(op->type, /* append_space */ true) << name << "(";
        if (function_takes_user_context(name)) {
            // These functions take an implicit leading `void *user_context`.
            stream << "void *";
            if (!op->args.empty()) {
                stream << ", ";
            }
        }
        for (size_t i = 0; i < op->args.size(); i++) {
            if (i > 0) {
                stream << ", ";
            }
            if (op->args[i].as<StringImm>()) {
                stream << "const char *";
            } else {
                stream << type_to_c_type(op->args[i].type(), true);
            }
        }
        stream << ");\n";
    }

    // Recursively emit a namespace subtree: interior nodes open/close
    // `namespace` blocks, leaves become function prototypes.
    void emit_namespace_or_call(ostream &stream, const NamespaceOrCall &ns_or_call, const std::string &name) const {
        if (ns_or_call.call == nullptr) {
            stream << "namespace " << name << " {\n";
            for (const auto &ns_or_call_inner : ns_or_call.names) {
                emit_namespace_or_call(stream, ns_or_call_inner.second, ns_or_call_inner.first);
            }
            stream << "} // namespace " << name << "\n";
        } else {
            emit_function_decl(stream, ns_or_call.call, name);
        }
    }

public:
    ExternCallPrototypes() {
        // Make sure we don't catch calls that are already in the global declarations
        const char *strs[] = {globals.c_str(),
                              (const char *)halide_internal_runtime_header_HalideRuntime_h,
                              (const char *)halide_internal_initmod_inlined_c};
        for (const char *str : strs) {
            // Scan each blob for identifier-like tokens immediately followed
            // by '(' and mark them as already-declared function names.
            // `j` trails at the last non-identifier character seen.
            size_t j = 0;
            for (size_t i = 0; str[i]; i++) {
                char c = str[i];
                if (c == '(' && i > j + 1) {
                    // Could be the end of a function_name.
                    string name(str + j + 1, i - j - 1);
                    processed.insert(name);
                }

                if (('A' <= c && c <= 'Z') ||
                    ('a' <= c && c <= 'z') ||
                    c == '_' ||
                    ('0' <= c && c <= '9')) {
                    // Could be part of a function name.
                } else {
                    j = i;
                }
            }
        }
    }

    // Mark `name` as internal-linkage so its prototype is emitted `static`.
    void set_internal_linkage(const std::string &name) {
        internal_linkage.insert(name);
    }

    bool has_c_declarations() const {
        return !c_externs.empty();
    }

    bool has_c_plus_plus_declarations() const {
        return !c_plus_plus_externs.empty();
    }

    // Emit prototypes for all C-linkage externs, then destructor prototypes.
    void emit_c_declarations(ostream &stream) const {
        for (const auto &call : c_externs) {
            emit_function_decl(stream, call.second, call.first);
        }
        for (const auto &d : destructors) {
            stream << "void " << d << "(void *, void *);\n";
        }
        stream << "\n";
    }

    // Emit prototypes for all C++-mangled externs, nested in namespaces.
    void emit_c_plus_plus_declarations(ostream &stream) const {
        for (const auto &ns_or_call : c_plus_plus_externs) {
            emit_namespace_or_call(stream, ns_or_call.second, ns_or_call.first);
        }
        stream << "\n";
    }
};
}  // namespace
1530 
forward_declare_type_if_needed(const Type & t)1531 void CodeGen_C::forward_declare_type_if_needed(const Type &t) {
1532     if (!t.handle_type ||
1533         forward_declared.count(t.handle_type) ||
1534         t.handle_type->inner_name.cpp_type_type == halide_cplusplus_type_name::Simple) {
1535         return;
1536     }
1537     for (auto &ns : t.handle_type->namespaces) {
1538         stream << "namespace " << ns << " { ";
1539     }
1540     switch (t.handle_type->inner_name.cpp_type_type) {
1541     case halide_cplusplus_type_name::Simple:
1542         // nothing
1543         break;
1544     case halide_cplusplus_type_name::Struct:
1545         stream << "struct " << t.handle_type->inner_name.name << ";";
1546         break;
1547     case halide_cplusplus_type_name::Class:
1548         stream << "class " << t.handle_type->inner_name.name << ";";
1549         break;
1550     case halide_cplusplus_type_name::Union:
1551         stream << "union " << t.handle_type->inner_name.name << ";";
1552         break;
1553     case halide_cplusplus_type_name::Enum:
1554         internal_error << "Passing pointers to enums is unsupported\n";
1555         break;
1556     }
1557     for (auto &ns : t.handle_type->namespaces) {
1558         (void)ns;
1559         stream << " }";
1560     }
1561     stream << "\n";
1562     forward_declared.insert(t.handle_type);
1563 }
1564 
void CodeGen_C::compile(const Module &input) {
    // Top-level entry point: emits the whole module to the output stream in
    // dependency order — forward type declarations, external code blobs,
    // vector typedefs, extern prototypes, then buffers and function bodies.

    // Scan every function body to learn which vector types and loop kinds
    // the module uses; this drives the typedefs and GPU handling below.
    TypeInfoGatherer type_info;
    for (const auto &f : input.functions()) {
        if (f.body.defined()) {
            f.body.accept(&type_info);
        }
    }
    uses_gpu_for_loops = (type_info.for_types_used.count(ForType::GPUBlock) ||
                          type_info.for_types_used.count(ForType::GPUThread) ||
                          type_info.for_types_used.count(ForType::GPULane));

    // Forward-declare all the types we need; this needs to happen before
    // we emit function prototypes, since those may need the types.
    stream << "\n";
    for (const auto &f : input.functions()) {
        for (auto &arg : f.args) {
            forward_declare_type_if_needed(arg.type);
        }
    }
    stream << "\n";

    if (!is_header_or_extern_decl()) {
        // Emit any external-code blobs that are C++.
        for (const ExternalCode &code_blob : input.external_code()) {
            if (code_blob.is_c_plus_plus_source()) {
                stream << "\n";
                stream << "// Begin External Code: " << code_blob.name() << "\n";
                stream.write((const char *)code_blob.contents().data(), code_blob.contents().size());
                stream << "\n";
                stream << "// End External Code: " << code_blob.name() << "\n";
                stream << "\n";
            }
        }

        add_vector_typedefs(type_info.vector_types_used);

        // Emit prototypes for all external and internal-only functions.
        // Gather them up and do them all up front, to reduce duplicates,
        // and to make it simpler to get internal-linkage functions correct.
        ExternCallPrototypes e;
        for (const auto &f : input.functions()) {
            f.body.accept(&e);
            if (f.linkage == LinkageType::Internal) {
                // We can't tell at the call site if a LoweredFunc is intended to be internal
                // or not, so mark them explicitly.
                e.set_internal_linkage(f.name);
            }
        }

        // C++-mangled prototypes first, then C-linkage ones; the mangling
        // mode toggles the surrounding extern "C" block as needed.
        if (e.has_c_plus_plus_declarations()) {
            set_name_mangling_mode(NameMangling::CPlusPlus);
            e.emit_c_plus_plus_declarations(stream);
        }

        if (e.has_c_declarations()) {
            set_name_mangling_mode(NameMangling::C);
            e.emit_c_declarations(stream);
        }
    }

    // Finally, emit the buffers and then the functions themselves.
    for (const auto &b : input.buffers()) {
        compile(b);
    }
    for (const auto &f : input.functions()) {
        compile(f);
    }
}
1632 
// Compile one LoweredFunc: emit its prototype (and, for headers, the
// argv/metadata declarations), or its full definition when generating an
// implementation. Namespace wrappers are emitted when the target uses
// C++ name mangling.
void CodeGen_C::compile(const LoweredFunc &f) {
    // Don't put non-external function declarations in headers.
    if (is_header_or_extern_decl() && f.linkage == LinkageType::Internal) {
        return;
    }

    const std::vector<LoweredArgument> &args = f.args;

    // Detect the implicit user-context parameter by name.
    have_user_context = false;
    for (size_t i = 0; i < args.size(); i++) {
        // TODO: check that its type is void *?
        have_user_context |= (args[i].name == "__user_context");
    }

    // Resolve Default mangling from the target's CPlusPlusMangling feature.
    NameMangling name_mangling = f.name_mangling;
    if (name_mangling == NameMangling::Default) {
        name_mangling = (target.has_feature(Target::CPlusPlusMangling) ? NameMangling::CPlusPlus : NameMangling::C);
    }

    set_name_mangling_mode(name_mangling);

    // Split "ns1::ns2::name" into namespaces + simple name.
    std::vector<std::string> namespaces;
    std::string simple_name = extract_namespaces(f.name, namespaces);
    if (!is_c_plus_plus_interface()) {
        user_assert(namespaces.empty()) << "Namespace qualifiers not allowed on function name if not compiling with Target::CPlusPlusNameMangling.\n";
    }

    if (!namespaces.empty()) {
        for (const auto &ns : namespaces) {
            stream << "namespace " << ns << " {\n";
        }
        stream << "\n";
    }

    // Emit the function prototype
    if (f.linkage == LinkageType::Internal) {
        // If the function isn't public, mark it static.
        stream << "static ";
    }
    stream << "HALIDE_FUNCTION_ATTRS\n";
    stream << "int " << simple_name << "(";
    for (size_t i = 0; i < args.size(); i++) {
        // Buffer arguments are passed as halide_buffer_t*; scalars by value.
        if (args[i].is_buffer()) {
            stream << "struct halide_buffer_t *"
                   << print_name(args[i].name)
                   << "_buffer";
        } else {
            stream << print_type(args[i].type, AppendSpace)
                   << print_name(args[i].name);
        }

        if (i < args.size() - 1) stream << ", ";
    }

    if (is_header_or_extern_decl()) {
        // Declaration only.
        stream << ");\n";
    } else {
        stream << ") {\n";
        indent += 1;

        if (uses_gpu_for_loops) {
            // The C backend can't express GPU loops; emit a stub that
            // reports an error at runtime.
            // NOTE(review): halide_error_code_device_malloc_failed is an
            // odd error code for "unsupported feature" — confirm intended.
            stream << get_indent() << "halide_error("
                   << (have_user_context ? "__user_context_" : "nullptr")
                   << ", \"C++ Backend does not support gpu_blocks() or gpu_threads() yet, "
                   << "this function will always fail at runtime\");\n";
            stream << get_indent() << "return halide_error_code_device_malloc_failed;\n";
        } else {
            // Emit a local user_context we can pass in all cases, either
            // aliasing __user_context or nullptr.
            stream << get_indent() << "void * const _ucon = "
                   << (have_user_context ? "const_cast<void *>(__user_context)" : "nullptr")
                   << ";\n";

            if (target.has_feature(Target::NoAsserts)) {
                // With asserts disabled, _ucon may be otherwise unused;
                // silence the warning in the generated code.
                // NOTE(review): no trailing "\n" after this ";" — generated
                // code stays valid but formatting runs on; confirm intended.
                stream << get_indent() << "halide_unused(_ucon);";
            }

            // Emit the body
            print(f.body);

            // Return success.
            stream << get_indent() << "return 0;\n";
        }

        indent -= 1;
        stream << "}\n";
    }

    if (is_header_or_extern_decl() && f.linkage == LinkageType::ExternalPlusMetadata) {
        // Emit the argv version
        stream << "\nHALIDE_FUNCTION_ATTRS\nint " << simple_name << "_argv(void **args);\n";

        // And also the metadata.
        stream << "\nHALIDE_FUNCTION_ATTRS\nconst struct halide_filter_metadata_t *" << simple_name << "_metadata();\n";
    }

    // Close namespace wrappers in reverse order.
    if (!namespaces.empty()) {
        stream << "\n";
        for (size_t i = namespaces.size(); i > 0; i--) {
            stream << "}  // namespace " << namespaces[i - 1] << "\n";
        }
        stream << "\n";
    }
}
1737 
compile(const Buffer<> & buffer)1738 void CodeGen_C::compile(const Buffer<> &buffer) {
1739     // Don't define buffers in headers or extern decls.
1740     if (is_header_or_extern_decl()) {
1741         return;
1742     }
1743 
1744     string name = print_name(buffer.name());
1745     halide_buffer_t b = *(buffer.raw_buffer());
1746 
1747     user_assert(b.host) << "Can't embed image: " << buffer.name() << " because it has a null host pointer\n";
1748     user_assert(!b.device_dirty()) << "Can't embed image: " << buffer.name() << "because it has a dirty device pointer\n";
1749 
1750     // Figure out the offset of the last pixel.
1751     size_t num_elems = 1;
1752     for (int d = 0; d < b.dimensions; d++) {
1753         num_elems += b.dim[d].stride * (b.dim[d].extent - 1);
1754     }
1755 
1756     // For now, we assume buffers that aren't scalar are constant,
1757     // while scalars can be mutated. This accommodates all our existing
1758     // use cases, which is that all buffers are constant, except those
1759     // used to store stateful module information in offloading runtimes.
1760     bool is_constant = buffer.dimensions() != 0;
1761 
1762     // Emit the data
1763     stream << "static " << (is_constant ? "const" : "") << " uint8_t " << name << "_data[] HALIDE_ATTRIBUTE_ALIGN(32) = {\n";
1764     stream << get_indent();
1765     for (size_t i = 0; i < num_elems * b.type.bytes(); i++) {
1766         if (i > 0) {
1767             stream << ",";
1768             if (i % 16 == 0) {
1769                 stream << "\n";
1770                 stream << get_indent();
1771             } else {
1772                 stream << " ";
1773             }
1774         }
1775         stream << (int)(b.host[i]);
1776     }
1777     stream << "\n};\n";
1778 
1779     // Emit the shape (constant even for scalar buffers)
1780     stream << "static const halide_dimension_t " << name << "_buffer_shape[] = {";
1781     for (int i = 0; i < buffer.dimensions(); i++) {
1782         stream << "halide_dimension_t(" << buffer.dim(i).min()
1783                << ", " << buffer.dim(i).extent()
1784                << ", " << buffer.dim(i).stride() << ")";
1785         if (i < buffer.dimensions() - 1) {
1786             stream << ", ";
1787         }
1788     }
1789     stream << "};\n";
1790 
1791     Type t = buffer.type();
1792 
1793     // Emit the buffer struct. Note that although our shape and (usually) our host
1794     // data is const, the buffer itself isn't: embedded buffers in one pipeline
1795     // can be passed to another pipeline (e.g. for an extern stage), in which
1796     // case the buffer objects need to be non-const, because the constness
1797     // (from the POV of the extern stage) is a runtime property.
1798     stream << "static halide_buffer_t " << name << "_buffer_ = {"
1799            << "0, "                                              // device
1800            << "nullptr, "                                        // device_interface
1801            << "const_cast<uint8_t*>(&" << name << "_data[0]), "  // host
1802            << "0, "                                              // flags
1803            << "halide_type_t((halide_type_code_t)(" << (int)t.code() << "), " << t.bits() << ", " << t.lanes() << "), "
1804            << buffer.dimensions() << ", "
1805            << "const_cast<halide_dimension_t*>(" << name << "_buffer_shape)};\n";
1806 
1807     // Make a global pointer to it.
1808     stream << "static halide_buffer_t * const " << name << "_buffer = &" << name << "_buffer_;\n";
1809 }
1810 
// Emit any statements needed to compute e, and return the C identifier
// (or literal) holding its value. The sentinel makes it obvious if a
// visitor forgets to set `id`.
string CodeGen_C::print_expr(const Expr &e) {
    id = "$$ BAD ID $$";
    e.accept(this);
    return id;
}
1816 
print_cast_expr(const Type & t,const Expr & e)1817 string CodeGen_C::print_cast_expr(const Type &t, const Expr &e) {
1818     string value = print_expr(e);
1819     string type = print_type(t);
1820     if (t.is_vector() &&
1821         t.lanes() == e.type().lanes() &&
1822         t != e.type()) {
1823         return print_assignment(t, type + "::convert_from<" + print_type(e.type()) + ">(" + value + ")");
1824     } else {
1825         return print_assignment(t, "(" + type + ")(" + value + ")");
1826     }
1827 }
1828 
// Emit a statement by dispatching to the appropriate visit() overload.
void CodeGen_C::print_stmt(const Stmt &s) {
    s.accept(this);
}
1832 
print_assignment(Type t,const std::string & rhs)1833 string CodeGen_C::print_assignment(Type t, const std::string &rhs) {
1834     auto cached = cache.find(rhs);
1835     if (cached == cache.end()) {
1836         id = unique_name('_');
1837         stream << get_indent() << print_type(t, AppendSpace) << (output_kind == CPlusPlusImplementation ? "const " : "") << id << " = " << rhs << ";\n";
1838         cache[rhs] = id;
1839     } else {
1840         id = cached->second;
1841     }
1842     return id;
1843 }
1844 
open_scope()1845 void CodeGen_C::open_scope() {
1846     cache.clear();
1847     stream << get_indent();
1848     indent++;
1849     stream << "{\n";
1850 }
1851 
close_scope(const std::string & comment)1852 void CodeGen_C::close_scope(const std::string &comment) {
1853     cache.clear();
1854     indent--;
1855     stream << get_indent();
1856     if (!comment.empty()) {
1857         stream << "} // " << comment << "\n";
1858     } else {
1859         stream << "}\n";
1860     }
1861 }
1862 
// A variable reference needs no new code; its sanitized name is the id.
void CodeGen_C::visit(const Variable *op) {
    id = print_name(op->name);
}
1866 
// Casts delegate to print_cast_expr, which picks between a C-style cast
// and the vector convert_from path.
void CodeGen_C::visit(const Cast *op) {
    id = print_cast_expr(op->type, op->value);
}
1870 
visit_binop(Type t,const Expr & a,const Expr & b,const char * op)1871 void CodeGen_C::visit_binop(Type t, const Expr &a, const Expr &b, const char *op) {
1872     string sa = print_expr(a);
1873     string sb = print_expr(b);
1874     print_assignment(t, sa + " " + op + " " + sb);
1875 }
1876 
// Addition maps directly to C's "+".
void CodeGen_C::visit(const Add *op) {
    visit_binop(op->type, op->a, op->b, "+");
}
1880 
// Subtraction maps directly to C's "-".
void CodeGen_C::visit(const Sub *op) {
    visit_binop(op->type, op->a, op->b, "-");
}
1884 
// Multiplication maps directly to C's "*".
void CodeGen_C::visit(const Mul *op) {
    visit_binop(op->type, op->a, op->b, "*");
}
1888 
// Division: power-of-two divisors become shifts; signed integer division
// is lowered to match Halide's Euclidean semantics (C's "/" truncates
// toward zero); everything else uses C's "/" directly.
void CodeGen_C::visit(const Div *op) {
    int bits;
    if (is_const_power_of_two_integer(op->b, &bits)) {
        visit_binop(op->type, op->a, make_const(op->a.type(), bits), ">>");
    } else if (op->type.is_int()) {
        print_expr(lower_euclidean_div(op->a, op->b));
    } else {
        visit_binop(op->type, op->a, op->b, "/");
    }
}
1899 
// Modulo: power-of-two divisors become a bit mask; signed integer mod is
// lowered for Euclidean semantics; floats use fmod; the rest use C's "%".
void CodeGen_C::visit(const Mod *op) {
    int bits;
    if (is_const_power_of_two_integer(op->b, &bits)) {
        visit_binop(op->type, op->a, make_const(op->a.type(), (1 << bits) - 1), "&");
    } else if (op->type.is_int()) {
        print_expr(lower_euclidean_mod(op->a, op->b));
    } else if (op->type.is_float()) {
        // Print operands first so their temporaries are emitted in order.
        string arg0 = print_expr(op->a);
        string arg1 = print_expr(op->b);
        ostringstream rhs;
        rhs << "fmod(" << arg0 << ", " << arg1 << ")";
        print_assignment(op->type, rhs.str());
    } else {
        visit_binop(op->type, op->a, op->b, "%");
    }
}
1916 
visit(const Max * op)1917 void CodeGen_C::visit(const Max *op) {
1918     // clang doesn't support the ternary operator on OpenCL style vectors.
1919     // See: https://bugs.llvm.org/show_bug.cgi?id=33103
1920     if (op->type.is_scalar()) {
1921         print_expr(Call::make(op->type, "::halide_cpp_max", {op->a, op->b}, Call::Extern));
1922     } else {
1923         ostringstream rhs;
1924         rhs << print_type(op->type) << "::max(" << print_expr(op->a) << ", " << print_expr(op->b) << ")";
1925         print_assignment(op->type, rhs.str());
1926     }
1927 }
1928 
visit(const Min * op)1929 void CodeGen_C::visit(const Min *op) {
1930     // clang doesn't support the ternary operator on OpenCL style vectors.
1931     // See: https://bugs.llvm.org/show_bug.cgi?id=33103
1932     if (op->type.is_scalar()) {
1933         print_expr(Call::make(op->type, "::halide_cpp_min", {op->a, op->b}, Call::Extern));
1934     } else {
1935         ostringstream rhs;
1936         rhs << print_type(op->type) << "::min(" << print_expr(op->a) << ", " << print_expr(op->b) << ")";
1937         print_assignment(op->type, rhs.str());
1938     }
1939 }
1940 
// Equality maps directly to C's "==".
void CodeGen_C::visit(const EQ *op) {
    visit_binop(op->type, op->a, op->b, "==");
}
1944 
// Inequality maps directly to C's "!=".
void CodeGen_C::visit(const NE *op) {
    visit_binop(op->type, op->a, op->b, "!=");
}
1948 
// Less-than maps directly to C's "<".
void CodeGen_C::visit(const LT *op) {
    visit_binop(op->type, op->a, op->b, "<");
}
1952 
// Less-or-equal maps directly to C's "<=".
void CodeGen_C::visit(const LE *op) {
    visit_binop(op->type, op->a, op->b, "<=");
}
1956 
// Greater-than maps directly to C's ">".
void CodeGen_C::visit(const GT *op) {
    visit_binop(op->type, op->a, op->b, ">");
}
1960 
// Greater-or-equal maps directly to C's ">=".
void CodeGen_C::visit(const GE *op) {
    visit_binop(op->type, op->a, op->b, ">=");
}
1964 
// Logical or maps directly to C's "||".
void CodeGen_C::visit(const Or *op) {
    visit_binop(op->type, op->a, op->b, "||");
}
1968 
// Logical and maps directly to C's "&&".
void CodeGen_C::visit(const And *op) {
    visit_binop(op->type, op->a, op->b, "&&");
}
1972 
// Logical negation; the operand is parenthesized to preserve precedence.
void CodeGen_C::visit(const Not *op) {
    print_assignment(op->type, "!(" + print_expr(op->a) + ")");
}
1976 
// Integer immediates: plain int32 values are used inline as literals;
// other widths are cast and given an int64 suffix (via the
// ADD_INT64_T_SUFFIX macro in the generated code) to avoid truncation.
void CodeGen_C::visit(const IntImm *op) {
    if (op->type == Int(32)) {
        id = std::to_string(op->value);
    } else {
        print_assignment(op->type, "(" + print_type(op->type) + ")(ADD_INT64_T_SUFFIX(" + std::to_string(op->value) + "))");
    }
}
1984 
// Unsigned immediates are always emitted with a uint64 suffix and a cast
// down to the intended type.
void CodeGen_C::visit(const UIntImm *op) {
    print_assignment(op->type, "(" + print_type(op->type) + ")(ADD_UINT64_T_SUFFIX(" + std::to_string(op->value) + "))");
}
1988 
// String immediates: stream the Expr through IR printing, which produces
// a quoted/escaped C string literal usable inline.
void CodeGen_C::visit(const StringImm *op) {
    ostringstream oss;
    oss << Expr(op);
    id = oss.str();
}
1994 
1995 // NaN is the only float/double for which this is true... and
1996 // surprisingly, there doesn't seem to be a portable isnan function
1997 // (dsharlet).
// NaN is the only float/double for which this is true... and
// surprisingly, there doesn't seem to be a portable isnan function
// (dsharlet). NaN is the sole value that compares unequal to itself.
template<typename T>
static bool isnan(T x) {
    return !(x == x);
}
2002 
// True iff x is positive or negative infinity. Types without an infinity
// representation (has_infinity == false) always return false; for such
// types numeric_limits<T>::infinity() is defined to return T(), so
// evaluating it unconditionally is safe.
template<typename T>
static bool isinf(T x) {
    const T pos_inf = std::numeric_limits<T>::infinity();
    return std::numeric_limits<T>::has_infinity &&
           (x == pos_inf || x == -pos_inf);
}
2008 
// Float immediates: NaN and infinities become runtime helper calls;
// finite values are emitted as float_from_bits(<exact bit pattern>) so
// no precision is lost going through decimal text.
// NOTE(review): op->value (a double) is narrowed to float bits even when
// op->type is 64-bit — 64-bit constants lose precision here; confirm
// whether that is acceptable for this backend.
void CodeGen_C::visit(const FloatImm *op) {
    if (isnan(op->value)) {
        id = "nan_f32()";
    } else if (isinf(op->value)) {
        if (op->value > 0) {
            id = "inf_f32()";
        } else {
            id = "neg_inf_f32()";
        }
    } else {
        // Write the constant as reinterpreted uint to avoid any bits lost in conversion.
        // (Union type-punning: well-supported by major compilers, though
        // formally UB in C++ — std::memcpy would be the strict alternative.)
        union {
            uint32_t as_uint;
            float as_float;
        } u;
        u.as_float = op->value;

        ostringstream oss;
        if (op->type.bits() == 64) {
            // Widen the reconstructed float back to double in the output.
            oss << "(double) ";
        }
        oss << "float_from_bits(" << u.as_uint << " /* " << u.as_float << " */)";
        print_assignment(op->type, oss.str());
    }
}
2034 
visit(const Call * op)2035 void CodeGen_C::visit(const Call *op) {
2036 
2037     internal_assert(op->is_extern() || op->is_intrinsic())
2038         << "Can only codegen extern calls and intrinsics\n";
2039 
2040     ostringstream rhs;
2041 
2042     // Handle intrinsics first
2043     if (op->is_intrinsic(Call::debug_to_file)) {
2044         internal_assert(op->args.size() == 3);
2045         const StringImm *string_imm = op->args[0].as<StringImm>();
2046         internal_assert(string_imm);
2047         string filename = string_imm->value;
2048         string typecode = print_expr(op->args[1]);
2049         string buffer = print_name(print_expr(op->args[2]));
2050 
2051         rhs << "halide_debug_to_file(_ucon, "
2052             << "\"" << filename << "\", "
2053             << typecode
2054             << ", (struct halide_buffer_t *)" << buffer << ")";
2055     } else if (op->is_intrinsic(Call::bitwise_and)) {
2056         internal_assert(op->args.size() == 2);
2057         string a0 = print_expr(op->args[0]);
2058         string a1 = print_expr(op->args[1]);
2059         rhs << a0 << " & " << a1;
2060     } else if (op->is_intrinsic(Call::bitwise_xor)) {
2061         internal_assert(op->args.size() == 2);
2062         string a0 = print_expr(op->args[0]);
2063         string a1 = print_expr(op->args[1]);
2064         rhs << a0 << " ^ " << a1;
2065     } else if (op->is_intrinsic(Call::bitwise_or)) {
2066         internal_assert(op->args.size() == 2);
2067         string a0 = print_expr(op->args[0]);
2068         string a1 = print_expr(op->args[1]);
2069         rhs << a0 << " | " << a1;
2070     } else if (op->is_intrinsic(Call::bitwise_not)) {
2071         internal_assert(op->args.size() == 1);
2072         rhs << "~" << print_expr(op->args[0]);
2073     } else if (op->is_intrinsic(Call::reinterpret)) {
2074         internal_assert(op->args.size() == 1);
2075         rhs << print_reinterpret(op->type, op->args[0]);
2076     } else if (op->is_intrinsic(Call::shift_left)) {
2077         internal_assert(op->args.size() == 2);
2078         string a0 = print_expr(op->args[0]);
2079         string a1 = print_expr(op->args[1]);
2080         rhs << a0 << " << " << a1;
2081     } else if (op->is_intrinsic(Call::shift_right)) {
2082         internal_assert(op->args.size() == 2);
2083         string a0 = print_expr(op->args[0]);
2084         string a1 = print_expr(op->args[1]);
2085         rhs << a0 << " >> " << a1;
2086     } else if (op->is_intrinsic(Call::count_leading_zeros) ||
2087                op->is_intrinsic(Call::count_trailing_zeros) ||
2088                op->is_intrinsic(Call::popcount)) {
2089         internal_assert(op->args.size() == 1);
2090         if (op->args[0].type().is_vector()) {
2091             rhs << print_scalarized_expr(op);
2092         } else {
2093             string a0 = print_expr(op->args[0]);
2094             rhs << "halide_" << op->name << "(" << a0 << ")";
2095         }
2096     } else if (op->is_intrinsic(Call::lerp)) {
2097         internal_assert(op->args.size() == 3);
2098         Expr e = lower_lerp(op->args[0], op->args[1], op->args[2]);
2099         rhs << print_expr(e);
2100     } else if (op->is_intrinsic(Call::absd)) {
2101         internal_assert(op->args.size() == 2);
2102         Expr a = op->args[0];
2103         Expr b = op->args[1];
2104         Expr e = cast(op->type, select(a < b, b - a, a - b));
2105         rhs << print_expr(e);
2106     } else if (op->is_intrinsic(Call::return_second)) {
2107         internal_assert(op->args.size() == 2);
2108         string arg0 = print_expr(op->args[0]);
2109         string arg1 = print_expr(op->args[1]);
2110         rhs << "return_second(" << arg0 << ", " << arg1 << ")";
2111     } else if (op->is_intrinsic(Call::if_then_else)) {
2112         internal_assert(op->args.size() == 3);
2113 
2114         string result_id = unique_name('_');
2115 
2116         stream << get_indent() << print_type(op->args[1].type(), AppendSpace)
2117                << result_id << ";\n";
2118 
2119         string cond_id = print_expr(op->args[0]);
2120 
2121         stream << get_indent() << "if (" << cond_id << ")\n";
2122         open_scope();
2123         string true_case = print_expr(op->args[1]);
2124         stream << get_indent() << result_id << " = " << true_case << ";\n";
2125         close_scope("if " + cond_id);
2126         stream << get_indent() << "else\n";
2127         open_scope();
2128         string false_case = print_expr(op->args[2]);
2129         stream << get_indent() << result_id << " = " << false_case << ";\n";
2130         close_scope("if " + cond_id + " else");
2131 
2132         rhs << result_id;
2133     } else if (op->is_intrinsic(Call::require)) {
2134         internal_assert(op->args.size() == 3);
2135         if (op->args[0].type().is_vector()) {
2136             rhs << print_scalarized_expr(op);
2137         } else {
2138             create_assertion(op->args[0], op->args[2]);
2139             rhs << print_expr(op->args[1]);
2140         }
2141     } else if (op->is_intrinsic(Call::abs)) {
2142         internal_assert(op->args.size() == 1);
2143         Expr a0 = op->args[0];
2144         rhs << print_expr(cast(op->type, select(a0 > 0, a0, -a0)));
2145     } else if (op->is_intrinsic(Call::memoize_expr)) {
2146         internal_assert(!op->args.empty());
2147         string arg = print_expr(op->args[0]);
2148         rhs << "(" << arg << ")";
2149     } else if (op->is_intrinsic(Call::alloca)) {
2150         internal_assert(op->args.size() == 1);
2151         internal_assert(op->type.is_handle());
2152         const Call *call = op->args[0].as<Call>();
2153         if (op->type == type_of<struct halide_buffer_t *>() &&
2154             call && call->is_intrinsic(Call::size_of_halide_buffer_t)) {
2155             stream << get_indent();
2156             string buf_name = unique_name('b');
2157             stream << "halide_buffer_t " << buf_name << ";\n";
2158             rhs << "&" << buf_name;
2159         } else {
2160             // Make a stack of uint64_ts
2161             string size = print_expr(simplify((op->args[0] + 7) / 8));
2162             stream << get_indent();
2163             string array_name = unique_name('a');
2164             stream << "uint64_t " << array_name << "[" << size << "];";
2165             rhs << "(" << print_type(op->type) << ")(&" << array_name << ")";
2166         }
2167     } else if (op->is_intrinsic(Call::make_struct)) {
2168         if (op->args.empty()) {
2169             internal_assert(op->type.handle_type);
2170             // Add explicit cast so that different structs can't cache to the same value
2171             rhs << "(" << print_type(op->type) << ")(NULL)";
2172         } else if (op->type == type_of<halide_dimension_t *>()) {
2173             // Emit a shape
2174 
2175             // Get the args
2176             vector<string> values;
2177             for (size_t i = 0; i < op->args.size(); i++) {
2178                 values.push_back(print_expr(op->args[i]));
2179             }
2180 
2181             static_assert(sizeof(halide_dimension_t) == 4 * sizeof(int32_t),
2182                           "CodeGen_C assumes a halide_dimension_t is four densely-packed int32_ts");
2183 
2184             internal_assert(values.size() % 4 == 0);
2185             int dimension = values.size() / 4;
2186 
2187             string shape_name = unique_name('s');
2188             stream
2189                 << get_indent() << "struct halide_dimension_t " << shape_name
2190                 << "[" << dimension << "] = {\n";
2191             indent++;
2192             for (int i = 0; i < dimension; i++) {
2193                 stream
2194                     << get_indent() << "{"
2195                     << values[i * 4 + 0] << ", "
2196                     << values[i * 4 + 1] << ", "
2197                     << values[i * 4 + 2] << ", "
2198                     << values[i * 4 + 3] << "},\n";
2199             }
2200             indent--;
2201             stream << get_indent() << "};\n";
2202 
2203             rhs << shape_name;
2204         } else {
2205             // Emit a declaration like:
2206             // struct {const int f_0, const char f_1, const int f_2} foo = {3, 'c', 4};
2207 
2208             // Get the args
2209             vector<string> values;
2210             for (size_t i = 0; i < op->args.size(); i++) {
2211                 values.push_back(print_expr(op->args[i]));
2212             }
2213             stream << get_indent() << "struct {\n";
2214             // List the types.
2215             indent++;
2216             for (size_t i = 0; i < op->args.size(); i++) {
2217                 stream << get_indent() << "const " << print_type(op->args[i].type()) << " f_" << i << ";\n";
2218             }
2219             indent--;
2220             string struct_name = unique_name('s');
2221             stream << get_indent() << "} " << struct_name << " = {\n";
2222             // List the values.
2223             indent++;
2224             for (size_t i = 0; i < op->args.size(); i++) {
2225                 stream << get_indent() << values[i];
2226                 if (i < op->args.size() - 1) stream << ",";
2227                 stream << "\n";
2228             }
2229             indent--;
2230             stream << get_indent() << "};\n";
2231 
2232             // Return a pointer to it of the appropriate type
2233 
2234             // TODO: This is dubious type-punning. We really need to
2235             // find a better way to do this. We dodge the problem for
2236             // the specific case of buffer shapes in the case above.
2237             if (op->type.handle_type) {
2238                 rhs << "(" << print_type(op->type) << ")";
2239             }
2240             rhs << "(&" << struct_name << ")";
2241         }
2242     } else if (op->is_intrinsic(Call::stringify)) {
2243         // Rewrite to an snprintf
2244         vector<string> printf_args;
2245         string format_string = "";
2246         for (size_t i = 0; i < op->args.size(); i++) {
2247             Type t = op->args[i].type();
2248             printf_args.push_back(print_expr(op->args[i]));
2249             if (t.is_int()) {
2250                 format_string += "%lld";
2251                 printf_args[i] = "(long long)(" + printf_args[i] + ")";
2252             } else if (t.is_uint()) {
2253                 format_string += "%llu";
2254                 printf_args[i] = "(long long unsigned)(" + printf_args[i] + ")";
2255             } else if (t.is_float()) {
2256                 if (t.bits() == 32) {
2257                     format_string += "%f";
2258                 } else {
2259                     format_string += "%e";
2260                 }
2261             } else if (op->args[i].as<StringImm>()) {
2262                 format_string += "%s";
2263             } else {
2264                 internal_assert(t.is_handle());
2265                 format_string += "%p";
2266             }
2267         }
2268         string buf_name = unique_name('b');
2269         stream << get_indent() << "char " << buf_name << "[1024];\n";
2270         stream << get_indent() << "snprintf(" << buf_name << ", 1024, \"" << format_string << "\", " << with_commas(printf_args) << ");\n";
2271         rhs << buf_name;
2272 
2273     } else if (op->is_intrinsic(Call::register_destructor)) {
2274         internal_assert(op->args.size() == 2);
2275         const StringImm *fn = op->args[0].as<StringImm>();
2276         internal_assert(fn);
2277         string arg = print_expr(op->args[1]);
2278 
2279         stream << get_indent();
2280         // Make a struct on the stack that calls the given function as a destructor
2281         string struct_name = unique_name('s');
2282         string instance_name = unique_name('d');
2283         stream << "struct " << struct_name << " { "
2284                << "void * const ucon; "
2285                << "void * const arg; "
2286                << "" << struct_name << "(void *ucon, void *a) : ucon(ucon), arg((void *)a) {} "
2287                << "~" << struct_name << "() { " << fn->value + "(ucon, arg); } "
2288                << "} " << instance_name << "(_ucon, " << arg << ");\n";
2289         rhs << print_expr(0);
2290     } else if (op->is_intrinsic(Call::div_round_to_zero)) {
2291         rhs << print_expr(op->args[0]) << " / " << print_expr(op->args[1]);
2292     } else if (op->is_intrinsic(Call::mod_round_to_zero)) {
2293         rhs << print_expr(op->args[0]) << " % " << print_expr(op->args[1]);
2294     } else if (op->is_intrinsic(Call::signed_integer_overflow)) {
2295         user_error << "Signed integer overflow occurred during constant-folding. Signed"
2296                       " integer overflow for int32 and int64 is undefined behavior in"
2297                       " Halide.\n";
2298     } else if (op->is_intrinsic(Call::prefetch)) {
2299         user_assert((op->args.size() == 4) && is_one(op->args[2]))
2300             << "Only prefetch of 1 cache line is supported in C backend.\n";
2301         const Variable *base = op->args[0].as<Variable>();
2302         internal_assert(base && base->type.is_handle());
2303         rhs << "__builtin_prefetch("
2304             << "((" << print_type(op->type) << " *)" << print_name(base->name)
2305             << " + " << print_expr(op->args[1]) << "), 1)";
2306     } else if (op->is_intrinsic(Call::size_of_halide_buffer_t)) {
2307         rhs << "(sizeof(halide_buffer_t))";
2308     } else if (op->is_intrinsic(Call::strict_float)) {
2309         internal_assert(op->args.size() == 1);
2310         string arg0 = print_expr(op->args[0]);
2311         rhs << "(" << arg0 << ")";
2312     } else if (op->is_intrinsic()) {
2313         // TODO: other intrinsics
2314         internal_error << "Unhandled intrinsic in C backend: " << op->name << "\n";
2315     } else {
2316         // Generic extern calls
2317         rhs << print_extern_call(op);
2318     }
2319 
2320     // Special-case halide_print, which has IR that returns int, but really return void.
2321     // The clean thing to do would be to change the definition of halide_print() to return
2322     // an ignored int, but as halide_print() has many overrides downstream (and in third-party
2323     // consumers), this is arguably a simpler fix for allowing halide_print() to work in the C++ backend.
2324     if (op->name == "halide_print") {
2325         stream << get_indent() << rhs.str() << ";\n";
2326         // Make an innocuous assignment value for our caller (probably an Evaluate node) to ignore.
2327         print_assignment(op->type, "0");
2328     } else {
2329         print_assignment(op->type, rhs.str());
2330     }
2331 }
2332 
print_scalarized_expr(const Expr & e)2333 string CodeGen_C::print_scalarized_expr(const Expr &e) {
2334     Type t = e.type();
2335     internal_assert(t.is_vector());
2336     string v = unique_name('_');
2337     stream << get_indent() << print_type(t, AppendSpace) << v << ";\n";
2338     for (int lane = 0; lane < t.lanes(); lane++) {
2339         Expr e2 = extract_lane(e, lane);
2340         string elem = print_expr(e2);
2341         ostringstream rhs;
2342         rhs << v << ".replace(" << lane << ", " << elem << ")";
2343         v = print_assignment(t, rhs.str());
2344     }
2345     return v;
2346 }
2347 
print_extern_call(const Call * op)2348 string CodeGen_C::print_extern_call(const Call *op) {
2349     if (op->type.is_vector()) {
2350         // Need to split into multiple scalar calls.
2351         return print_scalarized_expr(op);
2352     }
2353     ostringstream rhs;
2354     vector<string> args(op->args.size());
2355     for (size_t i = 0; i < op->args.size(); i++) {
2356         args[i] = print_expr(op->args[i]);
2357         // This substitution ensures const correctness for all calls
2358         if (args[i] == "__user_context") {
2359             args[i] = "_ucon";
2360         }
2361     }
2362     if (function_takes_user_context(op->name)) {
2363         args.insert(args.begin(), "_ucon");
2364     }
2365     rhs << op->name << "(" << with_commas(args) << ")";
2366     return rhs.str();
2367 }
2368 
void CodeGen_C::visit(const Load *op) {
    // Emit a load either as a vector ::load() helper call (dense ramp or
    // gather) or as a scalar array subscript, assigning the result to a
    // fresh SSA-style id via print_assignment.
    user_assert(is_one(op->predicate)) << "Predicated load is not supported by C backend.\n";

    // TODO: We could replicate the logic in the llvm codegen which decides whether
    // the vector access can be aligned. Doing so would also require introducing
    // aligned type equivalents for all the vector types.
    ostringstream rhs;

    Type t = op->type;
    string name = print_name(op->name);

    // If we're loading a contiguous ramp into a vector, just load the vector
    Expr dense_ramp_base = strided_ramp_base(op->index, 1);
    if (dense_ramp_base.defined()) {
        internal_assert(t.is_vector());
        string id_ramp_base = print_expr(dense_ramp_base);
        rhs << print_type(t) + "::load(" << name << ", " << id_ramp_base << ")";
    } else if (op->index.type().is_vector()) {
        // If index is a vector, gather vector elements.
        internal_assert(t.is_vector());
        string id_index = print_expr(op->index);
        rhs << print_type(t) + "::load(" << name << ", " << id_index << ")";
    } else {
        // Scalar load. Cast the pointer unless the buffer was declared
        // with exactly this element type (unknown buffers always get the cast).
        string id_index = print_expr(op->index);
        bool type_cast_needed = !(allocations.contains(op->name) &&
                                  allocations.get(op->name).type.element_of() == t.element_of());
        if (type_cast_needed) {
            rhs << "((const " << print_type(t.element_of()) << " *)" << name << ")";
        } else {
            rhs << name;
        }
        rhs << "[" << id_index << "]";
    }
    print_assignment(t, rhs.str());
}
2404 
void CodeGen_C::visit(const Store *op) {
    // Emit a store as a vector ::store() helper call (dense ramp or
    // scatter) or as a scalar array assignment, mirroring the Load cases.
    user_assert(is_one(op->predicate)) << "Predicated store is not supported by C backend.\n";

    Type t = op->value.type();

    if (inside_atomic_mutex_node) {
        user_assert(t.is_scalar())
            << "The vectorized atomic operation for the store" << op->name
            << " is lowered into a mutex lock, which does not support vectorization.\n";
    }

    // Issue atomic store if we are in the designated producer.
    if (emit_atomic_stores) {
        stream << "#if defined(_OPENMP)\n";
        stream << "#pragma omp atomic\n";
        stream << "#else\n";
        stream << "#error \"Atomic stores in the C backend are only supported in compilers that support OpenMP.\"\n";
        stream << "#endif\n";
    }

    string id_value = print_expr(op->value);
    string name = print_name(op->name);

    // TODO: We could replicate the logic in the llvm codegen which decides whether
    // the vector access can be aligned. Doing so would also require introducing
    // aligned type equivalents for all the vector types.

    // If we're writing a contiguous ramp, just store the vector.
    Expr dense_ramp_base = strided_ramp_base(op->index, 1);
    if (dense_ramp_base.defined()) {
        internal_assert(op->value.type().is_vector());
        string id_ramp_base = print_expr(dense_ramp_base);
        stream << get_indent() << id_value + ".store(" << name << ", " << id_ramp_base << ");\n";
    } else if (op->index.type().is_vector()) {
        // If index is a vector, scatter vector elements.
        internal_assert(t.is_vector());
        string id_index = print_expr(op->index);
        stream << get_indent() << id_value + ".store(" << name << ", " << id_index << ");\n";
    } else {
        // Scalar store. Handles, unknown buffers, and type mismatches all
        // force a pointer cast to the stored type.
        bool type_cast_needed =
            t.is_handle() ||
            !allocations.contains(op->name) ||
            allocations.get(op->name).type != t;

        string id_index = print_expr(op->index);
        stream << get_indent();
        if (type_cast_needed) {
            stream << "((" << print_type(t) << " *)" << name << ")";
        } else {
            stream << name;
        }
        stream << "[" << id_index << "] = " << id_value << ";\n";
    }
    // A store may invalidate previously printed expressions, so drop the
    // expression cache.
    cache.clear();
}
2460 
visit(const Let * op)2461 void CodeGen_C::visit(const Let *op) {
2462     string id_value = print_expr(op->value);
2463     Expr body = op->body;
2464     if (op->value.type().is_handle()) {
2465         // The body might contain a Load that references this directly
2466         // by name, so we can't rewrite the name.
2467         stream << get_indent() << print_type(op->value.type())
2468                << " " << print_name(op->name)
2469                << " = " << id_value << ";\n";
2470     } else {
2471         Expr new_var = Variable::make(op->value.type(), id_value);
2472         body = substitute(op->name, new_var, body);
2473     }
2474     print_expr(body);
2475 }
2476 
visit(const Select * op)2477 void CodeGen_C::visit(const Select *op) {
2478     ostringstream rhs;
2479     string type = print_type(op->type);
2480     string true_val = print_expr(op->true_value);
2481     string false_val = print_expr(op->false_value);
2482     string cond = print_expr(op->condition);
2483 
2484     // clang doesn't support the ternary operator on OpenCL style vectors.
2485     // See: https://bugs.llvm.org/show_bug.cgi?id=33103
2486     if (op->condition.type().is_scalar()) {
2487         rhs << "(" << type << ")"
2488             << "(" << cond
2489             << " ? " << true_val
2490             << " : " << false_val
2491             << ")";
2492     } else {
2493         rhs << type << "::select(" << cond << ", " << true_val << ", " << false_val << ")";
2494     }
2495     print_assignment(op->type, rhs.str());
2496 }
2497 
visit(const LetStmt * op)2498 void CodeGen_C::visit(const LetStmt *op) {
2499     string id_value = print_expr(op->value);
2500     Stmt body = op->body;
2501     if (op->value.type().is_handle()) {
2502         // The body might contain a Load or Store that references this
2503         // directly by name, so we can't rewrite the name.
2504         stream << get_indent() << print_type(op->value.type())
2505                << " " << print_name(op->name)
2506                << " = " << id_value << ";\n";
2507     } else {
2508         Expr new_var = Variable::make(op->value.type(), id_value);
2509         body = substitute(op->name, new_var, body);
2510     }
2511     body.accept(this);
2512 }
2513 
// Halide asserts have different semantics to C asserts.  They're
// supposed to clean up and make the containing function return
// -1, so we can't use the C version of assert. Instead we convert
// to an if statement.
void CodeGen_C::create_assertion(const string &id_cond, const Expr &message) {
    // `message` is the expression evaluated on failure; its value is the
    // error code returned from the generated function.
    internal_assert(!message.defined() || message.type() == Int(32))
        << "Assertion result is not an int: " << message;

    if (target.has_feature(Target::NoAsserts)) {
        // Asserts compiled out: still reference the condition id so the
        // generated C doesn't warn about an unused value.
        stream << get_indent() << "halide_unused(" << id_cond << ");\n";
        return;
    }

    // On failure, print/evaluate the message inside the scope (it may
    // itself emit statements) and return its id.
    stream << get_indent() << "if (!" << id_cond << ")\n";
    open_scope();
    string id_msg = print_expr(message);
    stream << get_indent() << "return " << id_msg << ";\n";
    close_scope("");
}
2533 
create_assertion(const Expr & cond,const Expr & message)2534 void CodeGen_C::create_assertion(const Expr &cond, const Expr &message) {
2535     create_assertion(print_expr(cond), message);
2536 }
2537 
void CodeGen_C::visit(const AssertStmt *op) {
    // Lowered via create_assertion: emits an if-statement that returns the
    // message's error code (or a halide_unused() no-op under NoAsserts).
    create_assertion(op->condition, op->message);
}
2541 
visit(const ProducerConsumer * op)2542 void CodeGen_C::visit(const ProducerConsumer *op) {
2543     stream << get_indent();
2544     if (op->is_producer) {
2545         stream << "// produce " << op->name << "\n";
2546     } else {
2547         stream << "// consume " << op->name << "\n";
2548     }
2549     print_stmt(op->body);
2550 }
2551 
visit(const Fork * op)2552 void CodeGen_C::visit(const Fork *op) {
2553     // TODO: This doesn't actually work with nested tasks
2554     stream << get_indent() << "#pragma omp parallel\n";
2555     open_scope();
2556     stream << get_indent() << "#pragma omp single\n";
2557     open_scope();
2558     stream << get_indent() << "#pragma omp task\n";
2559     open_scope();
2560     print_stmt(op->first);
2561     close_scope("");
2562     stream << get_indent() << "#pragma omp task\n";
2563     open_scope();
2564     print_stmt(op->rest);
2565     close_scope("");
2566     stream << get_indent() << "#pragma omp taskwait\n";
2567     close_scope("");
2568     close_scope("");
2569 }
2570 
visit(const Acquire * op)2571 void CodeGen_C::visit(const Acquire *op) {
2572     string id_sem = print_expr(op->semaphore);
2573     string id_count = print_expr(op->count);
2574     open_scope();
2575     stream << get_indent() << "while (!halide_semaphore_try_acquire(" << id_sem << ", " << id_count << "))\n";
2576     open_scope();
2577     stream << get_indent() << "#pragma omp taskyield\n";
2578     close_scope("");
2579     op->body.accept(this);
2580     close_scope("");
2581 }
2582 
visit(const Atomic * op)2583 void CodeGen_C::visit(const Atomic *op) {
2584     if (!op->mutex_name.empty()) {
2585         internal_assert(!inside_atomic_mutex_node)
2586             << "Nested atomic mutex locks detected. This might causes a deadlock.\n";
2587         ScopedValue<bool> old_inside_atomic_mutex_node(inside_atomic_mutex_node, true);
2588         op->body.accept(this);
2589     } else {
2590         // Issue atomic stores.
2591         ScopedValue<bool> old_emit_atomic_stores(emit_atomic_stores, true);
2592         op->body.accept(this);
2593     }
2594 }
2595 
visit(const For * op)2596 void CodeGen_C::visit(const For *op) {
2597     string id_min = print_expr(op->min);
2598     string id_extent = print_expr(op->extent);
2599 
2600     if (op->for_type == ForType::Parallel) {
2601         stream << get_indent() << "#pragma omp parallel for\n";
2602     } else {
2603         internal_assert(op->for_type == ForType::Serial)
2604             << "Can only emit serial or parallel for loops to C\n";
2605     }
2606 
2607     stream << get_indent() << "for (int "
2608            << print_name(op->name)
2609            << " = " << id_min
2610            << "; "
2611            << print_name(op->name)
2612            << " < " << id_min
2613            << " + " << id_extent
2614            << "; "
2615            << print_name(op->name)
2616            << "++)\n";
2617 
2618     open_scope();
2619     op->body.accept(this);
2620     close_scope("for " + print_name(op->name));
2621 }
2622 
visit(const Ramp * op)2623 void CodeGen_C::visit(const Ramp *op) {
2624     Type vector_type = op->type.with_lanes(op->lanes);
2625     string id_base = print_expr(op->base);
2626     string id_stride = print_expr(op->stride);
2627     print_assignment(vector_type, print_type(vector_type) + "::ramp(" + id_base + ", " + id_stride + ")");
2628 }
2629 
visit(const Broadcast * op)2630 void CodeGen_C::visit(const Broadcast *op) {
2631     Type vector_type = op->type.with_lanes(op->lanes);
2632     string id_value = print_expr(op->value);
2633     string rhs;
2634     if (op->lanes > 1) {
2635         rhs = print_type(vector_type) + "::broadcast(" + id_value + ")";
2636     } else {
2637         rhs = id_value;
2638     }
2639 
2640     print_assignment(vector_type, rhs);
2641 }
2642 
void CodeGen_C::visit(const Provide *op) {
    // Provide nodes are presumably lowered to Stores before codegen —
    // reaching here is a compiler bug, not a user error.
    internal_error << "Cannot emit Provide statements as C\n";
}
2646 
void CodeGen_C::visit(const Allocate *op) {
    // Emit an allocation as either a local array (stack), a halide_malloc
    // call guarded by an out-of-memory check (heap), or a user-supplied
    // new_expr. The allocation is scoped: the body is emitted inside the
    // scope and heap memory is freed on exit.
    open_scope();

    string op_name = print_name(op->name);
    string op_type = print_type(op->type, AppendSpace);

    // For sizes less than 8k, do a stack allocation
    bool on_stack = false;
    int32_t constant_size;
    string size_id;
    Type size_id_type;

    if (op->new_expr.defined()) {
        // Custom allocation: the new_expr produces the pointer directly.
        Allocation alloc;
        alloc.type = op->type;
        allocations.push(op->name, alloc);
        heap_allocations.push(op->name);
        stream << op_type << "*" << op_name << " = (" << print_expr(op->new_expr) << ");\n";
    } else {
        constant_size = op->constant_allocation_size();
        if (constant_size > 0) {
            // Size known at compile time: check it fits in int32 and
            // decide stack vs heap placement.
            int64_t stack_bytes = constant_size * op->type.bytes();

            if (stack_bytes > ((int64_t(1) << 31) - 1)) {
                user_error << "Total size for allocation "
                           << op->name << " is constant but exceeds 2^31 - 1.\n";
            } else {
                size_id_type = Int(32);
                size_id = print_expr(make_const(size_id_type, constant_size));

                if (op->memory_type == MemoryType::Stack ||
                    (op->memory_type == MemoryType::Auto &&
                     can_allocation_fit_on_stack(stack_bytes))) {
                    on_stack = true;
                }
            }
        } else {
            // Check that the allocation is not scalar (if it were scalar
            // it would have constant size).
            internal_assert(!op->extents.empty());

            // Compute the total size at runtime as the product of extents,
            // in int64 so intermediate overflow can be detected.
            size_id = print_assignment(Int(64), print_expr(op->extents[0]));
            size_id_type = Int(64);

            for (size_t i = 1; i < op->extents.size(); i++) {
                // Make the code a little less cluttered for two-dimensional case
                string new_size_id_rhs;
                string next_extent = print_expr(op->extents[i]);
                if (i > 1) {
                    // Once the running product exceeds 2^31-1, keep it
                    // as-is so the overflow check below still fires.
                    new_size_id_rhs = "(" + size_id + " > ((int64_t(1) << 31) - 1)) ? " + size_id + " : (" + size_id + " * " + next_extent + ")";
                } else {
                    new_size_id_rhs = size_id + " * " + next_extent;
                }
                size_id = print_assignment(Int(64), new_size_id_rhs);
            }
            // Guard against the element count or byte count exceeding
            // what a 32-bit signed size can represent.
            stream << get_indent() << "if (("
                   << size_id << " > ((int64_t(1) << 31) - 1)) || (("
                   << size_id << " * sizeof("
                   << op_type << ")) > ((int64_t(1) << 31) - 1)))\n";
            open_scope();
            stream << get_indent();
            // TODO: call halide_error_buffer_allocation_too_large() here instead
            // TODO: call create_assertion() so that NoAssertions works
            stream << "halide_error(_ucon, "
                   << "\"32-bit signed overflow computing size of allocation " << op->name << "\\n\");\n";
            stream << get_indent() << "return -1;\n";
            close_scope("overflow test " + op->name);
        }

        // Check the condition to see if this allocation should actually be created.
        // If the allocation is on the stack, the only condition we can respect is
        // unconditional false (otherwise a non-constant-sized array declaration
        // will be generated).
        if (!on_stack || is_zero(op->condition)) {
            Expr conditional_size = Select::make(op->condition,
                                                 Variable::make(size_id_type, size_id),
                                                 make_const(size_id_type, 0));
            conditional_size = simplify(conditional_size);
            size_id = print_assignment(Int(64), print_expr(conditional_size));
        }

        Allocation alloc;
        alloc.type = op->type;
        allocations.push(op->name, alloc);

        stream << get_indent() << op_type;

        if (on_stack) {
            // Stack allocation: a plain local array declaration.
            stream << op_name
                   << "[" << size_id << "];\n";
        } else {
            // Heap allocation via halide_malloc.
            stream << "*"
                   << op_name
                   << " = ("
                   << op_type
                   << " *)halide_malloc(_ucon, sizeof("
                   << op_type
                   << ")*" << size_id << ");\n";
            heap_allocations.push(op->name);
        }
    }

    if (!on_stack) {
        // Heap allocations can fail: emit the out-of-memory check and an
        // RAII-style helper that frees the memory on every exit path.
        create_assertion(op_name, Call::make(Int(32), "halide_error_out_of_memory", {}, Call::Extern));

        stream << get_indent();
        string free_function = op->free_function.empty() ? "halide_free" : op->free_function;
        stream << "HalideFreeHelper " << op_name << "_free(_ucon, "
               << op_name << ", " << free_function << ");\n";
    }

    op->body.accept(this);

    // Free the memory if it was allocated on the heap and there is no matching
    // Free node.
    print_heap_free(op->name);
    if (allocations.contains(op->name)) {
        allocations.pop(op->name);
    }

    close_scope("alloc " + print_name(op->name));
}
2769 
print_heap_free(const std::string & alloc_name)2770 void CodeGen_C::print_heap_free(const std::string &alloc_name) {
2771     if (heap_allocations.contains(alloc_name)) {
2772         stream << get_indent() << print_name(alloc_name) << "_free.free();\n";
2773         heap_allocations.pop(alloc_name);
2774     }
2775 }
2776 
void CodeGen_C::visit(const Free *op) {
    // Explicit Free node: emit the heap free (if heap-allocated) and
    // forget the allocation so later loads/stores treat it as unknown.
    print_heap_free(op->name);
    allocations.pop(op->name);
}
2781 
void CodeGen_C::visit(const Realize *op) {
    // Realize nodes are presumably lowered to Allocate before codegen —
    // reaching here is a compiler bug.
    internal_error << "Cannot emit realize statements to C\n";
}
2785 
void CodeGen_C::visit(const Prefetch *op) {
    // Prefetch stmts are presumably lowered to Call::prefetch intrinsics
    // (handled in the Call visitor) before codegen.
    internal_error << "Cannot emit prefetch statements to C\n";
}
2789 
visit(const IfThenElse * op)2790 void CodeGen_C::visit(const IfThenElse *op) {
2791     string cond_id = print_expr(op->condition);
2792 
2793     stream << get_indent() << "if (" << cond_id << ")\n";
2794     open_scope();
2795     op->then_case.accept(this);
2796     close_scope("if " + cond_id);
2797 
2798     if (op->else_case.defined()) {
2799         stream << get_indent() << "else\n";
2800         open_scope();
2801         op->else_case.accept(this);
2802         close_scope("if " + cond_id + " else");
2803     }
2804 }
2805 
visit(const Evaluate * op)2806 void CodeGen_C::visit(const Evaluate *op) {
2807     if (is_const(op->value)) return;
2808     string id = print_expr(op->value);
2809     stream << get_indent() << "halide_unused(" << id << ");\n";
2810 }
2811 
void CodeGen_C::visit(const Shuffle *op) {
    // Emit a shuffle: if there are multiple source vectors, concatenate
    // them with ::concat(); then either index a single lane (scalar
    // result) or call ::shuffle() with a constant index table.
    internal_assert(!op->vectors.empty());
    internal_assert(op->vectors[0].type().is_vector());
    for (size_t i = 1; i < op->vectors.size(); i++) {
        internal_assert(op->vectors[0].type() == op->vectors[i].type());
    }
    internal_assert(op->type.lanes() == (int)op->indices.size());
    const int max_index = (int)(op->vectors[0].type().lanes() * op->vectors.size());
    for (int i : op->indices) {
        // -1 is accepted here as a valid index; presumably it denotes an
        // undefined lane — confirm against the Shuffle IR definition.
        internal_assert(i >= -1 && i < max_index);
    }

    std::vector<string> vecs;
    for (Expr v : op->vectors) {
        vecs.push_back(print_expr(v));
    }
    string src = vecs[0];
    if (op->vectors.size() > 1) {
        // Multiple sources: spill them to an array and concatenate.
        ostringstream rhs;
        string storage_name = unique_name('_');
        stream << get_indent() << "const " << print_type(op->vectors[0].type()) << " " << storage_name << "[] = { " << with_commas(vecs) << " };\n";

        rhs << print_type(op->type) << "::concat(" << op->vectors.size() << ", " << storage_name << ")";
        src = print_assignment(op->type, rhs.str());
    }
    ostringstream rhs;
    if (op->type.is_scalar()) {
        // Single-lane extraction.
        rhs << src << "[" << op->indices[0] << "]";
    } else {
        // General shuffle through a constant index table.
        string indices_name = unique_name('_');
        stream << get_indent() << "const int32_t " << indices_name << "[" << op->indices.size() << "] = { " << with_commas(op->indices) << " };\n";
        rhs << print_type(op->type) << "::shuffle(" << src << ", " << indices_name << ")";
    }
    print_assignment(op->type, rhs.str());
}
2847 
void CodeGen_C::test() {
    // Self-test: build a small pipeline by hand (heap + stack allocations,
    // a print_when, a select, and a store), run the C backend over it, and
    // compare the emitted source byte-for-byte against the golden string
    // below.
    LoweredArgument buffer_arg("buf", Argument::OutputBuffer, Int(32), 3, ArgumentEstimates{});
    LoweredArgument float_arg("alpha", Argument::InputScalar, Float(32), 0, ArgumentEstimates{});
    LoweredArgument int_arg("beta", Argument::InputScalar, Int(32), 0, ArgumentEstimates{});
    LoweredArgument user_context_arg("__user_context", Argument::InputScalar, type_of<const void *>(), 0, ArgumentEstimates{});
    vector<LoweredArgument> args = {buffer_arg, float_arg, int_arg, user_context_arg};
    Var x("x");
    Param<float> alpha("alpha");
    Param<int> beta("beta");
    Expr e = Select::make(alpha > 4.0f, print_when(x < 1, 3), 2);
    Stmt s = Store::make("buf", e, x, Parameter(), const_true(), ModulusRemainder());
    s = LetStmt::make("x", beta + 1, s);
    s = Block::make(s, Free::make("tmp.stack"));
    s = Allocate::make("tmp.stack", Int(32), MemoryType::Stack, {127}, const_true(), s);
    s = Allocate::make("tmp.heap", Int(32), MemoryType::Heap, {43, beta}, const_true(), s);
    Expr buf = Variable::make(Handle(), "buf.buffer");
    s = LetStmt::make("buf", Call::make(Handle(), Call::buffer_get_host, {buf}, Call::Extern), s);

    Module m("", get_host_target());
    m.append(LoweredFunc("test1", args, s, LinkageType::External));

    // Run the codegen into a string, and capture the common macros too.
    ostringstream source;
    ostringstream macros;
    {
        CodeGen_C cg(source, Target("host"), CodeGen_C::CImplementation);
        cg.compile(m);
        cg.add_common_macros(macros);
    }

    string src = source.str();
    // The expected output: headers/globals/runtime preamble followed by
    // the golden function body. The raw string must not be edited lightly —
    // any codegen change shows up here.
    string correct_source =
        headers +
        globals +
        string((const char *)halide_internal_runtime_header_HalideRuntime_h) + '\n' +
        string((const char *)halide_internal_initmod_inlined_c) + '\n' +
        macros.str() + '\n' + kDefineMustUseResult + R"GOLDEN_CODE(
#ifndef HALIDE_FUNCTION_ATTRS
#define HALIDE_FUNCTION_ATTRS
#endif



#ifdef __cplusplus
extern "C" {
#endif

HALIDE_FUNCTION_ATTRS
int test1(struct halide_buffer_t *_buf_buffer, float _alpha, int32_t _beta, void const *__user_context) {
 void * const _ucon = const_cast<void *>(__user_context);
 void *_0 = _halide_buffer_get_host(_buf_buffer);
 void * _buf = _0;
 {
  int64_t _1 = 43;
  int64_t _2 = _1 * _beta;
  if ((_2 > ((int64_t(1) << 31) - 1)) || ((_2 * sizeof(int32_t )) > ((int64_t(1) << 31) - 1)))
  {
   halide_error(_ucon, "32-bit signed overflow computing size of allocation tmp.heap\n");
   return -1;
  } // overflow test tmp.heap
  int64_t _3 = _2;
  int32_t *_tmp_heap = (int32_t  *)halide_malloc(_ucon, sizeof(int32_t )*_3);
  if (!_tmp_heap)
  {
   int32_t _4 = halide_error_out_of_memory(_ucon);
   return _4;
  }
  HalideFreeHelper _tmp_heap_free(_ucon, _tmp_heap, halide_free);
  {
   int32_t _tmp_stack[127];
   int32_t _5 = _beta + 1;
   int32_t _6;
   bool _7 = _5 < 1;
   if (_7)
   {
    char b0[1024];
    snprintf(b0, 1024, "%lld%s", (long long)(3), "\n");
    char const *_8 = b0;
    halide_print(_ucon, _8);
    int32_t _9 = 0;
    int32_t _10 = return_second(_9, 3);
    _6 = _10;
   } // if _7
   else
   {
    _6 = 3;
   } // if _7 else
   int32_t _11 = _6;
   float _12 = float_from_bits(1082130432 /* 4 */);
   bool _13 = _alpha > _12;
   int32_t _14 = (int32_t)(_13 ? _11 : 2);
   ((int32_t *)_buf)[_5] = _14;
  } // alloc _tmp_stack
  _tmp_heap_free.free();
 } // alloc _tmp_heap
 return 0;
}

#ifdef __cplusplus
}  // extern "C"
#endif

)GOLDEN_CODE";

    if (src != correct_source) {
        // On mismatch, locate the first differing line to make the failure
        // message readable: scan forward to the divergence point, then
        // widen to the enclosing line boundaries.
        int diff = 0;
        while (src[diff] == correct_source[diff]) {
            diff++;
        }
        int diff_end = diff + 1;
        while (diff > 0 && src[diff] != '\n') {
            diff--;
        }
        while (diff_end < (int)src.size() && src[diff_end] != '\n') {
            diff_end++;
        }

        internal_error
            << "Correct source code:\n"
            << correct_source
            << "Actual source code:\n"
            << src
            << "Difference starts at:\n"
            << "Correct: " << correct_source.substr(diff, diff_end - diff) << "\n"
            << "Actual: " << src.substr(diff, diff_end - diff) << "\n";
    }

    std::cout << "CodeGen_C test passed\n";
}
2976 
2977 }  // namespace Internal
2978 }  // namespace Halide
2979