1 #include <array>
2 #include <iostream>
3 #include <string>
4 
5 #include "Target.h"
6 
7 #include "Debug.h"
8 #include "DeviceInterface.h"
9 #include "Error.h"
10 #include "Util.h"
11 #include "WasmExecutor.h"
12 
13 #if defined(__powerpc__) && (defined(__FreeBSD__) || defined(__linux__))
14 #if defined(__FreeBSD__)
15 #include <machine/cpu.h>
16 #include <sys/elf_common.h>
17 #endif
18 // This uses elf.h and must be included after "LLVM_Headers.h", which
19 // uses llvm/support/Elf.h.
20 #include <sys/auxv.h>
21 #endif
22 
23 #ifdef _MSC_VER
24 #include <intrin.h>
25 #endif  // _MSC_VER
26 
27 namespace Halide {
28 
29 using std::string;
30 using std::vector;
31 
32 namespace {
33 
34 #ifdef _MSC_VER
// MSVC flavour of cpuid(): forwards straight to the __cpuidex intrinsic,
// passing `infoType` and `extra` through unchanged and letting the intrinsic
// fill the four-element `info` array with the result.
static void cpuid(int info[4], int infoType, int extra) {
    __cpuidex(info, infoType, extra);
}
38 #else
39 
40 #if defined(__x86_64__) || defined(__i386__)
41 // CPU feature detection code taken from ispc
42 // (https://github.com/ispc/ispc/blob/master/builtins/dispatch.ll)
43 
44 #ifdef _LP64
// Executes the x86 CPUID instruction (LP64 build).
//
// `infoType` is placed in EAX and `extra` in ECX: the input constraints "0"
// and "2" tie them to the first and third output operands. On return
// info[0], info[1], info[2] and info[3] hold EAX, EBX, ECX and EDX
// respectively.
static void cpuid(int info[4], int infoType, int extra) {
    __asm__ __volatile__(
        "cpuid                 \n\t"
        : "=a"(info[0]), "=b"(info[1]), "=c"(info[2]), "=d"(info[3])
        : "0"(infoType), "2"(extra));
}
51 #else
// Executes the x86 CPUID instruction (non-LP64 build).
//
// `infoType` is placed in EAX and `extra` in ECX (input constraints "0" and
// "2"). info[0], info[2] and info[3] receive EAX, ECX and EDX. info[1] is
// bound to an arbitrary general register (operand %1) rather than EBX
// itself, so that EBX can be preserved around the instruction.
static void cpuid(int info[4], int infoType, int extra) {
    // We save %ebx in case it's the PIC register
    __asm__ __volatile__(
        // Stash the caller's EBX in operand %1 before CPUID overwrites it.
        "mov{l}\t{%%}ebx, %1  \n\t"
        "cpuid                 \n\t"
        // Swap: %1 ends up with CPUID's EBX result, EBX gets its old value back.
        "xchg{l}\t{%%}ebx, %1  \n\t"
        : "=a"(info[0]), "=r"(info[1]), "=c"(info[2]), "=d"(info[3])
        : "0"(infoType), "2"(extra));
}
61 #endif
62 #endif
63 #endif
64 
calculate_host_target()65 Target calculate_host_target() {
66     Target::OS os = Target::OSUnknown;
67 #ifdef __linux__
68     os = Target::Linux;
69 #endif
70 #ifdef _WIN32
71     os = Target::Windows;
72 #endif
73 #ifdef __APPLE__
74     os = Target::OSX;
75 #endif
76 
77     bool use_64_bits = (sizeof(size_t) == 8);
78     int bits = use_64_bits ? 64 : 32;
79     std::vector<Target::Feature> initial_features;
80 
81 #if __riscv__
82     Target::Arch arch = Target::RISCV;
83 #else
84 #if __mips__ || __mips || __MIPS__
85     Target::Arch arch = Target::MIPS;
86 #else
87 #if defined(__arm__) || defined(__aarch64__)
88     Target::Arch arch = Target::ARM;
89 #else
90 #if defined(__powerpc__) && (defined(__FreeBSD__) || defined(__linux__))
91     Target::Arch arch = Target::POWERPC;
92 
93 #if defined(__linux__)
94     unsigned long hwcap = getauxval(AT_HWCAP);
95     unsigned long hwcap2 = getauxval(AT_HWCAP2);
96 #elif defined(__FreeBSD__)
97     unsigned long hwcap, hwcap2;
98     elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));
99     elf_aux_info(AT_HWCAP2, &hwcap2, sizeof(hwcap2));
100 #endif
101     bool have_altivec = (hwcap & PPC_FEATURE_HAS_ALTIVEC) != 0;
102     bool have_vsx = (hwcap & PPC_FEATURE_HAS_VSX) != 0;
103     bool arch_2_07 = (hwcap2 & PPC_FEATURE2_ARCH_2_07) != 0;
104 
105     user_assert(have_altivec)
106         << "The POWERPC backend assumes at least AltiVec support. This machine does not appear to have AltiVec.\n";
107 
108     if (have_vsx) initial_features.push_back(Target::VSX);
109     if (arch_2_07) initial_features.push_back(Target::POWER_ARCH_2_07);
110 #else
111     Target::Arch arch = Target::X86;
112 
113     int info[4];
114     cpuid(info, 1, 0);
115     bool have_sse41 = (info[2] & (1 << 19)) != 0;
116     bool have_sse2 = (info[3] & (1 << 26)) != 0;
117     bool have_avx = (info[2] & (1 << 28)) != 0;
118     bool have_f16c = (info[2] & (1 << 29)) != 0;
119     bool have_rdrand = (info[2] & (1 << 30)) != 0;
120     bool have_fma = (info[2] & (1 << 12)) != 0;
121 
122     user_assert(have_sse2)
123         << "The x86 backend assumes at least sse2 support. This machine does not appear to have sse2.\n"
124         << "cpuid returned: "
125         << std::hex << info[0]
126         << ", " << info[1]
127         << ", " << info[2]
128         << ", " << info[3]
129         << std::dec << "\n";
130 
131     if (have_sse41) initial_features.push_back(Target::SSE41);
132     if (have_avx) initial_features.push_back(Target::AVX);
133     if (have_f16c) initial_features.push_back(Target::F16C);
134     if (have_fma) initial_features.push_back(Target::FMA);
135 
136     if (use_64_bits && have_avx && have_f16c && have_rdrand) {
137         // So far, so good.  AVX2/512?
138         // Call cpuid with eax=7, ecx=0
139         int info2[4];
140         cpuid(info2, 7, 0);
141         const uint32_t avx2 = 1U << 5;
142         const uint32_t avx512f = 1U << 16;
143         const uint32_t avx512dq = 1U << 17;
144         const uint32_t avx512pf = 1U << 26;
145         const uint32_t avx512er = 1U << 27;
146         const uint32_t avx512cd = 1U << 28;
147         const uint32_t avx512bw = 1U << 30;
148         const uint32_t avx512vl = 1U << 31;
149         const uint32_t avx512ifma = 1U << 21;
150         const uint32_t avx512 = avx512f | avx512cd;
151         const uint32_t avx512_knl = avx512 | avx512pf | avx512er;
152         const uint32_t avx512_skylake = avx512 | avx512vl | avx512bw | avx512dq;
153         const uint32_t avx512_cannonlake = avx512_skylake | avx512ifma;  // Assume ifma => vbmi
154         if ((info2[1] & avx2) == avx2) {
155             initial_features.push_back(Target::AVX2);
156         }
157         if ((info2[1] & avx512) == avx512) {
158             initial_features.push_back(Target::AVX512);
159             if ((info2[1] & avx512_knl) == avx512_knl) {
160                 initial_features.push_back(Target::AVX512_KNL);
161             }
162             if ((info2[1] & avx512_skylake) == avx512_skylake) {
163                 initial_features.push_back(Target::AVX512_Skylake);
164             }
165             if ((info2[1] & avx512_cannonlake) == avx512_cannonlake) {
166                 initial_features.push_back(Target::AVX512_Cannonlake);
167             }
168         }
169     }
170 #endif
171 #endif
172 #endif
173 #endif
174 
175     return {os, arch, bits, initial_features};
176 }
177 
is_using_hexagon(const Target & t)178 bool is_using_hexagon(const Target &t) {
179     return (t.has_feature(Target::HVX_64) ||
180             t.has_feature(Target::HVX_128) ||
181             t.has_feature(Target::HVX_v62) ||
182             t.has_feature(Target::HVX_v65) ||
183             t.has_feature(Target::HVX_v66) ||
184             t.has_feature(Target::HexagonDma) ||
185             t.has_feature(Target::HVX_shared_object) ||
186             t.arch == Target::Hexagon);
187 }
188 
get_hvx_lower_bound(const Target & t)189 int get_hvx_lower_bound(const Target &t) {
190     if (!is_using_hexagon(t)) {
191         return -1;
192     }
193     if (t.has_feature(Target::HVX_v62)) {
194         return 62;
195     }
196     if (t.has_feature(Target::HVX_v65)) {
197         return 65;
198     }
199     if (t.has_feature(Target::HVX_v66)) {
200         return 66;
201     }
202     return 60;
203 }
204 
205 }  // namespace
206 
get_host_target()207 Target get_host_target() {
208     // Calculating the host target isn't slow but it isn't free,
209     // and it's pointless to recalculate it every time we (e.g.) parse
210     // an arbitrary Target string. It won't ever change, so cache on first
211     // use.
212     static Target host_target = calculate_host_target();
213     return host_target;
214 }
215 
216 namespace {
217 
calculate_host_cuda_capability(Target t)218 Target::Feature calculate_host_cuda_capability(Target t) {
219     const auto *interface = get_device_interface_for_device_api(DeviceAPI::CUDA, t);
220     internal_assert(interface->compute_capability);
221     int major, minor;
222     int err = interface->compute_capability(nullptr, &major, &minor);
223     internal_assert(err == 0) << "Failed to query cuda compute capability\n";
224     int ver = major * 10 + minor;
225     if (ver < 30) {
226         return Target::FeatureEnd;
227     } else if (ver < 32) {
228         return Target::CUDACapability30;
229     } else if (ver < 35) {
230         return Target::CUDACapability32;
231     } else if (ver < 50) {
232         return Target::CUDACapability35;
233     } else if (ver < 61) {
234         return Target::CUDACapability50;
235     } else if (ver < 70) {
236         return Target::CUDACapability61;
237     } else if (ver < 75) {
238         return Target::CUDACapability70;
239     } else if (ver < 80) {
240         return Target::CUDACapability75;
241     } else {
242         return Target::CUDACapability80;
243     }
244 }
245 
get_host_cuda_capability(Target t)246 Target::Feature get_host_cuda_capability(Target t) {
247     static Target::Feature cap = calculate_host_cuda_capability(t);
248     return cap;
249 }
250 
// Maps each OS token accepted in a target string to its Target::OS value.
// Searched by lookup_os() when parsing, and iterated by Target::to_string()
// and bad_target_string() to turn values back into names.
const std::map<std::string, Target::OS> os_name_map = {
    {"os_unknown", Target::OSUnknown},
    {"linux", Target::Linux},
    {"windows", Target::Windows},
    {"osx", Target::OSX},
    {"android", Target::Android},
    {"ios", Target::IOS},
    {"qurt", Target::QuRT},
    {"noos", Target::NoOS},
    {"fuchsia", Target::Fuchsia},
    {"wasmrt", Target::WebAssemblyRuntime}};
262 
lookup_os(const std::string & tok,Target::OS & result)263 bool lookup_os(const std::string &tok, Target::OS &result) {
264     auto os_iter = os_name_map.find(tok);
265     if (os_iter != os_name_map.end()) {
266         result = os_iter->second;
267         return true;
268     }
269     return false;
270 }
271 
// Maps each architecture token accepted in a target string to its
// Target::Arch value. Searched by lookup_arch() when parsing, and iterated by
// Target::to_string() and bad_target_string() to turn values back into names.
const std::map<std::string, Target::Arch> arch_name_map = {
    {"arch_unknown", Target::ArchUnknown},
    {"x86", Target::X86},
    {"arm", Target::ARM},
    {"mips", Target::MIPS},
    {"powerpc", Target::POWERPC},
    {"hexagon", Target::Hexagon},
    {"wasm", Target::WebAssembly},
    {"riscv", Target::RISCV},
};
282 
lookup_arch(const std::string & tok,Target::Arch & result)283 bool lookup_arch(const std::string &tok, Target::Arch &result) {
284     auto arch_iter = arch_name_map.find(tok);
285     if (arch_iter != arch_name_map.end()) {
286         result = arch_iter->second;
287         return true;
288     }
289     return false;
290 }
291 
// Maps each feature token accepted in a target string to its Target::Feature
// value. Searched by lookup_feature() when parsing, and iterated by
// Target::to_string(), Target::feature_to_name() and bad_target_string().
//
// std::map iterates in key order, so Target::to_string() emits features
// alphabetically by name; its "trace_all" substitution relies on
// "trace_loads", "trace_realizations" and "trace_stores" ending up adjacent.
const std::map<std::string, Target::Feature> feature_name_map = {
    {"jit", Target::JIT},
    {"debug", Target::Debug},
    {"no_asserts", Target::NoAsserts},
    {"no_bounds_query", Target::NoBoundsQuery},
    {"sse41", Target::SSE41},
    {"avx", Target::AVX},
    {"avx2", Target::AVX2},
    {"fma", Target::FMA},
    {"fma4", Target::FMA4},
    {"f16c", Target::F16C},
    {"armv7s", Target::ARMv7s},
    {"no_neon", Target::NoNEON},
    {"vsx", Target::VSX},
    {"power_arch_2_07", Target::POWER_ARCH_2_07},
    {"cuda", Target::CUDA},
    {"cuda_capability_30", Target::CUDACapability30},
    {"cuda_capability_32", Target::CUDACapability32},
    {"cuda_capability_35", Target::CUDACapability35},
    {"cuda_capability_50", Target::CUDACapability50},
    {"cuda_capability_61", Target::CUDACapability61},
    {"cuda_capability_70", Target::CUDACapability70},
    {"cuda_capability_75", Target::CUDACapability75},
    {"cuda_capability_80", Target::CUDACapability80},
    {"opencl", Target::OpenCL},
    {"cl_doubles", Target::CLDoubles},
    {"cl_half", Target::CLHalf},
    {"cl_atomics64", Target::CLAtomics64},
    {"opengl", Target::OpenGL},
    {"openglcompute", Target::OpenGLCompute},
    {"egl", Target::EGL},
    {"user_context", Target::UserContext},
    {"matlab", Target::Matlab},
    {"profile", Target::Profile},
    {"no_runtime", Target::NoRuntime},
    {"metal", Target::Metal},
    {"c_plus_plus_name_mangling", Target::CPlusPlusMangling},
    {"large_buffers", Target::LargeBuffers},
    {"hvx_64", Target::HVX_64},
    {"hvx_128", Target::HVX_128},
    {"hvx_v62", Target::HVX_v62},
    {"hvx_v65", Target::HVX_v65},
    {"hvx_v66", Target::HVX_v66},
    {"hvx_shared_object", Target::HVX_shared_object},
    {"fuzz_float_stores", Target::FuzzFloatStores},
    {"soft_float_abi", Target::SoftFloatABI},
    {"msan", Target::MSAN},
    {"avx512", Target::AVX512},
    {"avx512_knl", Target::AVX512_KNL},
    {"avx512_skylake", Target::AVX512_Skylake},
    {"avx512_cannonlake", Target::AVX512_Cannonlake},
    {"trace_loads", Target::TraceLoads},
    {"trace_stores", Target::TraceStores},
    {"trace_realizations", Target::TraceRealizations},
    {"trace_pipeline", Target::TracePipeline},
    {"d3d12compute", Target::D3D12Compute},
    {"strict_float", Target::StrictFloat},
    {"tsan", Target::TSAN},
    {"asan", Target::ASAN},
    {"check_unsafe_promises", Target::CheckUnsafePromises},
    {"hexagon_dma", Target::HexagonDma},
    {"embed_bitcode", Target::EmbedBitcode},
    {"disable_llvm_loop_opt", Target::DisableLLVMLoopOpt},
    {"enable_llvm_loop_opt", Target::EnableLLVMLoopOpt},
    {"wasm_simd128", Target::WasmSimd128},
    {"wasm_signext", Target::WasmSignExt},
    {"wasm_sat_float_to_int", Target::WasmSatFloatToInt},
    {"sve", Target::SVE},
    {"sve2", Target::SVE2},
    {"arm_dot_prod", Target::ARMDotProd},
    // NOTE: When adding features to this map, be sure to update PyEnums.cpp as well.
};
364 
lookup_feature(const std::string & tok,Target::Feature & result)365 bool lookup_feature(const std::string &tok, Target::Feature &result) {
366     auto feature_iter = feature_name_map.find(tok);
367     if (feature_iter != feature_name_map.end()) {
368         result = feature_iter->second;
369         return true;
370     }
371     return false;
372 }
373 
374 }  // End anonymous namespace
375 
get_target_from_environment()376 Target get_target_from_environment() {
377     string target = Internal::get_env_variable("HL_TARGET");
378     if (target.empty()) {
379         return get_host_target();
380     } else {
381         return Target(target);
382     }
383 }
384 
get_jit_target_from_environment()385 Target get_jit_target_from_environment() {
386     Target host = get_host_target();
387     host.set_feature(Target::JIT);
388 #if defined(__has_feature)
389 #if __has_feature(address_sanitizer)
390     host.set_feature(Target::ASAN);
391 #endif
392 #if __has_feature(memory_sanitizer)
393     host.set_feature(Target::MSAN);
394 #endif
395 #if __has_feature(thread_sanitizer)
396     host.set_feature(Target::TSAN);
397 #endif
398 #endif
399     string target = Internal::get_env_variable("HL_JIT_TARGET");
400     if (target.empty()) {
401         return host;
402     } else {
403         Target t(target);
404         t.set_feature(Target::JIT);
405         user_assert((t.os == host.os && t.arch == host.arch && t.bits == host.bits) || Internal::WasmModule::can_jit_target(t))
406             << "HL_JIT_TARGET must match the host OS, architecture, and bit width.\n"
407             << "HL_JIT_TARGET was " << target << ". "
408             << "Host is " << host.to_string() << ".\n";
409         return t;
410     }
411 }
412 
413 namespace {
merge_string(Target & t,const std::string & target)414 bool merge_string(Target &t, const std::string &target) {
415     string rest = target;
416     vector<string> tokens;
417     size_t first_dash;
418     while ((first_dash = rest.find('-')) != string::npos) {
419         //Internal::debug(0) << first_dash << ", " << rest << "\n";
420         tokens.push_back(rest.substr(0, first_dash));
421         rest = rest.substr(first_dash + 1);
422     }
423     tokens.push_back(rest);
424 
425     bool os_specified = false, arch_specified = false, bits_specified = false, features_specified = false;
426     bool is_host = false;
427 
428     for (size_t i = 0; i < tokens.size(); i++) {
429         const string &tok = tokens[i];
430         Target::Feature feature;
431 
432         if (tok == "host") {
433             if (i > 0) {
434                 // "host" is now only allowed as the first token.
435                 return false;
436             }
437             is_host = true;
438             t = get_host_target();
439         } else if (tok == "32" || tok == "64" || tok == "0") {
440             if (bits_specified) {
441                 return false;
442             }
443             bits_specified = true;
444             t.bits = std::stoi(tok);
445         } else if (lookup_arch(tok, t.arch)) {
446             if (arch_specified) {
447                 return false;
448             }
449             arch_specified = true;
450         } else if (lookup_os(tok, t.os)) {
451             if (os_specified) {
452                 return false;
453             }
454             os_specified = true;
455         } else if (lookup_feature(tok, feature)) {
456             t.set_feature(feature);
457             features_specified = true;
458         } else if (tok == "trace_all") {
459             t.set_features({Target::TraceLoads, Target::TraceStores, Target::TraceRealizations});
460             features_specified = true;
461         } else {
462             return false;
463         }
464     }
465 
466     if (is_host &&
467         t.has_feature(Target::CUDA) &&
468         !t.has_feature(Target::CUDACapability30) &&
469         !t.has_feature(Target::CUDACapability32) &&
470         !t.has_feature(Target::CUDACapability35) &&
471         !t.has_feature(Target::CUDACapability50) &&
472         !t.has_feature(Target::CUDACapability61) &&
473         !t.has_feature(Target::CUDACapability70) &&
474         !t.has_feature(Target::CUDACapability75) &&
475         !t.has_feature(Target::CUDACapability80)) {
476         // Detect host cuda capability
477         t.set_feature(get_host_cuda_capability(t));
478     }
479 
480     if (arch_specified && !bits_specified) {
481         return false;
482     }
483 
484     if (bits_specified && t.bits == 0) {
485         // bits == 0 is allowed iff arch and os are "unknown" and no features are set,
486         // to allow for roundtripping the string for default Target() ctor.
487         if (!(arch_specified && t.arch == Target::ArchUnknown) ||
488             !(os_specified && t.os == Target::OSUnknown) ||
489             features_specified) {
490             return false;
491         }
492     }
493 
494     return true;
495 }
496 
bad_target_string(const std::string & target)497 void bad_target_string(const std::string &target) {
498     const char *separator = "";
499     std::string architectures;
500     for (const auto &arch_entry : arch_name_map) {
501         architectures += separator + arch_entry.first;
502         separator = ", ";
503     }
504     separator = "";
505     std::string oses;
506     for (const auto &os_entry : os_name_map) {
507         oses += separator + os_entry.first;
508         separator = ", ";
509     }
510     separator = "";
511     // Format the features to go one feature over 70 characters per line,
512     // assume the first line starts with "Features are ".
513     int line_char_start = -(int)sizeof("Features are");
514     std::string features;
515     for (const auto &feature_entry : feature_name_map) {
516         features += separator + feature_entry.first;
517         if (features.length() - line_char_start > 70) {
518             separator = "\n";
519             line_char_start = features.length();
520         } else {
521             separator = ", ";
522         }
523     }
524     user_error << "Did not understand Halide target " << target << "\n"
525                << "Expected format is arch-bits-os-feature1-feature2-...\n"
526                << "Where arch is: " << architectures << ".\n"
527                << "bits is either 32 or 64.\n"
528                << "os is: " << oses << ".\n"
529                << "\n"
530                << "If arch, bits, or os are omitted, they default to the host.\n"
531                << "\n"
532                << "Features are: " << features << ".\n"
533                << "\n"
534                << "The target can also begin with \"host\", which sets the "
535                << "host's architecture, os, and feature set, with the "
536                << "exception of the GPU runtimes, which default to off.\n"
537                << "\n"
538                << "On this platform, the host target is: " << get_host_target().to_string() << "\n";
539 }
540 
541 }  // namespace
542 
Target(const std::string & target)543 Target::Target(const std::string &target)
544     : os(OSUnknown), arch(ArchUnknown), bits(0) {
545     Target host = get_host_target();
546 
547     if (target.empty()) {
548         // If nothing is specified, use the full host target.
549         *this = host;
550     } else {
551         if (!merge_string(*this, target) || has_unknowns()) {
552             bad_target_string(target);
553         }
554     }
555 }
556 
// Convenience overload: converts the C string `s` to a std::string and
// delegates to the std::string constructor, so the same parsing and error
// behaviour applies.
Target::Target(const char *s)
    : Target(std::string(s)) {
}
560 
validate_target_string(const std::string & s)561 bool Target::validate_target_string(const std::string &s) {
562     Target t;
563     return merge_string(t, s) && !t.has_unknowns();
564 }
565 
feature_to_name(Target::Feature feature)566 std::string Target::feature_to_name(Target::Feature feature) {
567     for (const auto &feature_entry : feature_name_map) {
568         if (feature == feature_entry.second) {
569             return feature_entry.first;
570         }
571     }
572     internal_error;
573     return "";
574 }
575 
feature_from_name(const std::string & name)576 Target::Feature Target::feature_from_name(const std::string &name) {
577     Target::Feature feature;
578     if (lookup_feature(name, feature)) {
579         return feature;
580     }
581     return Target::FeatureEnd;
582 }
583 
to_string() const584 std::string Target::to_string() const {
585     string result;
586     for (const auto &arch_entry : arch_name_map) {
587         if (arch_entry.second == arch) {
588             result += arch_entry.first;
589             break;
590         }
591     }
592     result += "-" + std::to_string(bits);
593     for (const auto &os_entry : os_name_map) {
594         if (os_entry.second == os) {
595             result += "-" + os_entry.first;
596             break;
597         }
598     }
599     for (const auto &feature_entry : feature_name_map) {
600         if (has_feature(feature_entry.second)) {
601             result += "-" + feature_entry.first;
602         }
603     }
604     // Use has_feature() multiple times (rather than features_any_of())
605     // to avoid constructing a temporary vector for this rather-common call.
606     if (has_feature(Target::TraceLoads) && has_feature(Target::TraceStores) && has_feature(Target::TraceRealizations)) {
607         result = Internal::replace_all(result, "trace_loads-trace_realizations-trace_stores", "trace_all");
608     }
609     return result;
610 }
611 
612 /** Was libHalide compiled with support for this target? */
supported() const613 bool Target::supported() const {
614     bool bad = false;
615 #if !defined(WITH_ARM)
616     bad |= arch == Target::ARM && bits == 32;
617 #endif
618 #if !defined(WITH_AARCH64)
619     bad |= arch == Target::ARM && bits == 64;
620 #endif
621 #if !defined(WITH_X86)
622     bad |= arch == Target::X86;
623 #endif
624 #if !defined(WITH_MIPS)
625     bad |= arch == Target::MIPS;
626 #endif
627 #if !defined(WITH_POWERPC)
628     bad |= arch == Target::POWERPC;
629 #endif
630 #if !defined(WITH_HEXAGON)
631     bad |= arch == Target::Hexagon;
632 #endif
633 #if !defined(WITH_WEBASSEMBLY)
634     bad |= arch == Target::WebAssembly;
635 #endif
636 #if !defined(WITH_RISCV)
637     bad |= arch == Target::RISCV;
638 #endif
639 #if !defined(WITH_NVPTX)
640     bad |= has_feature(Target::CUDA);
641 #endif
642 #if !defined(WITH_OPENCL)
643     bad |= has_feature(Target::OpenCL);
644 #endif
645 #if !defined(WITH_METAL)
646     bad |= has_feature(Target::Metal);
647 #endif
648 #if !defined(WITH_OPENGL)
649     bad |= has_feature(Target::OpenGL) || has_feature(Target::OpenGLCompute);
650 #endif
651 #if !defined(WITH_D3D12)
652     bad |= has_feature(Target::D3D12Compute);
653 #endif
654     return !bad;
655 }
656 
has_unknowns() const657 bool Target::has_unknowns() const {
658     return os == OSUnknown || arch == ArchUnknown || bits == 0;
659 }
660 
set_feature(Feature f,bool value)661 void Target::set_feature(Feature f, bool value) {
662     if (f == FeatureEnd) return;
663     user_assert(f < FeatureEnd) << "Invalid Target feature.\n";
664     features.set(f, value);
665 }
666 
set_features(const std::vector<Feature> & features_to_set,bool value)667 void Target::set_features(const std::vector<Feature> &features_to_set, bool value) {
668     for (Feature f : features_to_set) {
669         set_feature(f, value);
670     }
671 }
672 
has_feature(Feature f) const673 bool Target::has_feature(Feature f) const {
674     if (f == FeatureEnd) return true;
675     user_assert(f < FeatureEnd) << "Invalid Target feature.\n";
676     return features[f];
677 }
678 
features_any_of(const std::vector<Feature> & test_features) const679 bool Target::features_any_of(const std::vector<Feature> &test_features) const {
680     for (Feature f : test_features) {
681         if (has_feature(f)) {
682             return true;
683         }
684     }
685     return false;
686 }
687 
features_all_of(const std::vector<Feature> & test_features) const688 bool Target::features_all_of(const std::vector<Feature> &test_features) const {
689     for (Feature f : test_features) {
690         if (!has_feature(f)) {
691             return false;
692         }
693     }
694     return true;
695 }
696 
with_feature(Feature f) const697 Target Target::with_feature(Feature f) const {
698     Target copy = *this;
699     copy.set_feature(f);
700     return copy;
701 }
702 
without_feature(Feature f) const703 Target Target::without_feature(Feature f) const {
704     Target copy = *this;
705     copy.set_feature(f, false);
706     return copy;
707 }
708 
has_gpu_feature() const709 bool Target::has_gpu_feature() const {
710     return (has_feature(CUDA) ||
711             has_feature(OpenCL) ||
712             has_feature(Metal) ||
713             has_feature(D3D12Compute) ||
714             has_feature(OpenGLCompute));
715 }
716 
get_cuda_capability_lower_bound() const717 int Target::get_cuda_capability_lower_bound() const {
718     if (!has_feature(Target::CUDA)) {
719         return -1;
720     }
721     if (has_feature(Target::CUDACapability30)) {
722         return 30;
723     }
724     if (has_feature(Target::CUDACapability32)) {
725         return 32;
726     }
727     if (has_feature(Target::CUDACapability35)) {
728         return 35;
729     }
730     if (has_feature(Target::CUDACapability50)) {
731         return 50;
732     }
733     if (has_feature(Target::CUDACapability61)) {
734         return 61;
735     }
736     if (has_feature(Target::CUDACapability70)) {
737         return 70;
738     }
739     if (has_feature(Target::CUDACapability75)) {
740         return 75;
741     }
742     if (has_feature(Target::CUDACapability80)) {
743         return 80;
744     }
745     return 20;
746 }
747 
supports_type(const Type & t) const748 bool Target::supports_type(const Type &t) const {
749     if (t.bits() == 64) {
750         if (t.is_float()) {
751             return !has_feature(Metal) &&
752                    !has_feature(OpenGL) &&
753                    !has_feature(OpenGLCompute) &&
754                    !has_feature(D3D12Compute) &&
755                    (!has_feature(Target::OpenCL) || has_feature(Target::CLDoubles));
756         } else {
757             return (!has_feature(Metal) &&
758                     !has_feature(OpenGLCompute) &&
759                     !has_feature(OpenGL) &&
760                     !has_feature(D3D12Compute));
761         }
762     }
763     return true;
764 }
765 
supports_type(const Type & t,DeviceAPI device) const766 bool Target::supports_type(const Type &t, DeviceAPI device) const {
767     if (device == DeviceAPI::Default_GPU) {
768         device = get_default_device_api_for_target(*this);
769     }
770 
771     if (device == DeviceAPI::Hexagon) {
772         // HVX supports doubles and long long in the scalar unit only.
773         if (t.is_float() || t.bits() == 64) {
774             return t.lanes() == 1;
775         }
776     } else if (device == DeviceAPI::Metal) {
777         // Metal spec says no double or long long.
778         if (t.bits() == 64) {
779             return false;
780         }
781     } else if (device == DeviceAPI::OpenCL) {
782         if (t.is_float() && t.bits() == 64) {
783             return has_feature(Target::CLDoubles);
784         }
785     } else if (device == DeviceAPI::D3D12Compute) {
786         // Shader Model 5.x can optionally support double-precision; 64-bit int
787         // types are not supported.
788         return t.bits() < 64;
789     } else if (device == DeviceAPI::OpenGLCompute) {
790         return t.bits() < 64;
791     }
792 
793     return true;
794 }
795 
supports_device_api(DeviceAPI api) const796 bool Target::supports_device_api(DeviceAPI api) const {
797     switch (api) {
798     case DeviceAPI::None:
799         return true;
800     case DeviceAPI::Host:
801         return true;
802     case DeviceAPI::Default_GPU:
803         return has_gpu_feature();
804     case DeviceAPI::Hexagon:
805         return has_feature(Target::HVX_64) || has_feature(Target::HVX_128);
806     case DeviceAPI::HexagonDma:
807         return has_feature(Target::HexagonDma);
808     default:
809         return has_feature(target_feature_for_device_api(api));
810     }
811 }
812 
get_required_device_api() const813 DeviceAPI Target::get_required_device_api() const {
814     if (has_feature(Target::CUDA)) return DeviceAPI::CUDA;
815     if (has_feature(Target::D3D12Compute)) return DeviceAPI::D3D12Compute;
816     if (has_feature(Target::HVX_128)) return DeviceAPI::Hexagon;
817     if (has_feature(Target::HexagonDma)) return DeviceAPI::HexagonDma;
818     if (has_feature(Target::Metal)) return DeviceAPI::Metal;
819     if (has_feature(Target::OpenCL)) return DeviceAPI::OpenCL;
820     if (has_feature(Target::OpenGL)) return DeviceAPI::GLSL;
821     if (has_feature(Target::OpenGLCompute)) return DeviceAPI::OpenGLCompute;
822     return DeviceAPI::None;
823 }
824 
target_feature_for_device_api(DeviceAPI api)825 Target::Feature target_feature_for_device_api(DeviceAPI api) {
826     switch (api) {
827     case DeviceAPI::CUDA:
828         return Target::CUDA;
829     case DeviceAPI::OpenCL:
830         return Target::OpenCL;
831     case DeviceAPI::GLSL:
832         return Target::OpenGL;
833     case DeviceAPI::OpenGLCompute:
834         return Target::OpenGLCompute;
835     case DeviceAPI::Metal:
836         return Target::Metal;
837     case DeviceAPI::Hexagon:
838         return Target::HVX_128;
839     case DeviceAPI::D3D12Compute:
840         return Target::D3D12Compute;
841     default:
842         return Target::FeatureEnd;
843     }
844 }
845 
natural_vector_size(const Halide::Type & t) const846 int Target::natural_vector_size(const Halide::Type &t) const {
847     user_assert(!has_unknowns())
848         << "natural_vector_size cannot be used on a Target with Unknown values.\n";
849 
850     const bool is_integer = t.is_int() || t.is_uint();
851     const int data_size = t.bytes();
852 
853     if (arch == Target::Hexagon) {
854         if (is_integer) {
855             // HVX is either 64 or 128 *byte* vector size.
856             if (has_feature(Halide::Target::HVX_128)) {
857                 return 128 / data_size;
858             } else if (has_feature(Halide::Target::HVX_64)) {
859                 return 64 / data_size;
860             } else {
861                 user_error << "Target uses hexagon arch without hvx_128 or hvx_64 set.\n";
862                 return 0;
863             }
864         } else {
865             // HVX does not have vector float instructions.
866             return 1;
867         }
868     } else if (arch == Target::X86) {
869         if (is_integer && (has_feature(Halide::Target::AVX512_Skylake) ||
870                            has_feature(Halide::Target::AVX512_Cannonlake))) {
871             // AVX512BW exists on Skylake and Cannonlake
872             return 64 / data_size;
873         } else if (t.is_float() && (has_feature(Halide::Target::AVX512) ||
874                                     has_feature(Halide::Target::AVX512_KNL) ||
875                                     has_feature(Halide::Target::AVX512_Skylake) ||
876                                     has_feature(Halide::Target::AVX512_Cannonlake))) {
877             // AVX512F is on all AVX512 architectures
878             return 64 / data_size;
879         } else if (has_feature(Halide::Target::AVX2)) {
880             // AVX2 uses 256-bit vectors for everything.
881             return 32 / data_size;
882         } else if (!is_integer && has_feature(Halide::Target::AVX)) {
883             // AVX 1 has 256-bit vectors for float, but not for
884             // integer instructions.
885             return 32 / data_size;
886         } else {
887             // SSE was all 128-bit. We ignore MMX.
888             return 16 / data_size;
889         }
890     } else if (arch == Target::WebAssembly) {
891         if (has_feature(Halide::Target::WasmSimd128)) {
892             if (t.bits() == 64) {
893                 // int64 and float64 aren't supported in simd128.
894                 return 1;
895             }
896             // 128-bit vectors for other types.
897             return 16 / data_size;
898         } else {
899             // No vectors, sorry.
900             return 1;
901         }
902     } else {
903         // Assume 128-bit vectors on other targets.
904         return 16 / data_size;
905     }
906 }
907 
get_runtime_compatible_target(const Target & other,Target & result)908 bool Target::get_runtime_compatible_target(const Target &other, Target &result) {
909     // Create mask to select features that:
910     // (a) must be included if either target has the feature (union)
911     // (b) must be included if both targets have the feature (intersection)
912     // (c) must match across both targets; it is an error if one target has the feature and the other doesn't
913     const std::array<Feature, 18> union_features = {{// These are true union features.
914                                                      CUDA, OpenCL, OpenGL, OpenGLCompute, Metal, D3D12Compute, NoNEON,
915 
916                                                      // These features are actually intersection-y, but because targets only record the _highest_,
917                                                      // we have to put their union in the result and then take a lower bound.
918                                                      CUDACapability30, CUDACapability32, CUDACapability35, CUDACapability50, CUDACapability61, CUDACapability70, CUDACapability75, CUDACapability80,
919                                                      HVX_v62, HVX_v65, HVX_v66}};
920 
921     const std::array<Feature, 12> intersection_features = {{SSE41, AVX, AVX2, FMA, FMA4, F16C, ARMv7s, VSX, AVX512, AVX512_KNL, AVX512_Skylake, AVX512_Cannonlake}};
922 
923     const std::array<Feature, 10> matching_features = {{SoftFloatABI, Debug, TSAN, ASAN, MSAN, HVX_64, HVX_128, HexagonDma, HVX_shared_object}};
924 
925     // bitsets need to be the same width.
926     decltype(result.features) union_mask;
927     decltype(result.features) intersection_mask;
928     decltype(result.features) matching_mask;
929 
930     for (auto &feature : union_features) {
931         union_mask.set(feature);
932     }
933 
934     for (auto &feature : intersection_features) {
935         intersection_mask.set(feature);
936     }
937 
938     for (auto &feature : matching_features) {
939         matching_mask.set(feature);
940     }
941 
942     if (arch != other.arch || bits != other.bits || os != other.os) {
943         Internal::debug(1) << "runtime targets must agree on platform (arch-bits-os)\n"
944                            << "  this:  " << *this << "\n"
945                            << "  other: " << other << "\n";
946         return false;
947     }
948 
949     if ((features & matching_mask) != (other.features & matching_mask)) {
950         Internal::debug(1) << "runtime targets must agree on SoftFloatABI, Debug, TSAN, ASAN, MSAN, HVX_64, HVX_128, HexagonDma, and HVX_shared_object\n"
951                            << "  this:  " << *this << "\n"
952                            << "  other: " << other << "\n";
953         return false;
954     }
955 
956     // Union of features is computed through bitwise-or, and masked away by the features we care about
957     // Intersection of features is computed through bitwise-and and masked away, too.
958     // We merge the bits via bitwise or.
959     Target output = Target{os, arch, bits};
960     output.features = ((features | other.features) & union_mask) | ((features | other.features) & matching_mask) | ((features & other.features) & intersection_mask);
961 
962     // Pick tight lower bound for CUDA capability. Use fall-through to clear redundant features
963     int cuda_a = get_cuda_capability_lower_bound();
964     int cuda_b = other.get_cuda_capability_lower_bound();
965 
966     // get_cuda_capability_lower_bound returns -1 when unused. Casting to unsigned makes this
967     // large, so min selects the true lower bound when one target doesn't specify a capability,
968     // and the other doesn't use CUDA at all.
969     int cuda_capability = std::min((unsigned)cuda_a, (unsigned)cuda_b);
970     if (cuda_capability < 30) output.features.reset(CUDACapability30);
971     if (cuda_capability < 32) output.features.reset(CUDACapability32);
972     if (cuda_capability < 35) output.features.reset(CUDACapability35);
973     if (cuda_capability < 50) output.features.reset(CUDACapability50);
974     if (cuda_capability < 61) output.features.reset(CUDACapability61);
975     if (cuda_capability < 70) output.features.reset(CUDACapability70);
976     if (cuda_capability < 75) output.features.reset(CUDACapability75);
977     if (cuda_capability < 80) output.features.reset(CUDACapability80);
978 
979     // Pick tight lower bound for HVX version. Use fall-through to clear redundant features
980     int hvx_a = get_hvx_lower_bound(*this);
981     int hvx_b = get_hvx_lower_bound(other);
982 
983     // Same trick as above for CUDA
984     int hvx_version = std::min((unsigned)hvx_a, (unsigned)hvx_b);
985     if (hvx_version < 62) output.features.reset(HVX_v62);
986     if (hvx_version < 65) output.features.reset(HVX_v65);
987     if (hvx_version < 66) output.features.reset(HVX_v66);
988 
989     result = output;
990     return true;
991 }
992 
993 namespace Internal {
994 
target_test()995 void target_test() {
996     Target t;
997     for (const auto &feature : feature_name_map) {
998         t.set_feature(feature.second);
999     }
1000     for (int i = 0; i < (int)(Target::FeatureEnd); i++) {
1001         internal_assert(t.has_feature((Target::Feature)i)) << "Feature " << i << " not in feature_names_map.\n";
1002     }
1003 
1004     // 3 targets: {A,B,C}. Want gcd(A,B)=C
1005     std::vector<std::array<std::string, 3>> gcd_tests = {
1006         {{"x86-64-linux-sse41-fma", "x86-64-linux-sse41-fma", "x86-64-linux-sse41-fma"}},
1007         {{"x86-64-linux-sse41-fma-no_asserts-no_runtime", "x86-64-linux-sse41-fma", "x86-64-linux-sse41-fma"}},
1008         {{"x86-64-linux-avx2-sse41", "x86-64-linux-sse41-fma", "x86-64-linux-sse41"}},
1009         {{"x86-64-linux-avx2-sse41", "x86-32-linux-sse41-fma", ""}},
1010         {{"x86-64-linux-cuda", "x86-64-linux", "x86-64-linux-cuda"}},
1011         {{"x86-64-linux-cuda-cuda_capability_50", "x86-64-linux-cuda", "x86-64-linux-cuda"}},
1012         {{"x86-64-linux-cuda-cuda_capability_50", "x86-64-linux-cuda-cuda_capability_30", "x86-64-linux-cuda-cuda_capability_30"}},
1013         {{"x86-64-linux-cuda", "x86-64-linux-opengl", "x86-64-linux-cuda-opengl"}},
1014         {{"hexagon-32-qurt-hvx_v65", "hexagon-32-qurt-hvx_v62", "hexagon-32-qurt-hvx_v62"}},
1015         {{"hexagon-32-qurt-hvx_v62", "hexagon-32-qurt", "hexagon-32-qurt"}},
1016         {{"hexagon-32-qurt-hvx_v62-hvx_64", "hexagon-32-qurt", ""}},
1017         {{"hexagon-32-qurt-hvx_v62-hvx_64", "hexagon-32-qurt-hvx_64", "hexagon-32-qurt-hvx_64"}},
1018     };
1019 
1020     for (const auto &test : gcd_tests) {
1021         Target result{};
1022         Target a{test[0]};
1023         Target b{test[1]};
1024         if (a.get_runtime_compatible_target(b, result)) {
1025             internal_assert(!test[2].empty() && result == Target{test[2]})
1026                 << "Targets " << a.to_string() << " and " << b.to_string() << " were computed to have gcd "
1027                 << result.to_string() << " but expected '" << test[2] << "'\n";
1028         } else {
1029             internal_assert(test[2].empty())
1030                 << "Targets " << a.to_string() << " and " << b.to_string() << " were computed to have no gcd "
1031                 << "but " << test[2] << " was expected.";
1032         }
1033     }
1034 
1035     std::cout << "Target test passed" << std::endl;
1036 }
1037 
1038 }  // namespace Internal
1039 
1040 }  // namespace Halide
1041