1 #include <array>
2 #include <iostream>
3 #include <string>
4
5 #include "Target.h"
6
7 #include "Debug.h"
8 #include "DeviceInterface.h"
9 #include "Error.h"
10 #include "Util.h"
11 #include "WasmExecutor.h"
12
13 #if defined(__powerpc__) && (defined(__FreeBSD__) || defined(__linux__))
14 #if defined(__FreeBSD__)
15 #include <machine/cpu.h>
16 #include <sys/elf_common.h>
17 #endif
18 // This uses elf.h and must be included after "LLVM_Headers.h", which
19 // uses llvm/support/Elf.h.
20 #include <sys/auxv.h>
21 #endif
22
23 #ifdef _MSC_VER
24 #include <intrin.h>
25 #endif // _MSC_VER
26
27 namespace Halide {
28
29 using std::string;
30 using std::vector;
31
32 namespace {
33
// cpuid(info, infoType, extra): runs the x86 CPUID instruction with
// EAX = infoType and ECX = extra, and stores the resulting EAX, EBX, ECX
// and EDX registers in info[0], info[1], info[2] and info[3].
//
// The helper is only defined for MSVC builds and for x86 / x86-64 builds
// with compilers that understand GCC-style inline assembly.
#if defined(_MSC_VER)

static void cpuid(int info[4], int infoType, int extra) {
    // MSVC has no inline asm on x64; use the compiler intrinsic instead.
    __cpuidex(info, infoType, extra);
}

#elif defined(__x86_64__) || defined(__i386__)

// CPU feature detection code taken from ispc
// (https://github.com/ispc/ispc/blob/master/builtins/dispatch.ll)
static void cpuid(int info[4], int infoType, int extra) {
#if defined(_LP64)
    __asm__ __volatile__(
        "cpuid \n\t"
        : "=a"(info[0]), "=b"(info[1]), "=c"(info[2]), "=d"(info[3])
        : "0"(infoType), "2"(extra));
#else
    // We save %ebx in case it's the PIC register: the EBX result is
    // routed through a scratch register and %ebx is restored afterwards.
    __asm__ __volatile__(
        "mov{l}\t{%%}ebx, %1 \n\t"
        "cpuid \n\t"
        "xchg{l}\t{%%}ebx, %1 \n\t"
        : "=a"(info[0]), "=r"(info[1]), "=c"(info[2]), "=d"(info[3])
        : "0"(infoType), "2"(extra));
#endif
}

#endif
64
calculate_host_target()65 Target calculate_host_target() {
66 Target::OS os = Target::OSUnknown;
67 #ifdef __linux__
68 os = Target::Linux;
69 #endif
70 #ifdef _WIN32
71 os = Target::Windows;
72 #endif
73 #ifdef __APPLE__
74 os = Target::OSX;
75 #endif
76
77 bool use_64_bits = (sizeof(size_t) == 8);
78 int bits = use_64_bits ? 64 : 32;
79 std::vector<Target::Feature> initial_features;
80
81 #if __riscv__
82 Target::Arch arch = Target::RISCV;
83 #else
84 #if __mips__ || __mips || __MIPS__
85 Target::Arch arch = Target::MIPS;
86 #else
87 #if defined(__arm__) || defined(__aarch64__)
88 Target::Arch arch = Target::ARM;
89 #else
90 #if defined(__powerpc__) && (defined(__FreeBSD__) || defined(__linux__))
91 Target::Arch arch = Target::POWERPC;
92
93 #if defined(__linux__)
94 unsigned long hwcap = getauxval(AT_HWCAP);
95 unsigned long hwcap2 = getauxval(AT_HWCAP2);
96 #elif defined(__FreeBSD__)
97 unsigned long hwcap, hwcap2;
98 elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));
99 elf_aux_info(AT_HWCAP2, &hwcap2, sizeof(hwcap2));
100 #endif
101 bool have_altivec = (hwcap & PPC_FEATURE_HAS_ALTIVEC) != 0;
102 bool have_vsx = (hwcap & PPC_FEATURE_HAS_VSX) != 0;
103 bool arch_2_07 = (hwcap2 & PPC_FEATURE2_ARCH_2_07) != 0;
104
105 user_assert(have_altivec)
106 << "The POWERPC backend assumes at least AltiVec support. This machine does not appear to have AltiVec.\n";
107
108 if (have_vsx) initial_features.push_back(Target::VSX);
109 if (arch_2_07) initial_features.push_back(Target::POWER_ARCH_2_07);
110 #else
111 Target::Arch arch = Target::X86;
112
113 int info[4];
114 cpuid(info, 1, 0);
115 bool have_sse41 = (info[2] & (1 << 19)) != 0;
116 bool have_sse2 = (info[3] & (1 << 26)) != 0;
117 bool have_avx = (info[2] & (1 << 28)) != 0;
118 bool have_f16c = (info[2] & (1 << 29)) != 0;
119 bool have_rdrand = (info[2] & (1 << 30)) != 0;
120 bool have_fma = (info[2] & (1 << 12)) != 0;
121
122 user_assert(have_sse2)
123 << "The x86 backend assumes at least sse2 support. This machine does not appear to have sse2.\n"
124 << "cpuid returned: "
125 << std::hex << info[0]
126 << ", " << info[1]
127 << ", " << info[2]
128 << ", " << info[3]
129 << std::dec << "\n";
130
131 if (have_sse41) initial_features.push_back(Target::SSE41);
132 if (have_avx) initial_features.push_back(Target::AVX);
133 if (have_f16c) initial_features.push_back(Target::F16C);
134 if (have_fma) initial_features.push_back(Target::FMA);
135
136 if (use_64_bits && have_avx && have_f16c && have_rdrand) {
137 // So far, so good. AVX2/512?
138 // Call cpuid with eax=7, ecx=0
139 int info2[4];
140 cpuid(info2, 7, 0);
141 const uint32_t avx2 = 1U << 5;
142 const uint32_t avx512f = 1U << 16;
143 const uint32_t avx512dq = 1U << 17;
144 const uint32_t avx512pf = 1U << 26;
145 const uint32_t avx512er = 1U << 27;
146 const uint32_t avx512cd = 1U << 28;
147 const uint32_t avx512bw = 1U << 30;
148 const uint32_t avx512vl = 1U << 31;
149 const uint32_t avx512ifma = 1U << 21;
150 const uint32_t avx512 = avx512f | avx512cd;
151 const uint32_t avx512_knl = avx512 | avx512pf | avx512er;
152 const uint32_t avx512_skylake = avx512 | avx512vl | avx512bw | avx512dq;
153 const uint32_t avx512_cannonlake = avx512_skylake | avx512ifma; // Assume ifma => vbmi
154 if ((info2[1] & avx2) == avx2) {
155 initial_features.push_back(Target::AVX2);
156 }
157 if ((info2[1] & avx512) == avx512) {
158 initial_features.push_back(Target::AVX512);
159 if ((info2[1] & avx512_knl) == avx512_knl) {
160 initial_features.push_back(Target::AVX512_KNL);
161 }
162 if ((info2[1] & avx512_skylake) == avx512_skylake) {
163 initial_features.push_back(Target::AVX512_Skylake);
164 }
165 if ((info2[1] & avx512_cannonlake) == avx512_cannonlake) {
166 initial_features.push_back(Target::AVX512_Cannonlake);
167 }
168 }
169 }
170 #endif
171 #endif
172 #endif
173 #endif
174
175 return {os, arch, bits, initial_features};
176 }
177
is_using_hexagon(const Target & t)178 bool is_using_hexagon(const Target &t) {
179 return (t.has_feature(Target::HVX_64) ||
180 t.has_feature(Target::HVX_128) ||
181 t.has_feature(Target::HVX_v62) ||
182 t.has_feature(Target::HVX_v65) ||
183 t.has_feature(Target::HVX_v66) ||
184 t.has_feature(Target::HexagonDma) ||
185 t.has_feature(Target::HVX_shared_object) ||
186 t.arch == Target::Hexagon);
187 }
188
get_hvx_lower_bound(const Target & t)189 int get_hvx_lower_bound(const Target &t) {
190 if (!is_using_hexagon(t)) {
191 return -1;
192 }
193 if (t.has_feature(Target::HVX_v62)) {
194 return 62;
195 }
196 if (t.has_feature(Target::HVX_v65)) {
197 return 65;
198 }
199 if (t.has_feature(Target::HVX_v66)) {
200 return 66;
201 }
202 return 60;
203 }
204
205 } // namespace
206
get_host_target()207 Target get_host_target() {
208 // Calculating the host target isn't slow but it isn't free,
209 // and it's pointless to recalculate it every time we (e.g.) parse
210 // an arbitrary Target string. It won't ever change, so cache on first
211 // use.
212 static Target host_target = calculate_host_target();
213 return host_target;
214 }
215
216 namespace {
217
calculate_host_cuda_capability(Target t)218 Target::Feature calculate_host_cuda_capability(Target t) {
219 const auto *interface = get_device_interface_for_device_api(DeviceAPI::CUDA, t);
220 internal_assert(interface->compute_capability);
221 int major, minor;
222 int err = interface->compute_capability(nullptr, &major, &minor);
223 internal_assert(err == 0) << "Failed to query cuda compute capability\n";
224 int ver = major * 10 + minor;
225 if (ver < 30) {
226 return Target::FeatureEnd;
227 } else if (ver < 32) {
228 return Target::CUDACapability30;
229 } else if (ver < 35) {
230 return Target::CUDACapability32;
231 } else if (ver < 50) {
232 return Target::CUDACapability35;
233 } else if (ver < 61) {
234 return Target::CUDACapability50;
235 } else if (ver < 70) {
236 return Target::CUDACapability61;
237 } else if (ver < 75) {
238 return Target::CUDACapability70;
239 } else if (ver < 80) {
240 return Target::CUDACapability75;
241 } else {
242 return Target::CUDACapability80;
243 }
244 }
245
get_host_cuda_capability(Target t)246 Target::Feature get_host_cuda_capability(Target t) {
247 static Target::Feature cap = calculate_host_cuda_capability(t);
248 return cap;
249 }
250
// Maps the OS token used in target strings to its Target::OS value.
// Used for parsing (lookup_os), for printing (Target::to_string) and to
// list the valid OS names in the bad-target error message.
const std::map<std::string, Target::OS> os_name_map = {
    {"os_unknown", Target::OSUnknown},
    {"linux", Target::Linux},
    {"windows", Target::Windows},
    {"osx", Target::OSX},
    {"android", Target::Android},
    {"ios", Target::IOS},
    {"qurt", Target::QuRT},
    {"noos", Target::NoOS},
    {"fuchsia", Target::Fuchsia},
    {"wasmrt", Target::WebAssemblyRuntime}};
262
lookup_os(const std::string & tok,Target::OS & result)263 bool lookup_os(const std::string &tok, Target::OS &result) {
264 auto os_iter = os_name_map.find(tok);
265 if (os_iter != os_name_map.end()) {
266 result = os_iter->second;
267 return true;
268 }
269 return false;
270 }
271
// Maps the architecture token used in target strings to its Target::Arch
// value. Used for parsing (lookup_arch), for printing (Target::to_string)
// and to list the valid architectures in the bad-target error message.
const std::map<std::string, Target::Arch> arch_name_map = {
    {"arch_unknown", Target::ArchUnknown},
    {"x86", Target::X86},
    {"arm", Target::ARM},
    {"mips", Target::MIPS},
    {"powerpc", Target::POWERPC},
    {"hexagon", Target::Hexagon},
    {"wasm", Target::WebAssembly},
    {"riscv", Target::RISCV},
};
282
lookup_arch(const std::string & tok,Target::Arch & result)283 bool lookup_arch(const std::string &tok, Target::Arch &result) {
284 auto arch_iter = arch_name_map.find(tok);
285 if (arch_iter != arch_name_map.end()) {
286 result = arch_iter->second;
287 return true;
288 }
289 return false;
290 }
291
// Maps the feature token used in target strings to its Target::Feature
// value. Used for parsing (lookup_feature), for name lookup
// (Target::feature_to_name) and for printing (Target::to_string).
//
// std::map iterates in key order, so Target::to_string() emits feature
// names alphabetically; its "trace_all" substitution relies on
// trace_loads, trace_realizations and trace_stores coming out in that
// order.
const std::map<std::string, Target::Feature> feature_name_map = {
    {"jit", Target::JIT},
    {"debug", Target::Debug},
    {"no_asserts", Target::NoAsserts},
    {"no_bounds_query", Target::NoBoundsQuery},
    {"sse41", Target::SSE41},
    {"avx", Target::AVX},
    {"avx2", Target::AVX2},
    {"fma", Target::FMA},
    {"fma4", Target::FMA4},
    {"f16c", Target::F16C},
    {"armv7s", Target::ARMv7s},
    {"no_neon", Target::NoNEON},
    {"vsx", Target::VSX},
    {"power_arch_2_07", Target::POWER_ARCH_2_07},
    {"cuda", Target::CUDA},
    {"cuda_capability_30", Target::CUDACapability30},
    {"cuda_capability_32", Target::CUDACapability32},
    {"cuda_capability_35", Target::CUDACapability35},
    {"cuda_capability_50", Target::CUDACapability50},
    {"cuda_capability_61", Target::CUDACapability61},
    {"cuda_capability_70", Target::CUDACapability70},
    {"cuda_capability_75", Target::CUDACapability75},
    {"cuda_capability_80", Target::CUDACapability80},
    {"opencl", Target::OpenCL},
    {"cl_doubles", Target::CLDoubles},
    {"cl_half", Target::CLHalf},
    {"cl_atomics64", Target::CLAtomics64},
    {"opengl", Target::OpenGL},
    {"openglcompute", Target::OpenGLCompute},
    {"egl", Target::EGL},
    {"user_context", Target::UserContext},
    {"matlab", Target::Matlab},
    {"profile", Target::Profile},
    {"no_runtime", Target::NoRuntime},
    {"metal", Target::Metal},
    {"c_plus_plus_name_mangling", Target::CPlusPlusMangling},
    {"large_buffers", Target::LargeBuffers},
    {"hvx_64", Target::HVX_64},
    {"hvx_128", Target::HVX_128},
    {"hvx_v62", Target::HVX_v62},
    {"hvx_v65", Target::HVX_v65},
    {"hvx_v66", Target::HVX_v66},
    {"hvx_shared_object", Target::HVX_shared_object},
    {"fuzz_float_stores", Target::FuzzFloatStores},
    {"soft_float_abi", Target::SoftFloatABI},
    {"msan", Target::MSAN},
    {"avx512", Target::AVX512},
    {"avx512_knl", Target::AVX512_KNL},
    {"avx512_skylake", Target::AVX512_Skylake},
    {"avx512_cannonlake", Target::AVX512_Cannonlake},
    {"trace_loads", Target::TraceLoads},
    {"trace_stores", Target::TraceStores},
    {"trace_realizations", Target::TraceRealizations},
    {"trace_pipeline", Target::TracePipeline},
    {"d3d12compute", Target::D3D12Compute},
    {"strict_float", Target::StrictFloat},
    {"tsan", Target::TSAN},
    {"asan", Target::ASAN},
    {"check_unsafe_promises", Target::CheckUnsafePromises},
    {"hexagon_dma", Target::HexagonDma},
    {"embed_bitcode", Target::EmbedBitcode},
    {"disable_llvm_loop_opt", Target::DisableLLVMLoopOpt},
    {"enable_llvm_loop_opt", Target::EnableLLVMLoopOpt},
    {"wasm_simd128", Target::WasmSimd128},
    {"wasm_signext", Target::WasmSignExt},
    {"wasm_sat_float_to_int", Target::WasmSatFloatToInt},
    {"sve", Target::SVE},
    {"sve2", Target::SVE2},
    {"arm_dot_prod", Target::ARMDotProd},
    // NOTE: When adding features to this map, be sure to update PyEnums.cpp as well.
};
364
lookup_feature(const std::string & tok,Target::Feature & result)365 bool lookup_feature(const std::string &tok, Target::Feature &result) {
366 auto feature_iter = feature_name_map.find(tok);
367 if (feature_iter != feature_name_map.end()) {
368 result = feature_iter->second;
369 return true;
370 }
371 return false;
372 }
373
374 } // End anonymous namespace
375
get_target_from_environment()376 Target get_target_from_environment() {
377 string target = Internal::get_env_variable("HL_TARGET");
378 if (target.empty()) {
379 return get_host_target();
380 } else {
381 return Target(target);
382 }
383 }
384
get_jit_target_from_environment()385 Target get_jit_target_from_environment() {
386 Target host = get_host_target();
387 host.set_feature(Target::JIT);
388 #if defined(__has_feature)
389 #if __has_feature(address_sanitizer)
390 host.set_feature(Target::ASAN);
391 #endif
392 #if __has_feature(memory_sanitizer)
393 host.set_feature(Target::MSAN);
394 #endif
395 #if __has_feature(thread_sanitizer)
396 host.set_feature(Target::TSAN);
397 #endif
398 #endif
399 string target = Internal::get_env_variable("HL_JIT_TARGET");
400 if (target.empty()) {
401 return host;
402 } else {
403 Target t(target);
404 t.set_feature(Target::JIT);
405 user_assert((t.os == host.os && t.arch == host.arch && t.bits == host.bits) || Internal::WasmModule::can_jit_target(t))
406 << "HL_JIT_TARGET must match the host OS, architecture, and bit width.\n"
407 << "HL_JIT_TARGET was " << target << ". "
408 << "Host is " << host.to_string() << ".\n";
409 return t;
410 }
411 }
412
413 namespace {
merge_string(Target & t,const std::string & target)414 bool merge_string(Target &t, const std::string &target) {
415 string rest = target;
416 vector<string> tokens;
417 size_t first_dash;
418 while ((first_dash = rest.find('-')) != string::npos) {
419 //Internal::debug(0) << first_dash << ", " << rest << "\n";
420 tokens.push_back(rest.substr(0, first_dash));
421 rest = rest.substr(first_dash + 1);
422 }
423 tokens.push_back(rest);
424
425 bool os_specified = false, arch_specified = false, bits_specified = false, features_specified = false;
426 bool is_host = false;
427
428 for (size_t i = 0; i < tokens.size(); i++) {
429 const string &tok = tokens[i];
430 Target::Feature feature;
431
432 if (tok == "host") {
433 if (i > 0) {
434 // "host" is now only allowed as the first token.
435 return false;
436 }
437 is_host = true;
438 t = get_host_target();
439 } else if (tok == "32" || tok == "64" || tok == "0") {
440 if (bits_specified) {
441 return false;
442 }
443 bits_specified = true;
444 t.bits = std::stoi(tok);
445 } else if (lookup_arch(tok, t.arch)) {
446 if (arch_specified) {
447 return false;
448 }
449 arch_specified = true;
450 } else if (lookup_os(tok, t.os)) {
451 if (os_specified) {
452 return false;
453 }
454 os_specified = true;
455 } else if (lookup_feature(tok, feature)) {
456 t.set_feature(feature);
457 features_specified = true;
458 } else if (tok == "trace_all") {
459 t.set_features({Target::TraceLoads, Target::TraceStores, Target::TraceRealizations});
460 features_specified = true;
461 } else {
462 return false;
463 }
464 }
465
466 if (is_host &&
467 t.has_feature(Target::CUDA) &&
468 !t.has_feature(Target::CUDACapability30) &&
469 !t.has_feature(Target::CUDACapability32) &&
470 !t.has_feature(Target::CUDACapability35) &&
471 !t.has_feature(Target::CUDACapability50) &&
472 !t.has_feature(Target::CUDACapability61) &&
473 !t.has_feature(Target::CUDACapability70) &&
474 !t.has_feature(Target::CUDACapability75) &&
475 !t.has_feature(Target::CUDACapability80)) {
476 // Detect host cuda capability
477 t.set_feature(get_host_cuda_capability(t));
478 }
479
480 if (arch_specified && !bits_specified) {
481 return false;
482 }
483
484 if (bits_specified && t.bits == 0) {
485 // bits == 0 is allowed iff arch and os are "unknown" and no features are set,
486 // to allow for roundtripping the string for default Target() ctor.
487 if (!(arch_specified && t.arch == Target::ArchUnknown) ||
488 !(os_specified && t.os == Target::OSUnknown) ||
489 features_specified) {
490 return false;
491 }
492 }
493
494 return true;
495 }
496
bad_target_string(const std::string & target)497 void bad_target_string(const std::string &target) {
498 const char *separator = "";
499 std::string architectures;
500 for (const auto &arch_entry : arch_name_map) {
501 architectures += separator + arch_entry.first;
502 separator = ", ";
503 }
504 separator = "";
505 std::string oses;
506 for (const auto &os_entry : os_name_map) {
507 oses += separator + os_entry.first;
508 separator = ", ";
509 }
510 separator = "";
511 // Format the features to go one feature over 70 characters per line,
512 // assume the first line starts with "Features are ".
513 int line_char_start = -(int)sizeof("Features are");
514 std::string features;
515 for (const auto &feature_entry : feature_name_map) {
516 features += separator + feature_entry.first;
517 if (features.length() - line_char_start > 70) {
518 separator = "\n";
519 line_char_start = features.length();
520 } else {
521 separator = ", ";
522 }
523 }
524 user_error << "Did not understand Halide target " << target << "\n"
525 << "Expected format is arch-bits-os-feature1-feature2-...\n"
526 << "Where arch is: " << architectures << ".\n"
527 << "bits is either 32 or 64.\n"
528 << "os is: " << oses << ".\n"
529 << "\n"
530 << "If arch, bits, or os are omitted, they default to the host.\n"
531 << "\n"
532 << "Features are: " << features << ".\n"
533 << "\n"
534 << "The target can also begin with \"host\", which sets the "
535 << "host's architecture, os, and feature set, with the "
536 << "exception of the GPU runtimes, which default to off.\n"
537 << "\n"
538 << "On this platform, the host target is: " << get_host_target().to_string() << "\n";
539 }
540
541 } // namespace
542
Target(const std::string & target)543 Target::Target(const std::string &target)
544 : os(OSUnknown), arch(ArchUnknown), bits(0) {
545 Target host = get_host_target();
546
547 if (target.empty()) {
548 // If nothing is specified, use the full host target.
549 *this = host;
550 } else {
551 if (!merge_string(*this, target) || has_unknowns()) {
552 bad_target_string(target);
553 }
554 }
555 }
556
Target(const char * s)557 Target::Target(const char *s)
558 : Target(std::string(s)) {
559 }
560
validate_target_string(const std::string & s)561 bool Target::validate_target_string(const std::string &s) {
562 Target t;
563 return merge_string(t, s) && !t.has_unknowns();
564 }
565
feature_to_name(Target::Feature feature)566 std::string Target::feature_to_name(Target::Feature feature) {
567 for (const auto &feature_entry : feature_name_map) {
568 if (feature == feature_entry.second) {
569 return feature_entry.first;
570 }
571 }
572 internal_error;
573 return "";
574 }
575
feature_from_name(const std::string & name)576 Target::Feature Target::feature_from_name(const std::string &name) {
577 Target::Feature feature;
578 if (lookup_feature(name, feature)) {
579 return feature;
580 }
581 return Target::FeatureEnd;
582 }
583
to_string() const584 std::string Target::to_string() const {
585 string result;
586 for (const auto &arch_entry : arch_name_map) {
587 if (arch_entry.second == arch) {
588 result += arch_entry.first;
589 break;
590 }
591 }
592 result += "-" + std::to_string(bits);
593 for (const auto &os_entry : os_name_map) {
594 if (os_entry.second == os) {
595 result += "-" + os_entry.first;
596 break;
597 }
598 }
599 for (const auto &feature_entry : feature_name_map) {
600 if (has_feature(feature_entry.second)) {
601 result += "-" + feature_entry.first;
602 }
603 }
604 // Use has_feature() multiple times (rather than features_any_of())
605 // to avoid constructing a temporary vector for this rather-common call.
606 if (has_feature(Target::TraceLoads) && has_feature(Target::TraceStores) && has_feature(Target::TraceRealizations)) {
607 result = Internal::replace_all(result, "trace_loads-trace_realizations-trace_stores", "trace_all");
608 }
609 return result;
610 }
611
612 /** Was libHalide compiled with support for this target? */
supported() const613 bool Target::supported() const {
614 bool bad = false;
615 #if !defined(WITH_ARM)
616 bad |= arch == Target::ARM && bits == 32;
617 #endif
618 #if !defined(WITH_AARCH64)
619 bad |= arch == Target::ARM && bits == 64;
620 #endif
621 #if !defined(WITH_X86)
622 bad |= arch == Target::X86;
623 #endif
624 #if !defined(WITH_MIPS)
625 bad |= arch == Target::MIPS;
626 #endif
627 #if !defined(WITH_POWERPC)
628 bad |= arch == Target::POWERPC;
629 #endif
630 #if !defined(WITH_HEXAGON)
631 bad |= arch == Target::Hexagon;
632 #endif
633 #if !defined(WITH_WEBASSEMBLY)
634 bad |= arch == Target::WebAssembly;
635 #endif
636 #if !defined(WITH_RISCV)
637 bad |= arch == Target::RISCV;
638 #endif
639 #if !defined(WITH_NVPTX)
640 bad |= has_feature(Target::CUDA);
641 #endif
642 #if !defined(WITH_OPENCL)
643 bad |= has_feature(Target::OpenCL);
644 #endif
645 #if !defined(WITH_METAL)
646 bad |= has_feature(Target::Metal);
647 #endif
648 #if !defined(WITH_OPENGL)
649 bad |= has_feature(Target::OpenGL) || has_feature(Target::OpenGLCompute);
650 #endif
651 #if !defined(WITH_D3D12)
652 bad |= has_feature(Target::D3D12Compute);
653 #endif
654 return !bad;
655 }
656
has_unknowns() const657 bool Target::has_unknowns() const {
658 return os == OSUnknown || arch == ArchUnknown || bits == 0;
659 }
660
set_feature(Feature f,bool value)661 void Target::set_feature(Feature f, bool value) {
662 if (f == FeatureEnd) return;
663 user_assert(f < FeatureEnd) << "Invalid Target feature.\n";
664 features.set(f, value);
665 }
666
set_features(const std::vector<Feature> & features_to_set,bool value)667 void Target::set_features(const std::vector<Feature> &features_to_set, bool value) {
668 for (Feature f : features_to_set) {
669 set_feature(f, value);
670 }
671 }
672
has_feature(Feature f) const673 bool Target::has_feature(Feature f) const {
674 if (f == FeatureEnd) return true;
675 user_assert(f < FeatureEnd) << "Invalid Target feature.\n";
676 return features[f];
677 }
678
features_any_of(const std::vector<Feature> & test_features) const679 bool Target::features_any_of(const std::vector<Feature> &test_features) const {
680 for (Feature f : test_features) {
681 if (has_feature(f)) {
682 return true;
683 }
684 }
685 return false;
686 }
687
features_all_of(const std::vector<Feature> & test_features) const688 bool Target::features_all_of(const std::vector<Feature> &test_features) const {
689 for (Feature f : test_features) {
690 if (!has_feature(f)) {
691 return false;
692 }
693 }
694 return true;
695 }
696
with_feature(Feature f) const697 Target Target::with_feature(Feature f) const {
698 Target copy = *this;
699 copy.set_feature(f);
700 return copy;
701 }
702
without_feature(Feature f) const703 Target Target::without_feature(Feature f) const {
704 Target copy = *this;
705 copy.set_feature(f, false);
706 return copy;
707 }
708
has_gpu_feature() const709 bool Target::has_gpu_feature() const {
710 return (has_feature(CUDA) ||
711 has_feature(OpenCL) ||
712 has_feature(Metal) ||
713 has_feature(D3D12Compute) ||
714 has_feature(OpenGLCompute));
715 }
716
get_cuda_capability_lower_bound() const717 int Target::get_cuda_capability_lower_bound() const {
718 if (!has_feature(Target::CUDA)) {
719 return -1;
720 }
721 if (has_feature(Target::CUDACapability30)) {
722 return 30;
723 }
724 if (has_feature(Target::CUDACapability32)) {
725 return 32;
726 }
727 if (has_feature(Target::CUDACapability35)) {
728 return 35;
729 }
730 if (has_feature(Target::CUDACapability50)) {
731 return 50;
732 }
733 if (has_feature(Target::CUDACapability61)) {
734 return 61;
735 }
736 if (has_feature(Target::CUDACapability70)) {
737 return 70;
738 }
739 if (has_feature(Target::CUDACapability75)) {
740 return 75;
741 }
742 if (has_feature(Target::CUDACapability80)) {
743 return 80;
744 }
745 return 20;
746 }
747
supports_type(const Type & t) const748 bool Target::supports_type(const Type &t) const {
749 if (t.bits() == 64) {
750 if (t.is_float()) {
751 return !has_feature(Metal) &&
752 !has_feature(OpenGL) &&
753 !has_feature(OpenGLCompute) &&
754 !has_feature(D3D12Compute) &&
755 (!has_feature(Target::OpenCL) || has_feature(Target::CLDoubles));
756 } else {
757 return (!has_feature(Metal) &&
758 !has_feature(OpenGLCompute) &&
759 !has_feature(OpenGL) &&
760 !has_feature(D3D12Compute));
761 }
762 }
763 return true;
764 }
765
supports_type(const Type & t,DeviceAPI device) const766 bool Target::supports_type(const Type &t, DeviceAPI device) const {
767 if (device == DeviceAPI::Default_GPU) {
768 device = get_default_device_api_for_target(*this);
769 }
770
771 if (device == DeviceAPI::Hexagon) {
772 // HVX supports doubles and long long in the scalar unit only.
773 if (t.is_float() || t.bits() == 64) {
774 return t.lanes() == 1;
775 }
776 } else if (device == DeviceAPI::Metal) {
777 // Metal spec says no double or long long.
778 if (t.bits() == 64) {
779 return false;
780 }
781 } else if (device == DeviceAPI::OpenCL) {
782 if (t.is_float() && t.bits() == 64) {
783 return has_feature(Target::CLDoubles);
784 }
785 } else if (device == DeviceAPI::D3D12Compute) {
786 // Shader Model 5.x can optionally support double-precision; 64-bit int
787 // types are not supported.
788 return t.bits() < 64;
789 } else if (device == DeviceAPI::OpenGLCompute) {
790 return t.bits() < 64;
791 }
792
793 return true;
794 }
795
supports_device_api(DeviceAPI api) const796 bool Target::supports_device_api(DeviceAPI api) const {
797 switch (api) {
798 case DeviceAPI::None:
799 return true;
800 case DeviceAPI::Host:
801 return true;
802 case DeviceAPI::Default_GPU:
803 return has_gpu_feature();
804 case DeviceAPI::Hexagon:
805 return has_feature(Target::HVX_64) || has_feature(Target::HVX_128);
806 case DeviceAPI::HexagonDma:
807 return has_feature(Target::HexagonDma);
808 default:
809 return has_feature(target_feature_for_device_api(api));
810 }
811 }
812
get_required_device_api() const813 DeviceAPI Target::get_required_device_api() const {
814 if (has_feature(Target::CUDA)) return DeviceAPI::CUDA;
815 if (has_feature(Target::D3D12Compute)) return DeviceAPI::D3D12Compute;
816 if (has_feature(Target::HVX_128)) return DeviceAPI::Hexagon;
817 if (has_feature(Target::HexagonDma)) return DeviceAPI::HexagonDma;
818 if (has_feature(Target::Metal)) return DeviceAPI::Metal;
819 if (has_feature(Target::OpenCL)) return DeviceAPI::OpenCL;
820 if (has_feature(Target::OpenGL)) return DeviceAPI::GLSL;
821 if (has_feature(Target::OpenGLCompute)) return DeviceAPI::OpenGLCompute;
822 return DeviceAPI::None;
823 }
824
target_feature_for_device_api(DeviceAPI api)825 Target::Feature target_feature_for_device_api(DeviceAPI api) {
826 switch (api) {
827 case DeviceAPI::CUDA:
828 return Target::CUDA;
829 case DeviceAPI::OpenCL:
830 return Target::OpenCL;
831 case DeviceAPI::GLSL:
832 return Target::OpenGL;
833 case DeviceAPI::OpenGLCompute:
834 return Target::OpenGLCompute;
835 case DeviceAPI::Metal:
836 return Target::Metal;
837 case DeviceAPI::Hexagon:
838 return Target::HVX_128;
839 case DeviceAPI::D3D12Compute:
840 return Target::D3D12Compute;
841 default:
842 return Target::FeatureEnd;
843 }
844 }
845
natural_vector_size(const Halide::Type & t) const846 int Target::natural_vector_size(const Halide::Type &t) const {
847 user_assert(!has_unknowns())
848 << "natural_vector_size cannot be used on a Target with Unknown values.\n";
849
850 const bool is_integer = t.is_int() || t.is_uint();
851 const int data_size = t.bytes();
852
853 if (arch == Target::Hexagon) {
854 if (is_integer) {
855 // HVX is either 64 or 128 *byte* vector size.
856 if (has_feature(Halide::Target::HVX_128)) {
857 return 128 / data_size;
858 } else if (has_feature(Halide::Target::HVX_64)) {
859 return 64 / data_size;
860 } else {
861 user_error << "Target uses hexagon arch without hvx_128 or hvx_64 set.\n";
862 return 0;
863 }
864 } else {
865 // HVX does not have vector float instructions.
866 return 1;
867 }
868 } else if (arch == Target::X86) {
869 if (is_integer && (has_feature(Halide::Target::AVX512_Skylake) ||
870 has_feature(Halide::Target::AVX512_Cannonlake))) {
871 // AVX512BW exists on Skylake and Cannonlake
872 return 64 / data_size;
873 } else if (t.is_float() && (has_feature(Halide::Target::AVX512) ||
874 has_feature(Halide::Target::AVX512_KNL) ||
875 has_feature(Halide::Target::AVX512_Skylake) ||
876 has_feature(Halide::Target::AVX512_Cannonlake))) {
877 // AVX512F is on all AVX512 architectures
878 return 64 / data_size;
879 } else if (has_feature(Halide::Target::AVX2)) {
880 // AVX2 uses 256-bit vectors for everything.
881 return 32 / data_size;
882 } else if (!is_integer && has_feature(Halide::Target::AVX)) {
883 // AVX 1 has 256-bit vectors for float, but not for
884 // integer instructions.
885 return 32 / data_size;
886 } else {
887 // SSE was all 128-bit. We ignore MMX.
888 return 16 / data_size;
889 }
890 } else if (arch == Target::WebAssembly) {
891 if (has_feature(Halide::Target::WasmSimd128)) {
892 if (t.bits() == 64) {
893 // int64 and float64 aren't supported in simd128.
894 return 1;
895 }
896 // 128-bit vectors for other types.
897 return 16 / data_size;
898 } else {
899 // No vectors, sorry.
900 return 1;
901 }
902 } else {
903 // Assume 128-bit vectors on other targets.
904 return 16 / data_size;
905 }
906 }
907
get_runtime_compatible_target(const Target & other,Target & result)908 bool Target::get_runtime_compatible_target(const Target &other, Target &result) {
909 // Create mask to select features that:
910 // (a) must be included if either target has the feature (union)
911 // (b) must be included if both targets have the feature (intersection)
912 // (c) must match across both targets; it is an error if one target has the feature and the other doesn't
913 const std::array<Feature, 18> union_features = {{// These are true union features.
914 CUDA, OpenCL, OpenGL, OpenGLCompute, Metal, D3D12Compute, NoNEON,
915
916 // These features are actually intersection-y, but because targets only record the _highest_,
917 // we have to put their union in the result and then take a lower bound.
918 CUDACapability30, CUDACapability32, CUDACapability35, CUDACapability50, CUDACapability61, CUDACapability70, CUDACapability75, CUDACapability80,
919 HVX_v62, HVX_v65, HVX_v66}};
920
921 const std::array<Feature, 12> intersection_features = {{SSE41, AVX, AVX2, FMA, FMA4, F16C, ARMv7s, VSX, AVX512, AVX512_KNL, AVX512_Skylake, AVX512_Cannonlake}};
922
923 const std::array<Feature, 10> matching_features = {{SoftFloatABI, Debug, TSAN, ASAN, MSAN, HVX_64, HVX_128, HexagonDma, HVX_shared_object}};
924
925 // bitsets need to be the same width.
926 decltype(result.features) union_mask;
927 decltype(result.features) intersection_mask;
928 decltype(result.features) matching_mask;
929
930 for (auto &feature : union_features) {
931 union_mask.set(feature);
932 }
933
934 for (auto &feature : intersection_features) {
935 intersection_mask.set(feature);
936 }
937
938 for (auto &feature : matching_features) {
939 matching_mask.set(feature);
940 }
941
942 if (arch != other.arch || bits != other.bits || os != other.os) {
943 Internal::debug(1) << "runtime targets must agree on platform (arch-bits-os)\n"
944 << " this: " << *this << "\n"
945 << " other: " << other << "\n";
946 return false;
947 }
948
949 if ((features & matching_mask) != (other.features & matching_mask)) {
950 Internal::debug(1) << "runtime targets must agree on SoftFloatABI, Debug, TSAN, ASAN, MSAN, HVX_64, HVX_128, HexagonDma, and HVX_shared_object\n"
951 << " this: " << *this << "\n"
952 << " other: " << other << "\n";
953 return false;
954 }
955
956 // Union of features is computed through bitwise-or, and masked away by the features we care about
957 // Intersection of features is computed through bitwise-and and masked away, too.
958 // We merge the bits via bitwise or.
959 Target output = Target{os, arch, bits};
960 output.features = ((features | other.features) & union_mask) | ((features | other.features) & matching_mask) | ((features & other.features) & intersection_mask);
961
962 // Pick tight lower bound for CUDA capability. Use fall-through to clear redundant features
963 int cuda_a = get_cuda_capability_lower_bound();
964 int cuda_b = other.get_cuda_capability_lower_bound();
965
966 // get_cuda_capability_lower_bound returns -1 when unused. Casting to unsigned makes this
967 // large, so min selects the true lower bound when one target doesn't specify a capability,
968 // and the other doesn't use CUDA at all.
969 int cuda_capability = std::min((unsigned)cuda_a, (unsigned)cuda_b);
970 if (cuda_capability < 30) output.features.reset(CUDACapability30);
971 if (cuda_capability < 32) output.features.reset(CUDACapability32);
972 if (cuda_capability < 35) output.features.reset(CUDACapability35);
973 if (cuda_capability < 50) output.features.reset(CUDACapability50);
974 if (cuda_capability < 61) output.features.reset(CUDACapability61);
975 if (cuda_capability < 70) output.features.reset(CUDACapability70);
976 if (cuda_capability < 75) output.features.reset(CUDACapability75);
977 if (cuda_capability < 80) output.features.reset(CUDACapability80);
978
979 // Pick tight lower bound for HVX version. Use fall-through to clear redundant features
980 int hvx_a = get_hvx_lower_bound(*this);
981 int hvx_b = get_hvx_lower_bound(other);
982
983 // Same trick as above for CUDA
984 int hvx_version = std::min((unsigned)hvx_a, (unsigned)hvx_b);
985 if (hvx_version < 62) output.features.reset(HVX_v62);
986 if (hvx_version < 65) output.features.reset(HVX_v65);
987 if (hvx_version < 66) output.features.reset(HVX_v66);
988
989 result = output;
990 return true;
991 }
992
993 namespace Internal {
994
target_test()995 void target_test() {
996 Target t;
997 for (const auto &feature : feature_name_map) {
998 t.set_feature(feature.second);
999 }
1000 for (int i = 0; i < (int)(Target::FeatureEnd); i++) {
1001 internal_assert(t.has_feature((Target::Feature)i)) << "Feature " << i << " not in feature_names_map.\n";
1002 }
1003
1004 // 3 targets: {A,B,C}. Want gcd(A,B)=C
1005 std::vector<std::array<std::string, 3>> gcd_tests = {
1006 {{"x86-64-linux-sse41-fma", "x86-64-linux-sse41-fma", "x86-64-linux-sse41-fma"}},
1007 {{"x86-64-linux-sse41-fma-no_asserts-no_runtime", "x86-64-linux-sse41-fma", "x86-64-linux-sse41-fma"}},
1008 {{"x86-64-linux-avx2-sse41", "x86-64-linux-sse41-fma", "x86-64-linux-sse41"}},
1009 {{"x86-64-linux-avx2-sse41", "x86-32-linux-sse41-fma", ""}},
1010 {{"x86-64-linux-cuda", "x86-64-linux", "x86-64-linux-cuda"}},
1011 {{"x86-64-linux-cuda-cuda_capability_50", "x86-64-linux-cuda", "x86-64-linux-cuda"}},
1012 {{"x86-64-linux-cuda-cuda_capability_50", "x86-64-linux-cuda-cuda_capability_30", "x86-64-linux-cuda-cuda_capability_30"}},
1013 {{"x86-64-linux-cuda", "x86-64-linux-opengl", "x86-64-linux-cuda-opengl"}},
1014 {{"hexagon-32-qurt-hvx_v65", "hexagon-32-qurt-hvx_v62", "hexagon-32-qurt-hvx_v62"}},
1015 {{"hexagon-32-qurt-hvx_v62", "hexagon-32-qurt", "hexagon-32-qurt"}},
1016 {{"hexagon-32-qurt-hvx_v62-hvx_64", "hexagon-32-qurt", ""}},
1017 {{"hexagon-32-qurt-hvx_v62-hvx_64", "hexagon-32-qurt-hvx_64", "hexagon-32-qurt-hvx_64"}},
1018 };
1019
1020 for (const auto &test : gcd_tests) {
1021 Target result{};
1022 Target a{test[0]};
1023 Target b{test[1]};
1024 if (a.get_runtime_compatible_target(b, result)) {
1025 internal_assert(!test[2].empty() && result == Target{test[2]})
1026 << "Targets " << a.to_string() << " and " << b.to_string() << " were computed to have gcd "
1027 << result.to_string() << " but expected '" << test[2] << "'\n";
1028 } else {
1029 internal_assert(test[2].empty())
1030 << "Targets " << a.to_string() << " and " << b.to_string() << " were computed to have no gcd "
1031 << "but " << test[2] << " was expected.";
1032 }
1033 }
1034
1035 std::cout << "Target test passed" << std::endl;
1036 }
1037
1038 } // namespace Internal
1039
1040 } // namespace Halide
1041