1 /*
2 Copyright (c) 2010-2021, Intel Corporation
3 All rights reserved.
4
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are
7 met:
8
9 * Redistributions of source code must retain the above copyright
10 notice, this list of conditions and the following disclaimer.
11
12 * Redistributions in binary form must reproduce the above copyright
13 notice, this list of conditions and the following disclaimer in the
14 documentation and/or other materials provided with the distribution.
15
16 * Neither the name of Intel Corporation nor the names of its
17 contributors may be used to endorse or promote products derived from
18 this software without specific prior written permission.
19
20
21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
22 IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23 TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
24 PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
25 OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
26 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
27 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
28 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
29 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
30 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 */
33
34 /** @file ispc.cpp
35 @brief ispc global definitions
36 */
37
38 #include "ispc.h"
39 #include "llvmutil.h"
40 #include "module.h"
41 #include "util.h"
42
43 #include <sstream>
44 #include <stdarg.h> /* va_list, va_start, va_arg, va_end */
45 #include <stdio.h>
46 #ifdef ISPC_HOST_IS_WINDOWS
47 #include <direct.h>
48 #include <windows.h>
49 #define strcasecmp stricmp
50 #include <intrin.h>
51 #else // !ISPC_HOST_IS_WINDOWS
52 #include <sys/types.h>
53 #include <unistd.h>
54 #endif // ISPC_HOST_IS_WINDOWS
55
56 #include <llvm/BinaryFormat/Dwarf.h>
57 #include <llvm/CodeGen/TargetLowering.h>
58 #include <llvm/CodeGen/TargetSubtargetInfo.h>
59 #include <llvm/IR/Attributes.h>
60 #include <llvm/IR/DIBuilder.h>
61 #include <llvm/IR/DataLayout.h>
62 #include <llvm/IR/DebugInfo.h>
63 #include <llvm/IR/Instructions.h>
64 #include <llvm/IR/LLVMContext.h>
65 #include <llvm/IR/Module.h>
66 #include <llvm/Support/CodeGen.h>
67 #include <llvm/Support/Host.h>
68 #include <llvm/Support/TargetRegistry.h>
69 #include <llvm/Support/TargetSelect.h>
70 #include <llvm/Target/TargetMachine.h>
71 #include <llvm/Target/TargetOptions.h>
72
73 using namespace ispc;
74
75 Globals *ispc::g;
76 Module *ispc::m;
77
78 ///////////////////////////////////////////////////////////////////////////
79 // Target
80
81 #if defined(__arm__) || defined(__aarch64__)
82 #define ARM_HOST
83 #endif
84
85 #if !defined(ISPC_HOST_IS_WINDOWS) && !defined(ARM_HOST)
86 // __cpuid() and __cpuidex() are defined on Windows in <intrin.h> for x86/x64.
87 // On *nix they need to be defined manually through inline assembler.
// Executes the CPUID instruction with infoType loaded into EAX and stores
// the resulting EAX, EBX, ECX and EDX register values in info[0], info[1],
// info[2] and info[3] respectively.
static void __cpuid(int info[4], int infoType) {
    int eax, ebx, ecx, edx;
    // The "0" constraint ties the input to the same register as output 0 (EAX).
    __asm__ __volatile__("cpuid" : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx) : "0"(infoType));
    info[0] = eax;
    info[1] = ebx;
    info[2] = ecx;
    info[3] = edx;
}
91
// Executes the CPUID instruction with `level` loaded into EAX and `count`
// loaded into ECX, and stores the resulting EAX, EBX, ECX and EDX register
// values in info[0], info[1], info[2] and info[3] respectively.
//
// EBX is bound directly through the "=b" constraint, exactly as __cpuid()
// above does. The previous implementation parked EBX in a scratch register
// with a pair of 32-bit "xchgl" instructions; on x86-64 a 32-bit write to EBX
// clears the upper half of RBX, which the asm statement never declared as
// modified, so any 64-bit value the compiler kept in RBX across the statement
// could be corrupted. Letting the compiler own EBX avoids that.
// NOTE(review): very old GCC releases reject "=b" in 32-bit PIC builds; since
// __cpuid() already relies on "=b", this adds no new toolchain requirement.
static void __cpuidex(int info[4], int level, int count) {
    __asm__ __volatile__("cpuid"
                         : "=a"(info[0]), "=b"(info[1]), "=c"(info[2]), "=d"(info[3])
                         : "0"(level), "2"(count));
}
100 #endif // !ISPC_HOST_IS_WINDOWS && !__ARM__ && !__AARCH64__
101
102 #ifndef ARM_HOST
// Returns true when the value read through xgetbv has both bit 1 and bit 2
// set (mask 6), which is how this file checks that the OS will save the YMM
// registers. Callers are expected to have confirmed OSXSAVE beforehand.
static bool __os_has_avx_support() {
    // Bits that must both be present in the extended-control-register value.
    const int kYmmStateMask = 6;
#if defined(ISPC_HOST_IS_WINDOWS)
    // The compiler intrinsic is available on Windows.
    const unsigned long long enabledFeatures = _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
    return (enabledFeatures & kYmmStateMask) == kYmmStateMask;
#else  // !defined(ISPC_HOST_IS_WINDOWS)
    // xgetbv is emitted as raw opcode bytes: older assemblers do not know the
    // mnemonic, and there is no easy way to conditionally compile based on
    // the assembler in use. ECX = 0 is passed as the input.
    int eaxOut, edxOut;
    __asm__ __volatile__(".byte 0x0f, 0x01, 0xd0" : "=a"(eaxOut), "=d"(edxOut) : "c"(0));
    return (eaxOut & kYmmStateMask) == kYmmStateMask;
#endif // !defined(ISPC_HOST_IS_WINDOWS)
}
117
// Returns true when the OS is considered to support AVX512 register state.
// On Windows and on other non-Apple hosts this requires every bit of mask
// 0xE6 to be set in the value read through xgetbv; on macOS it is
// unconditionally true (see the explanation in that branch).
static bool __os_has_avx512_support() {
#if defined(ISPC_HOST_IS_WINDOWS)
    // Check that the OS saves the XMM, YMM and ZMM registers, i.e. that it
    // supports AVX2 and AVX512.
    // See section 2.1 of software.intel.com/sites/default/files/managed/0d/53/319433-022.pdf
    const unsigned long long kZmmStateMask = 0xE6;
    const unsigned long long enabledFeatures = _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
    return (enabledFeatures & kZmmStateMask) == kZmmStateMask;
#elif defined(ISPC_HOST_IS_APPLE)
    // macOS handles AVX512 differently from Windows and Linux:
    // - AVX512 is off by default in a newly created thread, so the CPUID flags
    //   report AVX512 as available while the OS support check (XCR0) fails.
    // - AVX512 gets enabled either by calling thread_set_state() or by
    //   executing any AVX512 instruction, which raises a #UD exception that the
    //   OS handles.
    // This function only needs to tell whether AVX512 is potentially
    // available, so the OS check is bypassed and the decision is left to the
    // CPUID flags alone. See ispc issue #1854 for more details.
    return true;
#else  // neither Windows nor macOS
    // xgetbv is emitted as raw opcode bytes: older assemblers do not know the
    // mnemonic, and there is no easy way to conditionally compile based on
    // the assembler in use. ECX = 0 is passed as the input.
    const int kZmmStateMask = 0xE6;
    int eaxOut, edxOut;
    __asm__ __volatile__(".byte 0x0f, 0x01, 0xd0" : "=a"(eaxOut), "=d"(edxOut) : "c"(0));
    return (eaxOut & kZmmStateMask) == kZmmStateMask;
#endif // ISPC_HOST_IS_WINDOWS / ISPC_HOST_IS_APPLE
}
143 #endif // !ARM_HOST
144
lGetSystemISA()145 static ISPCTarget lGetSystemISA() {
146 #ifdef ARM_HOST
147 return ISPCTarget::neon_i32x4;
148 #else
149 int info[4];
150 __cpuid(info, 1);
151
152 int info2[4];
153 // Call cpuid with eax=7, ecx=0
154 __cpuidex(info2, 7, 0);
155
156 if ((info[2] & (1 << 27)) != 0 && // OSXSAVE
157 (info2[1] & (1 << 5)) != 0 && // AVX2
158 (info2[1] & (1 << 16)) != 0 && // AVX512 F
159 __os_has_avx512_support()) {
160 // We need to verify that AVX2 is also available,
161 // as well as AVX512, because our targets are supposed
162 // to use both.
163
164 if ((info2[1] & (1 << 17)) != 0 && // AVX512 DQ
165 (info2[1] & (1 << 28)) != 0 && // AVX512 CDI
166 (info2[1] & (1 << 30)) != 0 && // AVX512 BW
167 (info2[1] & (1 << 31)) != 0) { // AVX512 VL
168 return ISPCTarget::avx512skx_i32x16;
169 } else if ((info2[1] & (1 << 26)) != 0 && // AVX512 PF
170 (info2[1] & (1 << 27)) != 0 && // AVX512 ER
171 (info2[1] & (1 << 28)) != 0) { // AVX512 CDI
172 return ISPCTarget::avx512knl_i32x16;
173 }
174 // If it's unknown AVX512 target, fall through and use AVX2
175 // or whatever is available in the machine.
176 }
177
178 if ((info[2] & (1 << 27)) != 0 && // OSXSAVE
179 (info[2] & (1 << 28)) != 0 && __os_has_avx_support()) { // AVX
180 // AVX1 for sure....
181 // Ivy Bridge?
182 if ((info[2] & (1 << 29)) != 0 && // F16C
183 (info[2] & (1 << 30)) != 0 && // RDRAND
184 (info2[1] & (1 << 5)) != 0) { // AVX2.
185 return ISPCTarget::avx2_i32x8;
186 }
187 // Regular AVX
188 return ISPCTarget::avx1_i32x8;
189 } else if ((info[2] & (1 << 19)) != 0)
190 return ISPCTarget::sse4_i32x4;
191 else if ((info[3] & (1 << 26)) != 0)
192 return ISPCTarget::sse2_i32x4;
193 else {
194 Error(SourcePos(), "Unable to detect supported SSE/AVX ISA. Exiting.");
195 exit(1);
196 }
197 #endif
198 }
199
lIsTargetValidforArch(ISPCTarget target,Arch arch)200 static const bool lIsTargetValidforArch(ISPCTarget target, Arch arch) {
201 bool ret = true;
202 // If target name starts with sse or avx, has to be x86 or x86-64.
203 if (ISPCTargetIsX86(target)) {
204 if (arch != Arch::x86_64 && arch != Arch::x86)
205 ret = false;
206 } else if (target == ISPCTarget::neon_i8x16 || target == ISPCTarget::neon_i16x8) {
207 if (arch != Arch::arm)
208 ret = false;
209 } else if (target == ISPCTarget::neon_i32x4 || target == ISPCTarget::neon_i32x8) {
210 if (arch != Arch::arm && arch != Arch::aarch64)
211 ret = false;
212 } else if (ISPCTargetIsGen(target)) {
213 if (arch != Arch::genx32 && arch != Arch::genx64)
214 ret = false;
215 }
216
217 return ret;
218 }
// Identifiers for the CPUs ispc knows about. The values double as indices
// into per-CPU tables, and sizeofCPUtype (always last) is the number of
// entries.
enum CPUtype {
    // Special value, indicates that no CPU is present.
    CPU_None = 0,

    // Generic 64-bit x86 processor model: aims to be good for modern chips
    // without enabling instruction set encodings beyond basic SSE2 and the
    // 64-bit ones.
    CPU_x86_64 = 1,

    // Early Atom CPU. Supports SSSE3.
    CPU_Bonnell,

    // Generic Core2-like. Supports SSSE3. Isn't quite compatible with
    // Bonnell, but the difference is negligible for ISPC, which doesn't make
    // use of it.
    CPU_Core2,

    // Core2 Solo/Duo/Quad/Extreme. Supports SSE 4.1 (but not 4.2).
    CPU_Penryn,

    // Late Core2-like. Supports SSE 4.2 + POPCNT/LZCNT.
    CPU_Nehalem,

    // CPU in PS4/Xbox One.
    CPU_PS4,

    // Sandy Bridge. Supports AVX 1.
    CPU_SandyBridge,

    // Ivy Bridge. Supports AVX 1 + RDRAND.
    CPU_IvyBridge,

    // Haswell. Supports AVX 2.
    CPU_Haswell,

    // Broadwell. Supports AVX 2 + ADX/RDSEED/SMAP.
    CPU_Broadwell,

    // Knights Landing - Xeon Phi. Supports:
    //   AVX-512F: all the key AVX-512 features (masking, broadcast, ...);
    //   AVX-512CDI: conflict detection;
    //   AVX-512ERI & PRI: 28-bit precision RCP, RSQRT and EXP
    //                     transcendentals, new prefetch instructions.
    CPU_KNL,

    // Skylake Xeon. Supports:
    //   AVX-512F: all the key AVX-512 features (masking, broadcast, ...);
    //   AVX-512CDI: conflict detection;
    //   AVX-512VL: vector length orthogonality;
    //   AVX-512DQ: new HPC ISA (vs AVX512F);
    //   AVX-512BW: byte and word support.
    CPU_SKX,

    // Icelake client.
    CPU_ICL,

    // Late Atom-like design. Supports SSE 4.2 + POPCNT/LZCNT.
    CPU_Silvermont,

    // Icelake server.
    CPU_ICX,
    // Tiger Lake.
    CPU_TGL,
#if ISPC_LLVM_VERSION >= ISPC_LLVM_12_0
    // Alder Lake.
    CPU_ADL,
    // Sapphire Rapids.
    CPU_SPR,
#endif

// FIXME: LLVM supports a ton of different ARM CPU variants--not just
// cortex-a9 and a15. We should be able to handle any of them that also
// have NEON support.
#ifdef ISPC_ARM_ENABLED
    // ARM Cortex A9. Supports NEON VFPv3.
    CPU_CortexA9,

    // ARM Cortex A15. Supports NEON VFPv4.
    CPU_CortexA15,

    // ARM Cortex A35, A53, A57.
    CPU_CortexA35,
    CPU_CortexA53,
    CPU_CortexA57,

    // Apple CPUs.
    CPU_AppleA7,
    CPU_AppleA10,
    CPU_AppleA11,
    CPU_AppleA12,
    CPU_AppleA13,
#if ISPC_LLVM_VERSION >= ISPC_LLVM_12_0
    CPU_AppleA14,
#endif
#endif
#ifdef ISPC_GENX_ENABLED
    // GenX CPUs.
    CPU_GENX,
    CPU_GENX_TGLLP,
#endif
    // Number of enumerators above; must stay last.
    sizeofCPUtype
};
314
315 // This map is used to verify features available for supported CPUs
316 // and is used to filter target dependent intrisics and report an error.
317 // This mechanism is not precise and doesn't take into account flavors
318 // of AVX512, for example.
319 // The following LLVM files were used as reference:
320 // CPU Features: <llvm>/lib/Support/X86TargetParser.cpp
321 // X86 Intrinsics: <llvm>/include/llvm/IR/IntrinsicsX86.td
322 std::map<CPUtype, std::set<std::string>> CPUFeatures = {
323 {CPU_x86_64, {"mmx", "sse", "sse2"}},
324 {CPU_Bonnell, {"mmx", "sse", "sse2", "ssse3"}},
325 {CPU_Core2, {"mmx", "sse", "sse2", "ssse3"}},
326 {CPU_Penryn, {"mmx", "sse", "sse2", "ssse3", "sse41"}},
327 {CPU_Nehalem, {"mmx", "sse", "sse2", "ssse3", "sse41", "sse42"}},
328 {CPU_PS4, {"mmx", "sse", "sse2", "ssse3", "sse41", "sse42", "avx"}},
329 {CPU_SandyBridge, {"mmx", "sse", "sse2", "ssse3", "sse41", "sse42", "avx"}},
330 {CPU_IvyBridge, {"mmx", "sse", "sse2", "ssse3", "sse41", "sse42", "avx"}},
331 {CPU_Haswell, {"mmx", "sse", "sse2", "ssse3", "sse41", "sse42", "avx", "avx2"}},
332 {CPU_Broadwell, {"mmx", "sse", "sse2", "ssse3", "sse41", "sse42", "avx", "avx2"}},
333 {CPU_KNL, {"mmx", "sse", "sse2", "ssse3", "sse41", "sse42", "avx", "avx2", "avx512"}},
334 {CPU_SKX, {"mmx", "sse", "sse2", "ssse3", "sse41", "sse42", "avx", "avx2", "avx512"}},
335 {CPU_ICL, {"mmx", "sse", "sse2", "ssse3", "sse41", "sse42", "avx", "avx2", "avx512"}},
336 {CPU_Silvermont, {"mmx", "sse", "sse2", "ssse3", "sse41", "sse42"}},
337 {CPU_ICX, {"mmx", "sse", "sse2", "ssse3", "sse41", "sse42", "avx", "avx2", "avx512"}},
338 {CPU_TGL, {"mmx", "sse", "sse2", "ssse3", "sse41", "sse42", "avx", "avx2", "avx512"}},
339 #if ISPC_LLVM_VERSION >= ISPC_LLVM_12_0
340 {CPU_ADL, {"mmx", "sse", "sse2", "ssse3", "sse41", "sse42", "avx", "avx2"}},
341 {CPU_SPR, {"mmx", "sse", "sse2", "ssse3", "sse41", "sse42", "avx", "avx2", "avx512"}},
342 #endif
343 // TODO: Add features for remaining CPUs if valid.
344 #ifdef ISPC_ARM_ENABLED
345 {CPU_CortexA9, {}},
346 {CPU_CortexA15, {}},
347 {CPU_CortexA35, {}},
348 {CPU_CortexA53, {}},
349 {CPU_CortexA57, {}},
350 {CPU_AppleA7, {}},
351 {CPU_AppleA10, {}},
352 {CPU_AppleA11, {}},
353 {CPU_AppleA12, {}},
354 {CPU_AppleA13, {}},
355 #if ISPC_LLVM_VERSION >= ISPC_LLVM_12_0
356 {CPU_AppleA14, {}},
357 #endif
358 #endif
359 #ifdef ISPC_GENX_ENABLED
360 {CPU_GENX, {}},
361 {CPU_GENX_TGLLP, {}}
362 #endif
363 };
364
365 class AllCPUs {
366 private:
367 std::vector<std::vector<std::string>> names;
368 std::vector<std::set<CPUtype>> compat;
369
Set(int type,...)370 std::set<CPUtype> Set(int type, ...) {
371 std::set<CPUtype> retn;
372 va_list args;
373
374 retn.insert((CPUtype)type);
375 va_start(args, type);
376 while ((type = va_arg(args, int)) != CPU_None)
377 retn.insert((CPUtype)type);
378 va_end(args);
379
380 return retn;
381 }
382
383 public:
AllCPUs()384 AllCPUs() {
385 names = std::vector<std::vector<std::string>>(sizeofCPUtype);
386 compat = std::vector<std::set<CPUtype>>(sizeofCPUtype);
387
388 names[CPU_None].push_back("");
389
390 names[CPU_x86_64].push_back("x86-64");
391
392 names[CPU_Bonnell].push_back("atom");
393 names[CPU_Bonnell].push_back("bonnell");
394
395 names[CPU_Core2].push_back("core2");
396
397 names[CPU_Penryn].push_back("penryn");
398
399 names[CPU_Silvermont].push_back("slm");
400 names[CPU_Silvermont].push_back("silvermont");
401
402 names[CPU_Nehalem].push_back("corei7");
403 names[CPU_Nehalem].push_back("nehalem");
404
405 names[CPU_PS4].push_back("btver2");
406 names[CPU_PS4].push_back("ps4");
407
408 names[CPU_SandyBridge].push_back("corei7-avx");
409 names[CPU_SandyBridge].push_back("sandybridge");
410
411 names[CPU_IvyBridge].push_back("core-avx-i");
412 names[CPU_IvyBridge].push_back("ivybridge");
413
414 names[CPU_Haswell].push_back("core-avx2");
415 names[CPU_Haswell].push_back("haswell");
416
417 names[CPU_Broadwell].push_back("broadwell");
418
419 names[CPU_KNL].push_back("knl");
420
421 names[CPU_SKX].push_back("skx");
422
423 names[CPU_ICL].push_back("icelake-client");
424 names[CPU_ICL].push_back("icl");
425
426 names[CPU_ICX].push_back("icelake-server");
427 names[CPU_ICX].push_back("icx");
428 names[CPU_TGL].push_back("tigerlake");
429 names[CPU_TGL].push_back("tgl");
430 #if ISPC_LLVM_VERSION >= ISPC_LLVM_12_0
431 names[CPU_ADL].push_back("alderlake");
432 names[CPU_ADL].push_back("adl");
433 names[CPU_SPR].push_back("sapphirerapids");
434 names[CPU_SPR].push_back("spr");
435 #endif
436
437 #ifdef ISPC_ARM_ENABLED
438 names[CPU_CortexA9].push_back("cortex-a9");
439 names[CPU_CortexA15].push_back("cortex-a15");
440 names[CPU_CortexA35].push_back("cortex-a35");
441 names[CPU_CortexA53].push_back("cortex-a53");
442 names[CPU_CortexA57].push_back("cortex-a57");
443
444 names[CPU_AppleA7].push_back("apple-a7");
445 names[CPU_AppleA10].push_back("apple-a10");
446 names[CPU_AppleA11].push_back("apple-a11");
447 names[CPU_AppleA12].push_back("apple-a12");
448 names[CPU_AppleA13].push_back("apple-a13");
449 #if ISPC_LLVM_VERSION >= ISPC_LLVM_12_0
450 names[CPU_AppleA14].push_back("apple-a14");
451 #endif
452 #endif
453
454 #ifdef ISPC_GENX_ENABLED
455 names[CPU_GENX].push_back("SKL");
456 names[CPU_GENX_TGLLP].push_back("TGLLP");
457 names[CPU_GENX_TGLLP].push_back("DG1");
458 #endif
459
460 Assert(names.size() == sizeofCPUtype);
461
462 compat[CPU_Silvermont] =
463 Set(CPU_x86_64, CPU_Bonnell, CPU_Penryn, CPU_Core2, CPU_Nehalem, CPU_Silvermont, CPU_None);
464
465 compat[CPU_KNL] = Set(CPU_KNL, CPU_x86_64, CPU_Bonnell, CPU_Penryn, CPU_Core2, CPU_Nehalem, CPU_Silvermont,
466 CPU_SandyBridge, CPU_IvyBridge, CPU_Haswell, CPU_Broadwell, CPU_None);
467
468 compat[CPU_SKX] = Set(CPU_SKX, CPU_x86_64, CPU_Bonnell, CPU_Penryn, CPU_Core2, CPU_Nehalem, CPU_Silvermont,
469 CPU_SandyBridge, CPU_IvyBridge, CPU_Haswell, CPU_Broadwell, CPU_None);
470 #if ISPC_LLVM_VERSION >= ISPC_LLVM_12_0
471 compat[CPU_SPR] =
472 Set(CPU_SPR, CPU_x86_64, CPU_Bonnell, CPU_Penryn, CPU_Core2, CPU_Nehalem, CPU_Silvermont, CPU_SandyBridge,
473 CPU_IvyBridge, CPU_Haswell, CPU_Broadwell, CPU_SKX, CPU_ICL, CPU_ICX, CPU_TGL, CPU_ADL, CPU_None);
474 compat[CPU_ADL] = Set(CPU_ADL, CPU_x86_64, CPU_Bonnell, CPU_Penryn, CPU_Core2, CPU_Nehalem, CPU_Silvermont,
475 CPU_SandyBridge, CPU_IvyBridge, CPU_Haswell, CPU_Broadwell, CPU_None);
476 #endif
477 compat[CPU_TGL] =
478 Set(CPU_TGL, CPU_x86_64, CPU_Bonnell, CPU_Penryn, CPU_Core2, CPU_Nehalem, CPU_Silvermont, CPU_SandyBridge,
479 CPU_IvyBridge, CPU_Haswell, CPU_Broadwell, CPU_SKX, CPU_ICL, CPU_ICX, CPU_None);
480 compat[CPU_ICX] = Set(CPU_ICX, CPU_x86_64, CPU_Bonnell, CPU_Penryn, CPU_Core2, CPU_Nehalem, CPU_Silvermont,
481 CPU_SandyBridge, CPU_IvyBridge, CPU_Haswell, CPU_Broadwell, CPU_SKX, CPU_ICL, CPU_None);
482
483 compat[CPU_ICL] = Set(CPU_ICL, CPU_x86_64, CPU_Bonnell, CPU_Penryn, CPU_Core2, CPU_Nehalem, CPU_Silvermont,
484 CPU_SandyBridge, CPU_IvyBridge, CPU_Haswell, CPU_Broadwell, CPU_SKX, CPU_None);
485
486 compat[CPU_Broadwell] = Set(CPU_x86_64, CPU_Bonnell, CPU_Penryn, CPU_Core2, CPU_Nehalem, CPU_Silvermont,
487 CPU_SandyBridge, CPU_IvyBridge, CPU_Haswell, CPU_Broadwell, CPU_None);
488 compat[CPU_Haswell] = Set(CPU_x86_64, CPU_Bonnell, CPU_Penryn, CPU_Core2, CPU_Nehalem, CPU_Silvermont,
489 CPU_SandyBridge, CPU_IvyBridge, CPU_Haswell, CPU_Broadwell, CPU_None);
490 compat[CPU_IvyBridge] = Set(CPU_x86_64, CPU_Bonnell, CPU_Penryn, CPU_Core2, CPU_Nehalem, CPU_Silvermont,
491 CPU_SandyBridge, CPU_IvyBridge, CPU_None);
492 compat[CPU_SandyBridge] =
493 Set(CPU_x86_64, CPU_Bonnell, CPU_Penryn, CPU_Core2, CPU_Nehalem, CPU_Silvermont, CPU_SandyBridge, CPU_None);
494 compat[CPU_PS4] = Set(CPU_x86_64, CPU_Bonnell, CPU_Penryn, CPU_Core2, CPU_Nehalem, CPU_Silvermont,
495 CPU_SandyBridge, CPU_PS4, CPU_None);
496 compat[CPU_Nehalem] =
497 Set(CPU_x86_64, CPU_Bonnell, CPU_Penryn, CPU_Core2, CPU_Nehalem, CPU_Silvermont, CPU_None);
498 compat[CPU_Penryn] = Set(CPU_x86_64, CPU_Bonnell, CPU_Penryn, CPU_Core2, CPU_Nehalem, CPU_Silvermont, CPU_None);
499 compat[CPU_Core2] = Set(CPU_x86_64, CPU_Bonnell, CPU_Core2, CPU_None);
500 compat[CPU_Bonnell] = Set(CPU_x86_64, CPU_Bonnell, CPU_Core2, CPU_None);
501
502 compat[CPU_x86_64] = Set(CPU_x86_64, CPU_None);
503
504 #ifdef ISPC_ARM_ENABLED
505 compat[CPU_CortexA15] = Set(CPU_CortexA9, CPU_CortexA15, CPU_None);
506 compat[CPU_CortexA9] = Set(CPU_CortexA9, CPU_None);
507 compat[CPU_CortexA35] = Set(CPU_CortexA35, CPU_None);
508 compat[CPU_CortexA53] = Set(CPU_CortexA53, CPU_None);
509 compat[CPU_CortexA57] = Set(CPU_CortexA57, CPU_None);
510 compat[CPU_AppleA7] = Set(CPU_AppleA7, CPU_None);
511 compat[CPU_AppleA10] = Set(CPU_AppleA10, CPU_None);
512 compat[CPU_AppleA11] = Set(CPU_AppleA11, CPU_None);
513 compat[CPU_AppleA12] = Set(CPU_AppleA12, CPU_None);
514 compat[CPU_AppleA13] = Set(CPU_AppleA13, CPU_None);
515 #if ISPC_LLVM_VERSION >= ISPC_LLVM_12_0
516 compat[CPU_AppleA14] = Set(CPU_AppleA14, CPU_None);
517 #endif
518 #endif
519
520 #ifdef ISPC_GENX_ENABLED
521 compat[CPU_GENX] = Set(CPU_GENX, CPU_None);
522 compat[CPU_GENX_TGLLP] = Set(CPU_GENX_TGLLP, CPU_GENX, CPU_None);
523 #endif
524 }
525
HumanReadableListOfNames()526 std::string HumanReadableListOfNames() {
527 std::stringstream CPUs;
528 for (int i = CPU_x86_64; i < sizeofCPUtype; i++) {
529 CPUs << names[i][0];
530 if (names[i].size() > 1) {
531 CPUs << " (synonyms: " << names[i][1];
532 for (int j = 2, je = names[i].size(); j < je; j++)
533 CPUs << ", " << names[i][j];
534 CPUs << ")";
535 }
536 if (i < sizeofCPUtype - 1)
537 CPUs << ", ";
538 }
539 return CPUs.str();
540 }
541
GetDefaultNameFromType(CPUtype type)542 std::string &GetDefaultNameFromType(CPUtype type) {
543 Assert((type >= CPU_None) && (type < sizeofCPUtype));
544 return names[type][0];
545 }
546
GetTypeFromName(std::string name)547 CPUtype GetTypeFromName(std::string name) {
548 CPUtype retn = CPU_None;
549
550 for (int i = 1; (retn == CPU_None) && (i < sizeofCPUtype); i++)
551 for (int j = 0, je = names[i].size(); (retn == CPU_None) && (j < je); j++)
552 if (!name.compare(names[i][j]))
553 retn = (CPUtype)i;
554 return retn;
555 }
556
BackwardCompatible(CPUtype what,CPUtype with)557 bool BackwardCompatible(CPUtype what, CPUtype with) {
558 Assert((what > CPU_None) && (what < sizeofCPUtype));
559 Assert((with > CPU_None) && (with < sizeofCPUtype));
560 return compat[what].find(with) != compat[what].end();
561 }
562 };
563
Target(Arch arch,const char * cpu,ISPCTarget ispc_target,bool pic,bool printTarget)564 Target::Target(Arch arch, const char *cpu, ISPCTarget ispc_target, bool pic, bool printTarget)
565 : m_target(NULL), m_targetMachine(NULL), m_dataLayout(NULL), m_valid(false), m_ispc_target(ispc_target),
566 m_isa(SSE2), m_arch(Arch::none), m_is32Bit(true), m_cpu(""), m_attributes(""), m_tf_attributes(NULL),
567 m_nativeVectorWidth(-1), m_nativeVectorAlignment(-1), m_dataTypeWidth(-1), m_vectorWidth(-1), m_generatePIC(pic),
568 m_maskingIsFree(false), m_maskBitCount(-1), m_hasHalf(false), m_hasRand(false), m_hasGather(false),
569 m_hasScatter(false), m_hasTranscendentals(false), m_hasTrigonometry(false), m_hasRsqrtd(false), m_hasRcpd(false),
570 m_hasVecPrefetch(false), m_hasSaturatingArithmetic(false), m_hasFp64Support(true),
571 m_warnFtoU32IsExpensive(false) {
572 CPUtype CPUID = CPU_None, CPUfromISA = CPU_None;
573 AllCPUs a;
574 std::string featuresString;
575
576 if (cpu) {
577 CPUID = a.GetTypeFromName(cpu);
578 if (CPUID == CPU_None) {
579 Error(SourcePos(),
580 "Error: CPU type \"%s\" unknown. Supported"
581 " CPUs: %s.",
582 cpu, a.HumanReadableListOfNames().c_str());
583 return;
584 }
585 }
586
587 if (m_ispc_target == ISPCTarget::none) {
588 // If a CPU was specified explicitly, try to pick the best
589 // possible ISA based on that.
590 switch (CPUID) {
591 case CPU_None: {
592 // No CPU and no ISA, so use system info to figure out
593 // what this CPU supports.
594 m_ispc_target = lGetSystemISA();
595 std::string target_string = ISPCTargetToString(m_ispc_target);
596 Warning(SourcePos(),
597 "No --target specified on command-line."
598 " Using default system target \"%s\".",
599 target_string.c_str());
600 break;
601 }
602
603 #ifdef ISPC_ARM_ENABLED
604 case CPU_CortexA9:
605 case CPU_CortexA15:
606 case CPU_CortexA35:
607 case CPU_CortexA53:
608 case CPU_CortexA57:
609 case CPU_AppleA7:
610 case CPU_AppleA10:
611 case CPU_AppleA11:
612 case CPU_AppleA12:
613 case CPU_AppleA13:
614 #if ISPC_LLVM_VERSION >= ISPC_LLVM_12_0
615 case CPU_AppleA14:
616 #endif
617 m_ispc_target = ISPCTarget::neon_i32x4;
618 break;
619 #endif
620
621 #ifdef ISPC_GENX_ENABLED
622 case CPU_GENX:
623 m_ispc_target = ISPCTarget::genx_x16;
624 break;
625 case CPU_GENX_TGLLP:
626 m_ispc_target = ISPCTarget::genx_x16;
627 break;
628 #endif
629
630 case CPU_KNL:
631 m_ispc_target = ISPCTarget::avx512knl_i32x16;
632 break;
633
634 #if ISPC_LLVM_VERSION >= ISPC_LLVM_12_0
635 case CPU_SPR:
636 #endif
637 case CPU_TGL:
638 case CPU_ICX:
639 case CPU_ICL:
640 case CPU_SKX:
641 m_ispc_target = ISPCTarget::avx512skx_i32x16;
642 break;
643
644 #if ISPC_LLVM_VERSION >= ISPC_LLVM_12_0
645 case CPU_ADL:
646 #endif
647 case CPU_Broadwell:
648 case CPU_Haswell:
649 m_ispc_target = ISPCTarget::avx2_i32x8;
650 break;
651
652 case CPU_IvyBridge:
653 case CPU_SandyBridge:
654 m_ispc_target = ISPCTarget::avx1_i32x8;
655 break;
656
657 // Penryn is here because ISPC does not use SSE 4.2
658 case CPU_Penryn:
659 case CPU_Nehalem:
660 case CPU_Silvermont:
661 m_ispc_target = ISPCTarget::sse4_i32x4;
662 break;
663
664 case CPU_PS4:
665 m_ispc_target = ISPCTarget::avx1_i32x4;
666 break;
667
668 default:
669 m_ispc_target = ISPCTarget::sse2_i32x4;
670 break;
671 }
672 if (CPUID != CPU_None) {
673 std::string target_string = ISPCTargetToString(m_ispc_target);
674 Warning(SourcePos(),
675 "No --target specified on command-line."
676 " Using ISA \"%s\" based on specified CPU \"%s\".",
677 target_string.c_str(), cpu);
678 }
679 }
680
681 if (m_ispc_target == ISPCTarget::host) {
682 m_ispc_target = lGetSystemISA();
683 }
684
685 if (arch == Arch::none) {
686 #ifdef ISPC_ARM_ENABLED
687 if (ISPCTargetIsNeon(m_ispc_target)) {
688 #if defined(__arm__)
689 arch = Arch::arm;
690 #else
691 arch = Arch::aarch64;
692 #endif
693 } else
694 #endif
695 #if ISPC_GENX_ENABLED
696 if (ISPCTargetIsGen(m_ispc_target)) {
697 arch = Arch::genx64;
698 } else
699 #endif
700 arch = Arch::x86_64;
701 }
702
703 bool error = false;
704 // Make sure the target architecture is a known one; print an error
705 // with the valid ones otherwise.
706 for (llvm::TargetRegistry::iterator iter = llvm::TargetRegistry::targets().begin();
707 iter != llvm::TargetRegistry::targets().end(); ++iter) {
708 if (ArchToString(arch) == iter->getName()) {
709 this->m_target = &*iter;
710 break;
711 }
712 }
713 // For gen target we do not need to create target/targetMachine
714 if (this->m_target == NULL && !ISPCTargetIsGen(m_ispc_target)) {
715 std::string error_message;
716 error_message = "Invalid architecture \"";
717 error_message += ArchToString(arch);
718 error_message += "\"\nOptions: ";
719 llvm::TargetRegistry::iterator iter;
720 const char *separator = "";
721 for (iter = llvm::TargetRegistry::targets().begin(); iter != llvm::TargetRegistry::targets().end(); ++iter) {
722 error_message += separator;
723 error_message += iter->getName();
724 separator = ", ";
725 }
726 error_message += ".";
727 Error(SourcePos(), "%s", error_message.c_str());
728 error = true;
729 } else {
730 this->m_arch = arch;
731 }
732
733 // Ensure that we have a valid target/arch combination.
734 if (!lIsTargetValidforArch(m_ispc_target, arch)) {
735 std::string str_arch = ArchToString(arch);
736 std::string target_string = ISPCTargetToString(m_ispc_target);
737 Error(SourcePos(), "arch = %s and target = %s is not a valid combination.", str_arch.c_str(),
738 target_string.c_str());
739 return;
740 }
741 #ifdef ISPC_GENX_ENABLED
742 if ((ISPCTargetIsGen(m_ispc_target)) && (CPUID == CPU_GENX_TGLLP)) {
743 m_hasFp64Support = false;
744 }
745 // In case of gen target addressing should correspond to host addressing. Otherwise SVM pointers will not work.
746 if (arch == Arch::genx32) {
747 g->opt.force32BitAddressing = true;
748 } else if (arch == Arch::genx64) {
749 g->opt.force32BitAddressing = false;
750 }
751 #endif
752 // Check default LLVM generated targets
753 bool unsupported_target = false;
754 switch (m_ispc_target) {
755 case ISPCTarget::sse2_i32x4:
756 this->m_isa = Target::SSE2;
757 this->m_nativeVectorWidth = 4;
758 this->m_nativeVectorAlignment = 16;
759 this->m_dataTypeWidth = 32;
760 this->m_vectorWidth = 4;
761 this->m_maskingIsFree = false;
762 this->m_maskBitCount = 32;
763 this->m_warnFtoU32IsExpensive = true;
764 CPUfromISA = CPU_x86_64;
765 break;
766 case ISPCTarget::sse2_i32x8:
767 this->m_isa = Target::SSE2;
768 this->m_nativeVectorWidth = 4;
769 this->m_nativeVectorAlignment = 16;
770 this->m_dataTypeWidth = 32;
771 this->m_vectorWidth = 8;
772 this->m_maskingIsFree = false;
773 this->m_maskBitCount = 32;
774 this->m_warnFtoU32IsExpensive = true;
775 CPUfromISA = CPU_Core2;
776 break;
777 case ISPCTarget::sse4_i8x16:
778 this->m_isa = Target::SSE4;
779 this->m_nativeVectorWidth = 16;
780 this->m_nativeVectorAlignment = 16;
781 this->m_dataTypeWidth = 8;
782 this->m_vectorWidth = 16;
783 this->m_maskingIsFree = false;
784 this->m_maskBitCount = 8;
785 this->m_warnFtoU32IsExpensive = true;
786 CPUfromISA = CPU_Nehalem;
787 break;
788 case ISPCTarget::sse4_i16x8:
789 this->m_isa = Target::SSE4;
790 this->m_nativeVectorWidth = 8;
791 this->m_nativeVectorAlignment = 16;
792 this->m_dataTypeWidth = 16;
793 this->m_vectorWidth = 8;
794 this->m_maskingIsFree = false;
795 this->m_maskBitCount = 16;
796 this->m_warnFtoU32IsExpensive = true;
797 CPUfromISA = CPU_Nehalem;
798 break;
799 case ISPCTarget::sse4_i32x4:
800 this->m_isa = Target::SSE4;
801 this->m_nativeVectorWidth = 4;
802 this->m_nativeVectorAlignment = 16;
803 this->m_dataTypeWidth = 32;
804 this->m_vectorWidth = 4;
805 this->m_maskingIsFree = false;
806 this->m_maskBitCount = 32;
807 this->m_warnFtoU32IsExpensive = true;
808 CPUfromISA = CPU_Nehalem;
809 break;
810 case ISPCTarget::sse4_i32x8:
811 this->m_isa = Target::SSE4;
812 this->m_nativeVectorWidth = 4;
813 this->m_nativeVectorAlignment = 16;
814 this->m_dataTypeWidth = 32;
815 this->m_vectorWidth = 8;
816 this->m_maskingIsFree = false;
817 this->m_maskBitCount = 32;
818 this->m_warnFtoU32IsExpensive = true;
819 CPUfromISA = CPU_Nehalem;
820 break;
821 case ISPCTarget::avx1_i32x4:
822 this->m_isa = Target::AVX;
823 this->m_nativeVectorWidth = 8;
824 this->m_nativeVectorAlignment = 32;
825 this->m_dataTypeWidth = 32;
826 this->m_vectorWidth = 4;
827 this->m_maskingIsFree = false;
828 this->m_maskBitCount = 32;
829 this->m_warnFtoU32IsExpensive = true;
830 CPUfromISA = CPU_SandyBridge;
831 break;
832 case ISPCTarget::avx1_i32x8:
833 this->m_isa = Target::AVX;
834 this->m_nativeVectorWidth = 8;
835 this->m_nativeVectorAlignment = 32;
836 this->m_dataTypeWidth = 32;
837 this->m_vectorWidth = 8;
838 this->m_maskingIsFree = false;
839 this->m_maskBitCount = 32;
840 this->m_warnFtoU32IsExpensive = true;
841 CPUfromISA = CPU_SandyBridge;
842 break;
843 case ISPCTarget::avx1_i32x16:
844 this->m_isa = Target::AVX;
845 this->m_nativeVectorWidth = 8;
846 this->m_nativeVectorAlignment = 32;
847 this->m_dataTypeWidth = 32;
848 this->m_vectorWidth = 16;
849 this->m_maskingIsFree = false;
850 this->m_maskBitCount = 32;
851 this->m_warnFtoU32IsExpensive = true;
852 CPUfromISA = CPU_SandyBridge;
853 break;
854 case ISPCTarget::avx1_i64x4:
855 this->m_isa = Target::AVX;
856 this->m_nativeVectorWidth = 8; /* native vector width in terms of floats */
857 this->m_nativeVectorAlignment = 32;
858 this->m_dataTypeWidth = 64;
859 this->m_vectorWidth = 4;
860 this->m_maskingIsFree = false;
861 this->m_maskBitCount = 64;
862 this->m_warnFtoU32IsExpensive = true;
863 CPUfromISA = CPU_SandyBridge;
864 break;
865 case ISPCTarget::avx2_i8x32:
866 this->m_isa = Target::AVX2;
867 this->m_nativeVectorWidth = 32;
868 this->m_nativeVectorAlignment = 32;
869 this->m_dataTypeWidth = 8;
870 this->m_vectorWidth = 32;
871 this->m_maskingIsFree = false;
872 this->m_maskBitCount = 8;
873 this->m_hasHalf = true;
874 this->m_hasRand = true;
875 this->m_hasGather = true;
876 this->m_warnFtoU32IsExpensive = true;
877 CPUfromISA = CPU_Haswell;
878 break;
879 case ISPCTarget::avx2_i16x16:
880 this->m_isa = Target::AVX2;
881 this->m_nativeVectorWidth = 16;
882 this->m_nativeVectorAlignment = 32;
883 this->m_dataTypeWidth = 16;
884 this->m_vectorWidth = 16;
885 this->m_maskingIsFree = false;
886 this->m_maskBitCount = 16;
887 this->m_hasHalf = true;
888 this->m_hasRand = true;
889 this->m_hasGather = true;
890 this->m_warnFtoU32IsExpensive = true;
891 CPUfromISA = CPU_Haswell;
892 break;
893 case ISPCTarget::avx2_i32x4:
894 this->m_isa = Target::AVX2;
895 this->m_nativeVectorWidth = 8;
896 this->m_nativeVectorAlignment = 32;
897 this->m_dataTypeWidth = 32;
898 this->m_vectorWidth = 4;
899 this->m_maskingIsFree = false;
900 this->m_maskBitCount = 32;
901 this->m_hasHalf = true;
902 this->m_hasRand = true;
903 this->m_hasGather = true;
904 this->m_warnFtoU32IsExpensive = true;
905 CPUfromISA = CPU_Haswell;
906 break;
907 case ISPCTarget::avx2_i32x8:
908 this->m_isa = Target::AVX2;
909 this->m_nativeVectorWidth = 8;
910 this->m_nativeVectorAlignment = 32;
911 this->m_dataTypeWidth = 32;
912 this->m_vectorWidth = 8;
913 this->m_maskingIsFree = false;
914 this->m_maskBitCount = 32;
915 this->m_hasHalf = true;
916 this->m_hasRand = true;
917 this->m_hasGather = true;
918 this->m_warnFtoU32IsExpensive = true;
919 CPUfromISA = CPU_Haswell;
920 break;
921 case ISPCTarget::avx2_i32x16:
922 this->m_isa = Target::AVX2;
923 this->m_nativeVectorWidth = 16;
924 this->m_nativeVectorAlignment = 32;
925 this->m_dataTypeWidth = 32;
926 this->m_vectorWidth = 16;
927 this->m_maskingIsFree = false;
928 this->m_maskBitCount = 32;
929 this->m_hasHalf = true;
930 this->m_hasRand = true;
931 this->m_hasGather = true;
932 this->m_warnFtoU32IsExpensive = true;
933 CPUfromISA = CPU_Haswell;
934 break;
935 case ISPCTarget::avx2_i64x4:
936 this->m_isa = Target::AVX2;
937 this->m_nativeVectorWidth = 8; /* native vector width in terms of floats */
938 this->m_nativeVectorAlignment = 32;
939 this->m_dataTypeWidth = 64;
940 this->m_vectorWidth = 4;
941 this->m_maskingIsFree = false;
942 this->m_maskBitCount = 64;
943 this->m_hasHalf = true;
944 this->m_hasRand = true;
945 this->m_hasGather = true;
946 this->m_warnFtoU32IsExpensive = true;
947 CPUfromISA = CPU_Haswell;
948 break;
949 case ISPCTarget::avx512knl_i32x16:
950 this->m_isa = Target::KNL_AVX512;
951 this->m_nativeVectorWidth = 16;
952 this->m_nativeVectorAlignment = 64;
953 this->m_dataTypeWidth = 32;
954 this->m_vectorWidth = 16;
955 this->m_maskingIsFree = true;
956 this->m_maskBitCount = 1;
957 this->m_hasHalf = true;
958 this->m_hasRand = true;
959 this->m_hasGather = this->m_hasScatter = true;
960 this->m_hasTranscendentals = false;
961 // For MIC it is set to true due to performance reasons. The option should be tested.
962 this->m_hasTrigonometry = false;
963 this->m_hasRsqrtd = this->m_hasRcpd = false;
964 this->m_hasVecPrefetch = false;
965 CPUfromISA = CPU_KNL;
966 break;
967 case ISPCTarget::avx512skx_i32x8:
968 this->m_isa = Target::SKX_AVX512;
969 this->m_nativeVectorWidth = 16;
970 this->m_nativeVectorAlignment = 64;
971 this->m_dataTypeWidth = 32;
972 this->m_vectorWidth = 8;
973 this->m_maskingIsFree = true;
974 this->m_maskBitCount = 1;
975 this->m_hasHalf = true;
976 this->m_hasRand = true;
977 this->m_hasGather = this->m_hasScatter = true;
978 this->m_hasTranscendentals = false;
979 this->m_hasTrigonometry = false;
980 this->m_hasRsqrtd = this->m_hasRcpd = false;
981 this->m_hasVecPrefetch = false;
982 CPUfromISA = CPU_SKX;
983 this->m_funcAttributes.push_back(std::make_pair("prefer-vector-width", "256"));
984 this->m_funcAttributes.push_back(std::make_pair("min-legal-vector-width", "256"));
985 break;
986 case ISPCTarget::avx512skx_i32x16:
987 this->m_isa = Target::SKX_AVX512;
988 this->m_nativeVectorWidth = 16;
989 this->m_nativeVectorAlignment = 64;
990 this->m_dataTypeWidth = 32;
991 this->m_vectorWidth = 16;
992 this->m_maskingIsFree = true;
993 this->m_maskBitCount = 1;
994 this->m_hasHalf = true;
995 this->m_hasRand = true;
996 this->m_hasGather = this->m_hasScatter = true;
997 this->m_hasTranscendentals = false;
998 this->m_hasTrigonometry = false;
999 this->m_hasRsqrtd = this->m_hasRcpd = false;
1000 this->m_hasVecPrefetch = false;
1001 CPUfromISA = CPU_SKX;
1002 if (g->opt.disableZMM) {
1003 this->m_funcAttributes.push_back(std::make_pair("prefer-vector-width", "256"));
1004 this->m_funcAttributes.push_back(std::make_pair("min-legal-vector-width", "256"));
1005 } else {
1006 this->m_funcAttributes.push_back(std::make_pair("prefer-vector-width", "512"));
1007 this->m_funcAttributes.push_back(std::make_pair("min-legal-vector-width", "512"));
1008 }
1009 break;
1010 case ISPCTarget::avx512skx_i8x64:
1011 // This target is enabled only for LLVM 10.0 and later
1012 // because LLVM requires a number of fixes, which are
1013 // committed to LLVM 11.0 and can be applied to 10.0, but not
1014 // earlier versions.
1015 this->m_isa = Target::SKX_AVX512;
1016 this->m_nativeVectorWidth = 64;
1017 this->m_nativeVectorAlignment = 64;
1018 this->m_dataTypeWidth = 8;
1019 this->m_vectorWidth = 64;
1020 this->m_maskingIsFree = true;
1021 this->m_maskBitCount = 1;
1022 this->m_hasHalf = true;
1023 this->m_hasRand = true;
1024 this->m_hasGather = this->m_hasScatter = true;
1025 this->m_hasTranscendentals = false;
1026 this->m_hasTrigonometry = false;
1027 this->m_hasRsqrtd = this->m_hasRcpd = false;
1028 this->m_hasVecPrefetch = false;
1029 CPUfromISA = CPU_SKX;
1030 break;
1031 case ISPCTarget::avx512skx_i16x32:
1032 // This target is enabled only for LLVM 10.0 and later
1033 // because LLVM requires a number of fixes, which are
1034 // committed to LLVM 11.0 and can be applied to 10.0, but not
1035 // earlier versions.
1036 this->m_isa = Target::SKX_AVX512;
1037 this->m_nativeVectorWidth = 64;
1038 this->m_nativeVectorAlignment = 64;
1039 this->m_dataTypeWidth = 16;
1040 this->m_vectorWidth = 32;
1041 this->m_maskingIsFree = true;
1042 this->m_maskBitCount = 1;
1043 this->m_hasHalf = true;
1044 this->m_hasRand = true;
1045 this->m_hasGather = this->m_hasScatter = true;
1046 this->m_hasTranscendentals = false;
1047 this->m_hasTrigonometry = false;
1048 this->m_hasRsqrtd = this->m_hasRcpd = false;
1049 this->m_hasVecPrefetch = false;
1050 CPUfromISA = CPU_SKX;
1051 break;
1052 #ifdef ISPC_ARM_ENABLED
1053 case ISPCTarget::neon_i8x16:
1054 this->m_isa = Target::NEON;
1055 this->m_nativeVectorWidth = 16;
1056 this->m_nativeVectorAlignment = 16;
1057 this->m_dataTypeWidth = 8;
1058 this->m_vectorWidth = 16;
1059 this->m_hasHalf = true; // ??
1060 // https://github.com/ispc/ispc/issues/2052
1061 // AArch64 disables Coherent Control Flow optimization because of a bug in
1062 // LLVM aarch64 back-end that reduces the efficiency of simplifyCFG.
1063 // Branches added by CCF can only be removed after the back-end formed
1064 // fused-multiply-adds. This reduces the quality of code as most of scalar
1065 // optimizations will not apply.
1066 // FIXME: Consider turning this optimization back on after
1067 // https://reviews.llvm.org/D100963 gets committed to LLVM-13.
1068 // This note applies to all NEON targets below.
1069 this->m_maskingIsFree = (arch == Arch::aarch64);
1070 this->m_maskBitCount = 8;
1071 break;
1072 case ISPCTarget::neon_i16x8:
1073 this->m_isa = Target::NEON;
1074 this->m_nativeVectorWidth = 8;
1075 this->m_nativeVectorAlignment = 16;
1076 this->m_dataTypeWidth = 16;
1077 this->m_vectorWidth = 8;
1078 this->m_hasHalf = true; // ??
1079 this->m_maskingIsFree = (arch == Arch::aarch64);
1080 this->m_maskBitCount = 16;
1081 break;
1082 case ISPCTarget::neon_i32x4:
1083 this->m_isa = Target::NEON;
1084 this->m_nativeVectorWidth = 4;
1085 this->m_nativeVectorAlignment = 16;
1086 this->m_dataTypeWidth = 32;
1087 this->m_vectorWidth = 4;
1088 this->m_hasHalf = true; // ??
1089 this->m_maskingIsFree = (arch == Arch::aarch64);
1090 this->m_maskBitCount = 32;
1091 break;
1092 case ISPCTarget::neon_i32x8:
1093 this->m_isa = Target::NEON;
1094 this->m_nativeVectorWidth = 4;
1095 this->m_nativeVectorAlignment = 16;
1096 this->m_dataTypeWidth = 32;
1097 this->m_vectorWidth = 8;
1098 this->m_hasHalf = true; // ??
1099 this->m_maskingIsFree = (arch == Arch::aarch64);
1100 this->m_maskBitCount = 32;
1101 break;
1102 #else
1103 case ISPCTarget::neon_i8x16:
1104 case ISPCTarget::neon_i16x8:
1105 case ISPCTarget::neon_i32x4:
1106 case ISPCTarget::neon_i32x8:
1107 unsupported_target = true;
1108 break;
1109 #endif
1110 #ifdef ISPC_WASM_ENABLED
1111 case ISPCTarget::wasm_i32x4:
1112 this->m_isa = Target::WASM;
1113 this->m_nativeVectorWidth = 4;
1114 this->m_nativeVectorAlignment = 16;
1115 this->m_dataTypeWidth = 32;
1116 this->m_vectorWidth = 4;
1117 this->m_hasHalf = false;
1118 this->m_maskingIsFree = false;
1119 this->m_maskBitCount = 32;
1120 this->m_hasTranscendentals = false;
1121 this->m_hasTrigonometry = false;
1122 this->m_hasRcpd = false;
1123 this->m_hasRsqrtd = false;
1124 this->m_hasScatter = false;
1125 this->m_hasGather = false;
1126 this->m_hasVecPrefetch = false;
1127 break;
1128 #else
1129 case ISPCTarget::wasm_i32x4:
1130 unsupported_target = true;
1131 break;
1132 #endif
1133 #ifdef ISPC_GENX_ENABLED
1134 case ISPCTarget::genx_x8:
1135 this->m_isa = Target::GENX;
1136 this->m_nativeVectorWidth = 8;
1137 this->m_nativeVectorAlignment = 64;
1138 this->m_vectorWidth = 8;
1139 this->m_dataTypeWidth = 32;
1140 this->m_hasHalf = true;
1141 this->m_maskingIsFree = true;
1142 this->m_maskBitCount = 1;
1143 this->m_hasSaturatingArithmetic = true;
1144 this->m_hasTranscendentals = true;
1145 this->m_hasTrigonometry = true;
1146 this->m_hasGather = this->m_hasScatter = true;
1147 CPUfromISA = CPU_GENX;
1148 break;
1149 case ISPCTarget::genx_x16:
1150 this->m_isa = Target::GENX;
1151 this->m_nativeVectorWidth = 16;
1152 this->m_nativeVectorAlignment = 64;
1153 this->m_vectorWidth = 16;
1154 this->m_dataTypeWidth = 32;
1155 this->m_hasHalf = true;
1156 this->m_maskingIsFree = true;
1157 this->m_maskBitCount = 1;
1158 this->m_hasSaturatingArithmetic = true;
1159 this->m_hasTranscendentals = true;
1160 this->m_hasTrigonometry = true;
1161 this->m_hasGather = this->m_hasScatter = true;
1162 CPUfromISA = CPU_GENX;
1163 break;
1164 #else
1165 case ISPCTarget::genx_x8:
1166 case ISPCTarget::genx_x16:
1167 unsupported_target = true;
1168 break;
1169 #endif
1170 case ISPCTarget::none:
1171 case ISPCTarget::host:
1172 case ISPCTarget::error:
1173 unsupported_target = true;
1174 break;
1175 }
1176
1177 if (unsupported_target) {
1178 // Hitting one of unsupported targets is internal error.
1179 // Proper reporting about incorrect targets is done during options parsing.
1180 std::string target_string = "Problem with target (" + ISPCTargetToString(m_ispc_target) + ")";
1181 FATAL(target_string.c_str());
1182 }
1183
1184 #if defined(ISPC_ARM_ENABLED)
1185 if ((CPUID == CPU_None) && ISPCTargetIsNeon(m_ispc_target)) {
1186 if (arch == Arch::arm) {
1187 CPUID = CPU_CortexA9;
1188 } else if (arch == Arch::aarch64) {
1189 if (g->target_os == TargetOS::ios) {
1190 CPUID = CPU_AppleA7;
1191 } else if (g->target_os == TargetOS::macos) {
1192 // Open source LLVM doesn't has definition for M1 CPU, so use the latest iPhone CPU.
1193 #if ISPC_LLVM_VERSION >= ISPC_LLVM_12_0
1194 CPUID = CPU_AppleA14;
1195 #else
1196 CPUID = CPU_AppleA13;
1197 #endif
1198 } else {
1199 CPUID = CPU_CortexA35;
1200 }
1201 } else {
1202 UNREACHABLE();
1203 }
1204 }
1205 #endif
1206
1207 if (CPUID == CPU_None) {
1208 cpu = a.GetDefaultNameFromType(CPUfromISA).c_str();
1209 } else {
1210 if ((CPUfromISA != CPU_None) && !a.BackwardCompatible(CPUID, CPUfromISA)) {
1211 std::string target_string = ISPCTargetToString(m_ispc_target);
1212 Error(SourcePos(),
1213 "The requested CPU (%s) is incompatible"
1214 " with the CPU required for %s target (%s)",
1215 cpu, target_string.c_str(), a.GetDefaultNameFromType(CPUfromISA).c_str());
1216 return;
1217 }
1218 cpu = a.GetDefaultNameFromType(CPUID).c_str();
1219 }
1220 this->m_cpu = cpu;
1221
1222 if (!error) {
1223 // Create TargetMachine
1224 std::string triple = GetTripleString();
1225
1226 // The last validity check to ensure that supported for this target was enabled in the build.
1227 if (!g->target_registry->isSupported(m_ispc_target, g->target_os, arch)) {
1228 std::string target_string = ISPCTargetToString(m_ispc_target);
1229 std::string arch_str = ArchToString(arch);
1230 std::string os_str = OSToString(g->target_os);
1231 Error(SourcePos(), "%s target for %s on %s is not supported in current build.", target_string.c_str(),
1232 arch_str.c_str(), os_str.c_str());
1233 return;
1234 }
1235
1236 llvm::Optional<llvm::Reloc::Model> relocModel;
1237 if (m_generatePIC) {
1238 relocModel = llvm::Reloc::PIC_;
1239 }
1240 llvm::TargetOptions options;
1241 #ifdef ISPC_ARM_ENABLED
1242 options.FloatABIType = llvm::FloatABI::Hard;
1243 if (arch == Arch::arm) {
1244 if (g->target_os == TargetOS::custom_linux) {
1245 this->m_funcAttributes.push_back(std::make_pair("target-features", "+crypto,+fp-armv8,+neon,+sha2"));
1246 } else {
1247 this->m_funcAttributes.push_back(std::make_pair("target-features", "+neon,+fp16"));
1248 }
1249 featuresString = "+neon,+fp16";
1250 } else if (arch == Arch::aarch64) {
1251 if (g->target_os == TargetOS::custom_linux) {
1252 this->m_funcAttributes.push_back(
1253 std::make_pair("target-features", "+aes,+crc,+crypto,+fp-armv8,+neon,+sha2"));
1254 } else {
1255 this->m_funcAttributes.push_back(std::make_pair("target-features", "+neon"));
1256 }
1257 featuresString = "+neon";
1258 }
1259 #endif
1260
1261 // Support 'i64' and 'double' types in cm
1262 if (isGenXTarget())
1263 featuresString += "+longlong";
1264
1265 if (g->opt.disableFMA == false)
1266 options.AllowFPOpFusion = llvm::FPOpFusion::Fast;
1267
1268 // For gen target we do not need to create target/targetMachine
1269 if (!isGenXTarget()) {
1270 m_targetMachine = m_target->createTargetMachine(triple, m_cpu, featuresString, options, relocModel);
1271 Assert(m_targetMachine != NULL);
1272
1273 // Set Optimization level for llvm codegen based on Optimization level
1274 // requested by user via ISPC Optimization Flag. Mapping is :
1275 // ISPC O0 -> Codegen O0
1276 // ISPC O1,O2,O3,default -> Codegen O3
1277 llvm::CodeGenOpt::Level cOptLevel = llvm::CodeGenOpt::Level::Aggressive;
1278 switch (g->codegenOptLevel) {
1279 case Globals::CodegenOptLevel::None:
1280 cOptLevel = llvm::CodeGenOpt::Level::None;
1281 break;
1282
1283 case Globals::CodegenOptLevel::Aggressive:
1284 cOptLevel = llvm::CodeGenOpt::Level::Aggressive;
1285 break;
1286 }
1287 m_targetMachine->setOptLevel(cOptLevel);
1288
1289 m_targetMachine->Options.MCOptions.AsmVerbose = true;
1290
1291 // Change default version of generated DWARF.
1292 if (g->generateDWARFVersion != 0) {
1293 m_targetMachine->Options.MCOptions.DwarfVersion = g->generateDWARFVersion;
1294 }
1295 }
1296 // Initialize TargetData/DataLayout in 3 steps.
1297 // 1. Get default data layout first
1298 std::string dl_string;
1299 if (m_targetMachine != NULL)
1300 dl_string = m_targetMachine->createDataLayout().getStringRepresentation();
1301 if (isGenXTarget())
1302 dl_string = m_arch == Arch::genx64 ? "e-p:64:64-i64:64-n8:16:32" : "e-p:32:32-i64:64-n8:16:32";
1303
1304 // 2. Finally set member data
1305 m_dataLayout = new llvm::DataLayout(dl_string);
1306
1307 // Set is32Bit
1308 // This indicates if we are compiling for 32 bit platform and can assume 32 bit runtime.
1309
1310 this->m_is32Bit = (getDataLayout()->getPointerSize() == 4);
1311
1312 // TO-DO : Revisit addition of "target-features" and "target-cpu" for ARM support.
1313 llvm::AttrBuilder fattrBuilder;
1314 #ifdef ISPC_ARM_ENABLED
1315 if (m_isa == Target::NEON)
1316 fattrBuilder.addAttribute("target-cpu", this->m_cpu);
1317 #endif
1318 for (auto const &f_attr : m_funcAttributes)
1319 fattrBuilder.addAttribute(f_attr.first, f_attr.second);
1320 this->m_tf_attributes = new llvm::AttrBuilder(fattrBuilder);
1321
1322 Assert(this->m_vectorWidth <= ISPC_MAX_NVEC);
1323 }
1324
1325 m_valid = !error;
1326
1327 if (printTarget) {
1328 if (!isGenXTarget()) {
1329 printf("Target Triple: %s\n", m_targetMachine->getTargetTriple().str().c_str());
1330 printf("Target CPU: %s\n", m_targetMachine->getTargetCPU().str().c_str());
1331 printf("Target Feature String: %s\n", m_targetMachine->getTargetFeatureString().str().c_str());
1332 } else {
1333 printf("Target Triple: %s\n", this->GetTripleString().c_str());
1334 printf("Target GPU: %s\n", this->getCPU().c_str());
1335 printf("Target Feature String: %s\n", featuresString.c_str());
1336 }
1337 }
1338
1339 return;
1340 }
1341
checkIntrinsticSupport(llvm::StringRef name,SourcePos pos)1342 bool Target::checkIntrinsticSupport(llvm::StringRef name, SourcePos pos) {
1343 if (name.consume_front("llvm.") == false) {
1344 return false;
1345 }
1346 // x86 specific intrinsics are verified using 'CPUFeatures'.
1347 // TODO: Add relevant information to 'CPUFeatures' for non x86 targets.
1348 if (name.consume_front("x86.") == true) {
1349 if (!ISPCTargetIsX86(m_ispc_target)) {
1350 Error(pos, "LLVM intrinsic \"%s\" supported only on \"x86\" target architecture.", name.data());
1351 return false;
1352 }
1353 AllCPUs a;
1354 std::string featureName = name.substr(0, name.find('.')).str();
1355 if (CPUFeatures[a.GetTypeFromName(this->getCPU())].count(featureName) == 0) {
1356 Error(pos, "Target specfic LLVM intrinsic \"%s\" not supported on \"%s\" CPU.", name.data(),
1357 this->getCPU().c_str());
1358 return false;
1359 }
1360 } else if (name.consume_front("arm.") == true) {
1361 if (m_arch != Arch::arm) {
1362 Error(pos, "LLVM intrinsic \"%s\" supported only on \"arm\" target architecture.", name.data());
1363 return false;
1364 }
1365 // TODO: Check 'CPUFeatures'.
1366 } else if (name.consume_front("aarch64.") == true) {
1367 if (m_arch != Arch::aarch64) {
1368 Error(pos, "LLVM intrinsic \"%s\" supported only on \"aarch64\" target architecture.", name.data());
1369 return false;
1370 }
1371 // TODO: Check 'CPUFeatures'.
1372 } else if (name.consume_front("wasm.") == true) {
1373 // TODO: Add Condition in future if relevant.
1374 // For now, returning 'true'.
1375 return true;
1376 }
1377 return true;
1378 }
1379
SupportedCPUs()1380 std::string Target::SupportedCPUs() {
1381 AllCPUs a;
1382 return a.HumanReadableListOfNames();
1383 }
1384
GetTripleString() const1385 std::string Target::GetTripleString() const {
1386 llvm::Triple triple;
1387 switch (g->target_os) {
1388 case TargetOS::windows:
1389 if (m_arch == Arch::x86) {
1390 triple.setArchName("i686");
1391 } else if (m_arch == Arch::x86_64) {
1392 triple.setArchName("x86_64");
1393 } else if (m_arch == Arch::arm) {
1394 Error(SourcePos(), "Arm is not supported on Windows.");
1395 exit(1);
1396 } else if (m_arch == Arch::aarch64) {
1397 Error(SourcePos(), "Aarch64 is not supported on Windows.");
1398 exit(1);
1399 } else if (m_arch == Arch::genx32) {
1400 triple.setArchName("spir");
1401 } else if (m_arch == Arch::genx64) {
1402 triple.setArchName("spir64");
1403 } else {
1404 Error(SourcePos(), "Unknown arch.");
1405 exit(1);
1406 }
1407 #ifdef ISPC_GENX_ENABLED
1408 if (m_arch == Arch::genx32 || m_arch == Arch::genx64) {
1409 //"spir64-unknown-unknown"
1410 triple.setVendor(llvm::Triple::VendorType::UnknownVendor);
1411 triple.setOS(llvm::Triple::OSType::UnknownOS);
1412 return triple.str();
1413 }
1414 #endif
1415 //"x86_64-pc-windows-msvc"
1416 triple.setVendor(llvm::Triple::VendorType::PC);
1417 triple.setOS(llvm::Triple::OSType::Win32);
1418 triple.setEnvironment(llvm::Triple::EnvironmentType::MSVC);
1419 break;
1420 case TargetOS::custom_linux:
1421 case TargetOS::linux:
1422 if (m_arch == Arch::x86) {
1423 triple.setArchName("i686");
1424 } else if (m_arch == Arch::x86_64) {
1425 triple.setArchName("x86_64");
1426 } else if (m_arch == Arch::arm) {
1427 triple.setArchName("armv7");
1428 } else if (m_arch == Arch::aarch64) {
1429 triple.setArchName("aarch64");
1430 } else if (m_arch == Arch::genx32) {
1431 triple.setArchName("spir");
1432 } else if (m_arch == Arch::genx64) {
1433 triple.setArchName("spir64");
1434 } else {
1435 Error(SourcePos(), "Unknown arch.");
1436 exit(1);
1437 }
1438 #ifdef ISPC_GENX_ENABLED
1439 if (m_arch == Arch::genx32 || m_arch == Arch::genx64) {
1440 //"spir64-unknown-unknown"
1441 triple.setVendor(llvm::Triple::VendorType::UnknownVendor);
1442 triple.setOS(llvm::Triple::OSType::UnknownOS);
1443 return triple.str();
1444 }
1445 #endif
1446 triple.setVendor(llvm::Triple::VendorType::UnknownVendor);
1447 triple.setOS(llvm::Triple::OSType::Linux);
1448 if (m_arch == Arch::x86 || m_arch == Arch::x86_64 || m_arch == Arch::aarch64 || m_arch == Arch::genx32 ||
1449 m_arch == Arch::genx64) {
1450 triple.setEnvironment(llvm::Triple::EnvironmentType::GNU);
1451 } else if (m_arch == Arch::arm) {
1452 triple.setEnvironment(llvm::Triple::EnvironmentType::GNUEABIHF);
1453 } else {
1454 Error(SourcePos(), "Unknown arch.");
1455 exit(1);
1456 }
1457 break;
1458 case TargetOS::freebsd:
1459 if (m_arch == Arch::x86) {
1460 triple.setArchName("i686");
1461 } else if (m_arch == Arch::x86_64) {
1462 triple.setArchName("amd64");
1463 } else if (m_arch == Arch::arm) {
1464 triple.setArchName("armv7");
1465 } else if (m_arch == Arch::aarch64) {
1466 triple.setArchName("aarch64");
1467 } else if (m_arch == Arch::genx32) {
1468 triple.setArchName("spir");
1469 } else if (m_arch == Arch::genx64) {
1470 triple.setArchName("spir64");
1471 } else {
1472 Error(SourcePos(), "Unknown arch.");
1473 exit(1);
1474 }
1475 #ifdef ISPC_GENX_ENABLED
1476 if (m_arch == Arch::genx32 || m_arch == Arch::genx64) {
1477 //"spir64-unknown-unknown"
1478 triple.setVendor(llvm::Triple::VendorType::UnknownVendor);
1479 triple.setOS(llvm::Triple::OSType::UnknownOS);
1480 return triple.str();
1481 }
1482 #endif
1483 triple.setVendor(llvm::Triple::VendorType::UnknownVendor);
1484 triple.setOS(llvm::Triple::OSType::FreeBSD);
1485 break;
1486 case TargetOS::dragonfly:
1487 if (m_arch == Arch::x86) {
1488 triple.setArchName("i686");
1489 } else if (m_arch == Arch::x86_64) {
1490 triple.setArchName("amd64");
1491 } else if (m_arch == Arch::arm) {
1492 triple.setArchName("armv7");
1493 } else if (m_arch == Arch::aarch64) {
1494 triple.setArchName("aarch64");
1495 } else {
1496 Error(SourcePos(), "Unknown arch.");
1497 exit(1);
1498 }
1499 triple.setVendor(llvm::Triple::VendorType::UnknownVendor);
1500 triple.setOS(llvm::Triple::OSType::DragonFly);
1501 break;
1502 case TargetOS::macos:
1503 // asserts
1504 if (m_arch == Arch::x86_64) {
1505 triple.setArchName("x86_64");
1506 } else if (m_arch == Arch::aarch64) {
1507 triple.setArchName("arm64");
1508 } else {
1509 Error(SourcePos(), "macOS target supports only x86_64 and aarch64.");
1510 exit(1);
1511 }
1512 triple.setVendor(llvm::Triple::VendorType::Apple);
1513 triple.setOS(llvm::Triple::OSType::MacOSX);
1514 break;
1515 case TargetOS::android:
1516 if (m_arch == Arch::x86) {
1517 triple.setArchName("i686");
1518 } else if (m_arch == Arch::x86_64) {
1519 triple.setArchName("x86_64");
1520 } else if (m_arch == Arch::arm) {
1521 triple.setArchName("armv7");
1522 } else if (m_arch == Arch::aarch64) {
1523 triple.setArchName("aarch64");
1524 } else {
1525 Error(SourcePos(), "Unknown arch.");
1526 exit(1);
1527 }
1528 triple.setVendor(llvm::Triple::VendorType::UnknownVendor);
1529 triple.setOS(llvm::Triple::OSType::Linux);
1530 triple.setEnvironment(llvm::Triple::EnvironmentType::Android);
1531 break;
1532 case TargetOS::ios:
1533 if (m_arch != Arch::aarch64) {
1534 Error(SourcePos(), "iOS target supports only aarch64.");
1535 exit(1);
1536 }
1537 // Note, for iOS arch need to be set to "arm64", instead of "aarch64".
1538 // Internet say this is for historical reasons.
1539 // "arm64-apple-ios"
1540 triple.setArchName("arm64");
1541 triple.setVendor(llvm::Triple::VendorType::Apple);
1542 triple.setOS(llvm::Triple::OSType::IOS);
1543 break;
1544 case TargetOS::ps4:
1545 if (m_arch != Arch::x86_64) {
1546 Error(SourcePos(), "PS4 target supports only x86_64.");
1547 exit(1);
1548 }
1549 // "x86_64-scei-ps4"
1550 triple.setArch(llvm::Triple::ArchType::x86_64);
1551 triple.setVendor(llvm::Triple::VendorType::SCEI);
1552 triple.setOS(llvm::Triple::OSType::PS4);
1553 break;
1554 case TargetOS::web:
1555 if (m_arch != Arch::wasm32) {
1556 Error(SourcePos(), "Web target supports only wasm32.");
1557 exit(1);
1558 }
1559 triple.setArch(llvm::Triple::ArchType::wasm32);
1560 triple.setVendor(llvm::Triple::VendorType::UnknownVendor);
1561 triple.setOS(llvm::Triple::OSType::UnknownOS);
1562 break;
1563 case TargetOS::error:
1564 Error(SourcePos(), "Invalid target OS.");
1565 exit(1);
1566 }
1567
1568 return triple.str();
1569 }
1570
1571 // This function returns string representation of ISA for the purpose of
1572 // mangling. And may return any unique string, preferably short, like
1573 // sse4, avx and etc.
ISAToString(ISA isa)1574 const char *Target::ISAToString(ISA isa) {
1575 switch (isa) {
1576 #ifdef ISPC_ARM_ENABLED
1577 case Target::NEON:
1578 return "neon";
1579 #endif
1580 #ifdef ISPC_WASM_ENABLED
1581 case Target::WASM:
1582 return "wasm";
1583 #endif
1584 case Target::SSE2:
1585 return "sse2";
1586 case Target::SSE4:
1587 return "sse4";
1588 case Target::AVX:
1589 return "avx";
1590 case Target::AVX2:
1591 return "avx2";
1592 case Target::KNL_AVX512:
1593 return "avx512knl";
1594 case Target::SKX_AVX512:
1595 return "avx512skx";
1596 #ifdef ISPC_GENX_ENABLED
1597 case Target::GENX:
1598 return "genx";
1599 #endif
1600 default:
1601 FATAL("Unhandled target in ISAToString()");
1602 }
1603 return "";
1604 }
1605
GetISAString() const1606 const char *Target::GetISAString() const { return ISAToString(m_isa); }
1607
1608 // This function returns string representation of default target corresponding
1609 // to ISA. I.e. for SSE4 it's sse4-i32x4, for AVX2 it's avx2-i32x8. This
1610 // string may be used to initialize Target.
ISAToTargetString(ISA isa)1611 const char *Target::ISAToTargetString(ISA isa) {
1612 switch (isa) {
1613 #ifdef ISPC_ARM_ENABLED
1614 case Target::NEON:
1615 return "neon-i32x4";
1616 #endif
1617 #ifdef ISPC_WASM_ENABLED
1618 case Target::WASM:
1619 return "wasm-i32x4";
1620 #endif
1621 #ifdef ISPC_GENX_ENABLED
1622 case Target::GENX:
1623 return "genx-x16";
1624 #endif
1625 case Target::SSE2:
1626 return "sse2-i32x4";
1627 case Target::SSE4:
1628 return "sse4-i32x4";
1629 case Target::AVX:
1630 return "avx1-i32x8";
1631 case Target::AVX2:
1632 return "avx2-i32x8";
1633 case Target::KNL_AVX512:
1634 return "avx512knl-i32x16";
1635 case Target::SKX_AVX512:
1636 return "avx512skx-i32x16";
1637 default:
1638 FATAL("Unhandled target in ISAToTargetString()");
1639 }
1640 return "";
1641 }
1642
GetISATargetString() const1643 const char *Target::GetISATargetString() const { return ISAToTargetString(m_isa); }
1644
SizeOf(llvm::Type * type,llvm::BasicBlock * insertAtEnd)1645 llvm::Value *Target::SizeOf(llvm::Type *type, llvm::BasicBlock *insertAtEnd) {
1646 uint64_t byteSize = getDataLayout()->getTypeStoreSize(type);
1647 if (m_is32Bit || g->opt.force32BitAddressing)
1648 return LLVMInt32((int32_t)byteSize);
1649 else
1650 return LLVMInt64(byteSize);
1651 }
1652
StructOffset(llvm::Type * type,int element,llvm::BasicBlock * insertAtEnd)1653 llvm::Value *Target::StructOffset(llvm::Type *type, int element, llvm::BasicBlock *insertAtEnd) {
1654 llvm::StructType *structType = llvm::dyn_cast<llvm::StructType>(type);
1655 if (structType == NULL || structType->isSized() == false) {
1656 Assert(m->errorCount > 0);
1657 return NULL;
1658 }
1659
1660 const llvm::StructLayout *sl = getDataLayout()->getStructLayout(structType);
1661 Assert(sl != NULL);
1662
1663 uint64_t offset = sl->getElementOffset(element);
1664 if (m_is32Bit || g->opt.force32BitAddressing)
1665 return LLVMInt32((int32_t)offset);
1666 else
1667 return LLVMInt64(offset);
1668 }
1669
markFuncWithTargetAttr(llvm::Function * func)1670 void Target::markFuncWithTargetAttr(llvm::Function *func) {
1671 if (m_tf_attributes) {
1672 func->addAttributes(llvm::AttributeList::FunctionIndex, *m_tf_attributes);
1673 }
1674 }
1675
markFuncWithCallingConv(llvm::Function * func)1676 void Target::markFuncWithCallingConv(llvm::Function *func) {
1677 assert(g->calling_conv != CallingConv::uninitialized);
1678 if (g->calling_conv == CallingConv::x86_vectorcall) {
1679 func->setCallingConv(llvm::CallingConv::X86_VectorCall);
1680 // Add x86 vectorcall changes as a separate commit.
1681 /*
1682 // We have to jump through some hoops for x86.
1683 // In LLVM IR for x86, arguments which are to be passed in registers
1684 // have to marked with 'InReg' attribue.
1685 // Rules(Ref : https://docs.microsoft.com/en-us/cpp/cpp/vectorcall?view=vs-2019 )
1686 // Definitions:
1687 // Integer Type : it fits in the native register size of the processor for example,
1688 // 4 bytes on an x86 machine.Integer types include pointer, reference, and struct or union types of 4 bytes or
1689 less.
1690 // Vector Type : either a floating - point type for example, a float or double or an SIMD vector type for
1691 // example, __m128 or __m256.
1692 // Rules for x86: Integer Type : The first two integer type arguments found in the
1693 // parameter list from left to right are placed in ECX and EDX, respectively.
1694 // Vector Type : The first six vector type arguments in order from left to right are passed by value in SSE
1695 vector registers 0 to 5.
1696 //The seventh and subsequent vector type arguments are passed on the stack by reference to memory allocated by
1697 the caller.
1698 // Observations from Clang(Is there somewhere these rules are mentioned??)
1699 // Integer Type : After first Integer Type greater than 32 bit, other integer types NOT passed in reg.
1700 // Vector Type : After 6 Vector Type args, if 2 Integer Type registers are not yet used, VectorType args
1701 // passed by reference via register - TO DO
1702
1703 if (m_arch == Arch::x86) {
1704 llvm::Function::arg_iterator argIter = func->arg_begin();
1705 llvm::FunctionType *fType = func->getFunctionType();
1706 int numArgsVecInReg = 0;
1707 int numArgsIntInReg = 0;
1708 for (; argIter != func->arg_end(); ++argIter) {
1709 llvm::Type *argType = fType->getParamType(argIter->getArgNo());
1710 if (argType->isIntegerTy() || argType->isStructTy() || argType->isPointerTy()) {
1711 if (((argType->isIntegerTy()) || (argType->isStructTy())) &&
1712 (g->target->getDataLayout()->getTypeSizeInBits(argType) > 32)) {
1713 numArgsIntInReg = 2;
1714 continue;
1715 }
1716
1717 numArgsIntInReg++;
1718 argIter->addAttr(llvm::Attribute::InReg);
1719 continue;
1720 }
1721 if (((llvm::dyn_cast<llvm::VectorType>(argType) != NULL) || argType->isFloatTy() ||
1722 argType->isDoubleTy())) {
1723 numArgsVecInReg++;
1724 argIter->addAttr(llvm::Attribute::InReg);
1725 }
1726
1727 if ((numArgsIntInReg == 2) && (numArgsVecInReg == 6))
1728 break;
1729 }
1730 }*/
1731 }
1732 }
1733
1734 #ifdef ISPC_GENX_ENABLED
getGenxPlatform() const1735 Target::GENX_PLATFORM Target::getGenxPlatform() const {
1736 AllCPUs a;
1737 switch (a.GetTypeFromName(m_cpu)) {
1738 case CPU_GENX:
1739 return GENX_PLATFORM::GENX_GEN9;
1740 case CPU_GENX_TGLLP:
1741 return GENX_PLATFORM::GENX_TGLLP;
1742 default:
1743 return GENX_PLATFORM::GENX_GEN9;
1744 }
1745 return GENX_PLATFORM::GENX_GEN9;
1746 }
1747
getGenxGrfSize() const1748 uint32_t Target::getGenxGrfSize() const {
1749 switch (getGenxPlatform()) {
1750 case GENX_GEN9:
1751 case GENX_TGLLP:
1752 return 32;
1753 default:
1754 return 32;
1755 }
1756 return 32;
1757 }
1758
hasGenxPrefetch() const1759 bool Target::hasGenxPrefetch() const {
1760 switch (getGenxPlatform()) {
1761 case GENX_GEN9:
1762 case GENX_TGLLP:
1763 return false;
1764 default:
1765 return true;
1766 }
1767 return true;
1768 }
1769 #endif
1770
1771 ///////////////////////////////////////////////////////////////////////////
1772 // Opt
1773
Opt()1774 Opt::Opt() {
1775 level = 1;
1776 fastMath = false;
1777 fastMaskedVload = false;
1778 force32BitAddressing = true;
1779 unrollLoops = true;
1780 disableAsserts = false;
1781 disableFMA = false;
1782 forceAlignedMemory = false;
1783 disableMaskAllOnOptimizations = false;
1784 disableHandlePseudoMemoryOps = false;
1785 disableBlendedMaskedStores = false;
1786 disableCoherentControlFlow = false;
1787 disableUniformControlFlow = false;
1788 disableGatherScatterOptimizations = false;
1789 disableMaskedStoreToStore = false;
1790 disableGatherScatterFlattening = false;
1791 disableUniformMemoryOptimizations = false;
1792 disableCoalescing = false;
1793 disableZMM = false;
1794 #ifdef ISPC_GENX_ENABLED
1795 disableGenXGatherCoalescing = false;
1796 enableForeachInsideVarying = false;
1797 emitGenXHardwareMask = false;
1798 enableGenXUnsafeMaskedLoad = false;
1799 #endif
1800 }
1801
1802 ///////////////////////////////////////////////////////////////////////////
1803 // Globals
1804
Globals()1805 Globals::Globals() {
1806 target_registry = TargetLibRegistry::getTargetLibRegistry();
1807
1808 mathLib = Globals::Math_ISPC;
1809 codegenOptLevel = Globals::Aggressive;
1810
1811 includeStdlib = true;
1812 runCPP = true;
1813 debugPrint = false;
1814 dumpFile = false;
1815 printTarget = false;
1816 NoOmitFramePointer = false;
1817 debugIR = -1;
1818 disableWarnings = false;
1819 warningsAsErrors = false;
1820 quiet = false;
1821 forceColoredOutput = false;
1822 disableLineWrap = false;
1823 emitPerfWarnings = true;
1824 emitInstrumentation = false;
1825 noPragmaOnce = false;
1826 generateDebuggingSymbols = false;
1827 generateDWARFVersion = 3;
1828 enableFuzzTest = false;
1829 enableLLVMIntrinsics = false;
1830 fuzzTestSeed = -1;
1831 mangleFunctionsWithTarget = false;
1832 isMultiTargetCompilation = false;
1833 errorLimit = -1;
1834
1835 enableTimeTrace = false;
1836 // set default granularity to 500.
1837 timeTraceGranularity = 500;
1838 target = NULL;
1839 ctx = new llvm::LLVMContext;
1840
1841 #ifdef ISPC_HOST_IS_WINDOWS
1842 _getcwd(currentDirectory, sizeof(currentDirectory));
1843 #else
1844 if (getcwd(currentDirectory, sizeof(currentDirectory)) == NULL)
1845 FATAL("Current directory path is too long!");
1846 #endif
1847 forceAlignment = -1;
1848 dllExport = false;
1849
1850 // Target OS defaults to host OS.
1851 target_os = GetHostOS();
1852
1853 // Set calling convention to 'uninitialized'.
1854 // This needs to be set once target OS is decided.
1855 calling_conv = CallingConv::uninitialized;
1856 }
1857
1858 ///////////////////////////////////////////////////////////////////////////
1859 // SourcePos
1860
SourcePos(const char * n,int fl,int fc,int ll,int lc)1861 SourcePos::SourcePos(const char *n, int fl, int fc, int ll, int lc) {
1862 name = n;
1863 if (name == NULL) {
1864 if (m != NULL)
1865 name = m->module->getModuleIdentifier().c_str();
1866 else
1867 name = "(unknown)";
1868 }
1869 first_line = fl;
1870 first_column = fc;
1871 last_line = ll != 0 ? ll : fl;
1872 last_column = lc != 0 ? lc : fc;
1873 }
1874
1875 llvm::DIFile *
1876 // llvm::MDFile*
GetDIFile() const1877 SourcePos::GetDIFile() const {
1878 std::string directory, filename;
1879 GetDirectoryAndFileName(g->currentDirectory, name, &directory, &filename);
1880 llvm::DIFile *ret = m->diBuilder->createFile(filename, directory);
1881 return ret;
1882 }
1883
GetDINamespace() const1884 llvm::DINamespace *SourcePos::GetDINamespace() const {
1885 llvm::DIScope *discope = GetDIFile();
1886 llvm::DINamespace *ret = m->diBuilder->createNameSpace(discope, "ispc", true);
1887 return ret;
1888 }
1889
Print() const1890 void SourcePos::Print() const {
1891 printf(" @ [%s:%d.%d - %d.%d] ", name, first_line, first_column, last_line, last_column);
1892 }
1893
operator ==(const SourcePos & p2) const1894 bool SourcePos::operator==(const SourcePos &p2) const {
1895 return (!strcmp(name, p2.name) && first_line == p2.first_line && first_column == p2.first_column &&
1896 last_line == p2.last_line && last_column == p2.last_column);
1897 }
1898
/** Combine two source positions into one that covers both.

    If the positions name different files, p1 is returned unchanged.
    Otherwise the result takes the smaller first line/column and the
    larger last line/column; lines and columns are combined
    independently of each other.
 */
SourcePos ispc::Union(const SourcePos &p1, const SourcePos &p2) {
    const bool sameFile = (strcmp(p1.name, p2.name) == 0);
    if (!sameFile)
        return p1;

    SourcePos merged;
    merged.name = p1.name;
    merged.first_line = std::min(p1.first_line, p2.first_line);
    merged.first_column = std::min(p1.first_column, p2.first_column);
    merged.last_line = std::max(p1.last_line, p2.last_line);
    merged.last_column = std::max(p1.last_column, p2.last_column);
    return merged;
}
1911