1 /*
2   Copyright (c) 2010-2021, Intel Corporation
3   All rights reserved.
4 
5   Redistribution and use in source and binary forms, with or without
6   modification, are permitted provided that the following conditions are
7   met:
8 
9     * Redistributions of source code must retain the above copyright
10       notice, this list of conditions and the following disclaimer.
11 
12     * Redistributions in binary form must reproduce the above copyright
13       notice, this list of conditions and the following disclaimer in the
14       documentation and/or other materials provided with the distribution.
15 
16     * Neither the name of Intel Corporation nor the names of its
17       contributors may be used to endorse or promote products derived from
18       this software without specific prior written permission.
19 
20 
21    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
22    IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23    TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
24    PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
25    OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
26    EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
27    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
28    PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
29    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
30    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 */
33 
34 /** @file ispc.cpp
35     @brief ispc global definitions
36 */
37 
38 #include "ispc.h"
39 #include "llvmutil.h"
40 #include "module.h"
41 #include "util.h"
42 
43 #include <sstream>
44 #include <stdarg.h> /* va_list, va_start, va_arg, va_end */
45 #include <stdio.h>
46 #ifdef ISPC_HOST_IS_WINDOWS
47 #include <direct.h>
48 #include <windows.h>
49 #define strcasecmp stricmp
50 #include <intrin.h>
51 #else // !ISPC_HOST_IS_WINDOWS
52 #include <sys/types.h>
53 #include <unistd.h>
54 #endif // ISPC_HOST_IS_WINDOWS
55 
56 #include <llvm/BinaryFormat/Dwarf.h>
57 #include <llvm/CodeGen/TargetLowering.h>
58 #include <llvm/CodeGen/TargetSubtargetInfo.h>
59 #include <llvm/IR/Attributes.h>
60 #include <llvm/IR/DIBuilder.h>
61 #include <llvm/IR/DataLayout.h>
62 #include <llvm/IR/DebugInfo.h>
63 #include <llvm/IR/Instructions.h>
64 #include <llvm/IR/LLVMContext.h>
65 #include <llvm/IR/Module.h>
66 #include <llvm/Support/CodeGen.h>
67 #include <llvm/Support/Host.h>
68 #include <llvm/Support/TargetRegistry.h>
69 #include <llvm/Support/TargetSelect.h>
70 #include <llvm/Target/TargetMachine.h>
71 #include <llvm/Target/TargetOptions.h>
72 
73 using namespace ispc;
74 
75 Globals *ispc::g;
76 Module *ispc::m;
77 
78 ///////////////////////////////////////////////////////////////////////////
79 // Target
80 
81 #if defined(__arm__) || defined(__aarch64__)
82 #define ARM_HOST
83 #endif
84 
85 #if !defined(ISPC_HOST_IS_WINDOWS) && !defined(ARM_HOST)
86 // __cpuid() and __cpuidex() are defined on Windows in <intrin.h> for x86/x64.
87 // On *nix they need to be defined manually through inline assembler.
// Runs the CPUID instruction with EAX set to infoType and writes the
// resulting EAX, EBX, ECX and EDX values to info[0], info[1], info[2] and
// info[3] respectively.
static void __cpuid(int info[4], int infoType) {
    int regA, regB, regC, regD;
    __asm__ __volatile__("cpuid" : "=a"(regA), "=b"(regB), "=c"(regC), "=d"(regD) : "0"(infoType));
    info[0] = regA;
    info[1] = regB;
    info[2] = regC;
    info[3] = regD;
}
91 
// Runs the CPUID instruction with EAX set to level and ECX set to count
// (the sub-leaf), and writes the resulting EAX, EBX, ECX and EDX values to
// info[0], info[1], info[2] and info[3] respectively.
//
// EBX is requested directly through the "b" constraint, the same way
// __cpuid() does. An earlier version swapped %ebx with a scratch register
// around the instruction using a 32-bit xchg; on x86-64 a 32-bit register
// write zero-extends into the full 64-bit register, so that sequence wiped
// the upper half of %rbx (a callee-saved register) without the compiler
// knowing about it.
static void __cpuidex(int info[4], int level, int count) {
    __asm__ __volatile__("cpuid"
                         : "=a"(info[0]), "=b"(info[1]), "=c"(info[2]), "=d"(info[3])
                         : "0"(level), "2"(count));
}
100 #endif // !ISPC_HOST_IS_WINDOWS && !__ARM__ && !__AARCH64__
101 
102 #ifndef ARM_HOST
// Returns true when both bit 1 and bit 2 of the extended-feature enable mask
// read through xgetbv (index 0) are set, i.e. the OS will save the YMM
// registers. Callers are expected to have checked the CPUID OSXSAVE flag
// before calling this, since xgetbv is executed unconditionally here.
static bool __os_has_avx_support() {
    const int requiredState = 6;
#if defined(ISPC_HOST_IS_WINDOWS)
    // The compiler intrinsic is available on Windows hosts.
    const unsigned long long enabledState = _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
    return (enabledState & requiredState) == requiredState;
#else  // !defined(ISPC_HOST_IS_WINDOWS)
    // xgetbv is emitted as raw bytes rather than by mnemonic: older
    // assemblers do not know the instruction, and there is no easy way to
    // conditionally compile based on the assembler in use.
    int xcrLow, xcrHigh;
    __asm__ __volatile__(".byte 0x0f, 0x01, 0xd0" : "=a"(xcrLow), "=d"(xcrHigh) : "c"(0));
    return (xcrLow & requiredState) == requiredState;
#endif // !defined(ISPC_HOST_IS_WINDOWS)
}
117 
// Returns true when the OS saves the XMM, YMM and ZMM register state, which
// is what the AVX2 and AVX512 targets need. On Windows and generic *nix this
// means every bit of 0xE6 is set in the extended-feature enable mask read
// through xgetbv (index 0); on macOS the check is bypassed (see below).
// Callers are expected to have checked the CPUID OSXSAVE flag first.
static bool __os_has_avx512_support() {
#if defined(ISPC_HOST_IS_WINDOWS)
    // See section 2.1 of software.intel.com/sites/default/files/managed/0d/53/319433-022.pdf
    const int requiredState = 0xE6;
    const unsigned long long enabledState = _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
    return (enabledState & requiredState) == requiredState;
#elif defined(ISPC_HOST_IS_APPLE)
    // macOS treats AVX512 differently from Windows and Linux:
    // - a newly created thread starts with AVX512 disabled, so the CPUID flags
    //   advertise AVX512 while the OS support check (XCR0) does not succeed;
    // - AVX512 gets enabled either via thread_set_state() or by executing any
    //   AVX512 instruction, whose #UD exception is handled by the OS.
    // This function only needs to tell whether AVX512 is potentially
    // available, so the OS check is skipped and the decision is left to the
    // CPUID flags alone. See ispc issue #1854 for more details.
    return true;
#else  // !defined(ISPC_HOST_IS_WINDOWS)
    // xgetbv is emitted as raw bytes rather than by mnemonic: older
    // assemblers do not know the instruction, and there is no easy way to
    // conditionally compile based on the assembler in use.
    const int requiredState = 0xE6;
    int xcrLow, xcrHigh;
    __asm__ __volatile__(".byte 0x0f, 0x01, 0xd0" : "=a"(xcrLow), "=d"(xcrHigh) : "c"(0));
    return (xcrLow & requiredState) == requiredState;
#endif // !defined(ISPC_HOST_IS_WINDOWS)
}
143 #endif // !ARM_HOST
144 
lGetSystemISA()145 static ISPCTarget lGetSystemISA() {
146 #ifdef ARM_HOST
147     return ISPCTarget::neon_i32x4;
148 #else
149     int info[4];
150     __cpuid(info, 1);
151 
152     int info2[4];
153     // Call cpuid with eax=7, ecx=0
154     __cpuidex(info2, 7, 0);
155 
156     if ((info[2] & (1 << 27)) != 0 &&  // OSXSAVE
157         (info2[1] & (1 << 5)) != 0 &&  // AVX2
158         (info2[1] & (1 << 16)) != 0 && // AVX512 F
159         __os_has_avx512_support()) {
160         // We need to verify that AVX2 is also available,
161         // as well as AVX512, because our targets are supposed
162         // to use both.
163 
164         if ((info2[1] & (1 << 17)) != 0 && // AVX512 DQ
165             (info2[1] & (1 << 28)) != 0 && // AVX512 CDI
166             (info2[1] & (1 << 30)) != 0 && // AVX512 BW
167             (info2[1] & (1 << 31)) != 0) { // AVX512 VL
168             return ISPCTarget::avx512skx_i32x16;
169         } else if ((info2[1] & (1 << 26)) != 0 && // AVX512 PF
170                    (info2[1] & (1 << 27)) != 0 && // AVX512 ER
171                    (info2[1] & (1 << 28)) != 0) { // AVX512 CDI
172             return ISPCTarget::avx512knl_i32x16;
173         }
174         // If it's unknown AVX512 target, fall through and use AVX2
175         // or whatever is available in the machine.
176     }
177 
178     if ((info[2] & (1 << 27)) != 0 &&                           // OSXSAVE
179         (info[2] & (1 << 28)) != 0 && __os_has_avx_support()) { // AVX
180         // AVX1 for sure....
181         // Ivy Bridge?
182         if ((info[2] & (1 << 29)) != 0 && // F16C
183             (info[2] & (1 << 30)) != 0 && // RDRAND
184             (info2[1] & (1 << 5)) != 0) { // AVX2.
185             return ISPCTarget::avx2_i32x8;
186         }
187         // Regular AVX
188         return ISPCTarget::avx1_i32x8;
189     } else if ((info[2] & (1 << 19)) != 0)
190         return ISPCTarget::sse4_i32x4;
191     else if ((info[3] & (1 << 26)) != 0)
192         return ISPCTarget::sse2_i32x4;
193     else {
194         Error(SourcePos(), "Unable to detect supported SSE/AVX ISA.  Exiting.");
195         exit(1);
196     }
197 #endif
198 }
199 
lIsTargetValidforArch(ISPCTarget target,Arch arch)200 static const bool lIsTargetValidforArch(ISPCTarget target, Arch arch) {
201     bool ret = true;
202     // If target name starts with sse or avx, has to be x86 or x86-64.
203     if (ISPCTargetIsX86(target)) {
204         if (arch != Arch::x86_64 && arch != Arch::x86)
205             ret = false;
206     } else if (target == ISPCTarget::neon_i8x16 || target == ISPCTarget::neon_i16x8) {
207         if (arch != Arch::arm)
208             ret = false;
209     } else if (target == ISPCTarget::neon_i32x4 || target == ISPCTarget::neon_i32x8) {
210         if (arch != Arch::arm && arch != Arch::aarch64)
211             ret = false;
212     } else if (ISPCTargetIsGen(target)) {
213         if (arch != Arch::genx32 && arch != Arch::genx64)
214             ret = false;
215     }
216 
217     return ret;
218 }
// Identifiers for every CPU model ispc knows about. The enumerator values are
// used as indices into the per-CPU tables of AllCPUs, so the order here and
// the conditional-compilation guards must stay in sync with those tables.
typedef enum {
    // Special value, indicates that no CPU is present.
    CPU_None = 0,

    // A generic 64-bit specific x86 processor model which tries to be good
    // for modern chips without enabling instruction set encodings past the
    // basic SSE2 and 64-bit ones
    CPU_x86_64 = 1,

    // Early Atom CPU. Supports SSSE3.
    CPU_Bonnell,

    // Generic Core2-like. Supports SSSE3. Isn't quite compatible with Bonnell,
    // but for ISPC the difference is negligible; ISPC doesn't make use of it.
    CPU_Core2,

    // Core2 Solo/Duo/Quad/Extreme. Supports SSE 4.1 (but not 4.2).
    CPU_Penryn,

    // Late Core2-like. Supports SSE 4.2 + POPCNT/LZCNT.
    CPU_Nehalem,

    // CPU in PS4/Xbox One.
    CPU_PS4,

    // Sandy Bridge. Supports AVX 1.
    CPU_SandyBridge,

    // Ivy Bridge. Supports AVX 1 + RDRAND.
    CPU_IvyBridge,

    // Haswell. Supports AVX 2.
    CPU_Haswell,

    // Broadwell. Supports AVX 2 + ADX/RDSEED/SMAP.
    CPU_Broadwell,

    // Knights Landing - Xeon Phi.
    // Supports AVX-512F: All the key AVX-512 features: masking, broadcast... ;
    //          AVX-512CDI: Conflict Detection;
    //          AVX-512ERI & PFI: 28-bit precision RCP, RSQRT and EXP transcendentals,
    //                            new prefetch instructions.
    CPU_KNL,
    // Skylake Xeon.
    // Supports AVX-512F: All the key AVX-512 features: masking, broadcast... ;
    //          AVX-512CDI: Conflict Detection;
    //          AVX-512VL: Vector Length Orthogonality;
    //          AVX-512DQ: New HPC ISA (vs AVX512F);
    //          AVX-512BW: Byte and Word Support.
    CPU_SKX,

    // Icelake client
    CPU_ICL,

    // Late Atom-like design. Supports SSE 4.2 + POPCNT/LZCNT.
    CPU_Silvermont,

    // Icelake server ("icelake-server" / "icx").
    CPU_ICX,
    // Tiger Lake ("tigerlake" / "tgl").
    CPU_TGL,
#if ISPC_LLVM_VERSION >= ISPC_LLVM_12_0
    // Alder Lake ("alderlake" / "adl"). Only available with LLVM 12.0 or newer.
    CPU_ADL,
    // Sapphire Rapids ("sapphirerapids" / "spr"). Only available with LLVM 12.0 or newer.
    CPU_SPR,
#endif

// FIXME: LLVM supports a ton of different ARM CPU variants--not just
// cortex-a9 and a15.  We should be able to handle any of them that also
// have NEON support.
#ifdef ISPC_ARM_ENABLED
    // ARM Cortex A9. Supports NEON VFPv3.
    CPU_CortexA9,

    // ARM Cortex A15. Supports NEON VFPv4.
    CPU_CortexA15,

    // ARM Cortex A35, A53, A57.
    CPU_CortexA35,
    CPU_CortexA53,
    CPU_CortexA57,

    // Apple CPUs.
    CPU_AppleA7,
    CPU_AppleA10,
    CPU_AppleA11,
    CPU_AppleA12,
    CPU_AppleA13,
#if ISPC_LLVM_VERSION >= ISPC_LLVM_12_0
    // Only available with LLVM 12.0 or newer.
    CPU_AppleA14,
#endif
#endif
#ifdef ISPC_GENX_ENABLED
    // Gen targets: CPU_GENX is named "SKL"; CPU_GENX_TGLLP is named "TGLLP"
    // (synonym "DG1").
    CPU_GENX,
    CPU_GENX_TGLLP,
#endif
    // Not a CPU: the number of enumerators above, used to size lookup tables.
    sizeofCPUtype
} CPUtype;
314 
// Maps each supported CPU to the set of feature names it provides. The map
// is used to verify which features are available for a CPU, in order to
// filter target-dependent intrinsics and report an error.
// This mechanism is not precise and doesn't take into account flavors
// of AVX512, for example (every AVX512-capable CPU just lists "avx512").
// ARM and Gen CPUs currently have empty feature sets.
// The following LLVM files were used as reference:
// CPU Features: <llvm>/lib/Support/X86TargetParser.cpp
// X86 Intrinsics: <llvm>/include/llvm/IR/IntrinsicsX86.td
std::map<CPUtype, std::set<std::string>> CPUFeatures = {
    {CPU_x86_64, {"mmx", "sse", "sse2"}},
    {CPU_Bonnell, {"mmx", "sse", "sse2", "ssse3"}},
    {CPU_Core2, {"mmx", "sse", "sse2", "ssse3"}},
    {CPU_Penryn, {"mmx", "sse", "sse2", "ssse3", "sse41"}},
    {CPU_Nehalem, {"mmx", "sse", "sse2", "ssse3", "sse41", "sse42"}},
    {CPU_PS4, {"mmx", "sse", "sse2", "ssse3", "sse41", "sse42", "avx"}},
    {CPU_SandyBridge, {"mmx", "sse", "sse2", "ssse3", "sse41", "sse42", "avx"}},
    {CPU_IvyBridge, {"mmx", "sse", "sse2", "ssse3", "sse41", "sse42", "avx"}},
    {CPU_Haswell, {"mmx", "sse", "sse2", "ssse3", "sse41", "sse42", "avx", "avx2"}},
    {CPU_Broadwell, {"mmx", "sse", "sse2", "ssse3", "sse41", "sse42", "avx", "avx2"}},
    {CPU_KNL, {"mmx", "sse", "sse2", "ssse3", "sse41", "sse42", "avx", "avx2", "avx512"}},
    {CPU_SKX, {"mmx", "sse", "sse2", "ssse3", "sse41", "sse42", "avx", "avx2", "avx512"}},
    {CPU_ICL, {"mmx", "sse", "sse2", "ssse3", "sse41", "sse42", "avx", "avx2", "avx512"}},
    {CPU_Silvermont, {"mmx", "sse", "sse2", "ssse3", "sse41", "sse42"}},
    {CPU_ICX, {"mmx", "sse", "sse2", "ssse3", "sse41", "sse42", "avx", "avx2", "avx512"}},
    {CPU_TGL, {"mmx", "sse", "sse2", "ssse3", "sse41", "sse42", "avx", "avx2", "avx512"}},
#if ISPC_LLVM_VERSION >= ISPC_LLVM_12_0
    // Note: ADL is listed without "avx512", unlike SPR.
    {CPU_ADL, {"mmx", "sse", "sse2", "ssse3", "sse41", "sse42", "avx", "avx2"}},
    {CPU_SPR, {"mmx", "sse", "sse2", "ssse3", "sse41", "sse42", "avx", "avx2", "avx512"}},
#endif
// TODO: Add features for remaining CPUs if valid.
#ifdef ISPC_ARM_ENABLED
    {CPU_CortexA9, {}},
    {CPU_CortexA15, {}},
    {CPU_CortexA35, {}},
    {CPU_CortexA53, {}},
    {CPU_CortexA57, {}},
    {CPU_AppleA7, {}},
    {CPU_AppleA10, {}},
    {CPU_AppleA11, {}},
    {CPU_AppleA12, {}},
    {CPU_AppleA13, {}},
#if ISPC_LLVM_VERSION >= ISPC_LLVM_12_0
    {CPU_AppleA14, {}},
#endif
#endif
#ifdef ISPC_GENX_ENABLED
    {CPU_GENX, {}},
    {CPU_GENX_TGLLP, {}}
#endif
};
364 
365 class AllCPUs {
366   private:
367     std::vector<std::vector<std::string>> names;
368     std::vector<std::set<CPUtype>> compat;
369 
Set(int type,...)370     std::set<CPUtype> Set(int type, ...) {
371         std::set<CPUtype> retn;
372         va_list args;
373 
374         retn.insert((CPUtype)type);
375         va_start(args, type);
376         while ((type = va_arg(args, int)) != CPU_None)
377             retn.insert((CPUtype)type);
378         va_end(args);
379 
380         return retn;
381     }
382 
383   public:
AllCPUs()384     AllCPUs() {
385         names = std::vector<std::vector<std::string>>(sizeofCPUtype);
386         compat = std::vector<std::set<CPUtype>>(sizeofCPUtype);
387 
388         names[CPU_None].push_back("");
389 
390         names[CPU_x86_64].push_back("x86-64");
391 
392         names[CPU_Bonnell].push_back("atom");
393         names[CPU_Bonnell].push_back("bonnell");
394 
395         names[CPU_Core2].push_back("core2");
396 
397         names[CPU_Penryn].push_back("penryn");
398 
399         names[CPU_Silvermont].push_back("slm");
400         names[CPU_Silvermont].push_back("silvermont");
401 
402         names[CPU_Nehalem].push_back("corei7");
403         names[CPU_Nehalem].push_back("nehalem");
404 
405         names[CPU_PS4].push_back("btver2");
406         names[CPU_PS4].push_back("ps4");
407 
408         names[CPU_SandyBridge].push_back("corei7-avx");
409         names[CPU_SandyBridge].push_back("sandybridge");
410 
411         names[CPU_IvyBridge].push_back("core-avx-i");
412         names[CPU_IvyBridge].push_back("ivybridge");
413 
414         names[CPU_Haswell].push_back("core-avx2");
415         names[CPU_Haswell].push_back("haswell");
416 
417         names[CPU_Broadwell].push_back("broadwell");
418 
419         names[CPU_KNL].push_back("knl");
420 
421         names[CPU_SKX].push_back("skx");
422 
423         names[CPU_ICL].push_back("icelake-client");
424         names[CPU_ICL].push_back("icl");
425 
426         names[CPU_ICX].push_back("icelake-server");
427         names[CPU_ICX].push_back("icx");
428         names[CPU_TGL].push_back("tigerlake");
429         names[CPU_TGL].push_back("tgl");
430 #if ISPC_LLVM_VERSION >= ISPC_LLVM_12_0
431         names[CPU_ADL].push_back("alderlake");
432         names[CPU_ADL].push_back("adl");
433         names[CPU_SPR].push_back("sapphirerapids");
434         names[CPU_SPR].push_back("spr");
435 #endif
436 
437 #ifdef ISPC_ARM_ENABLED
438         names[CPU_CortexA9].push_back("cortex-a9");
439         names[CPU_CortexA15].push_back("cortex-a15");
440         names[CPU_CortexA35].push_back("cortex-a35");
441         names[CPU_CortexA53].push_back("cortex-a53");
442         names[CPU_CortexA57].push_back("cortex-a57");
443 
444         names[CPU_AppleA7].push_back("apple-a7");
445         names[CPU_AppleA10].push_back("apple-a10");
446         names[CPU_AppleA11].push_back("apple-a11");
447         names[CPU_AppleA12].push_back("apple-a12");
448         names[CPU_AppleA13].push_back("apple-a13");
449 #if ISPC_LLVM_VERSION >= ISPC_LLVM_12_0
450         names[CPU_AppleA14].push_back("apple-a14");
451 #endif
452 #endif
453 
454 #ifdef ISPC_GENX_ENABLED
455         names[CPU_GENX].push_back("SKL");
456         names[CPU_GENX_TGLLP].push_back("TGLLP");
457         names[CPU_GENX_TGLLP].push_back("DG1");
458 #endif
459 
460         Assert(names.size() == sizeofCPUtype);
461 
462         compat[CPU_Silvermont] =
463             Set(CPU_x86_64, CPU_Bonnell, CPU_Penryn, CPU_Core2, CPU_Nehalem, CPU_Silvermont, CPU_None);
464 
465         compat[CPU_KNL] = Set(CPU_KNL, CPU_x86_64, CPU_Bonnell, CPU_Penryn, CPU_Core2, CPU_Nehalem, CPU_Silvermont,
466                               CPU_SandyBridge, CPU_IvyBridge, CPU_Haswell, CPU_Broadwell, CPU_None);
467 
468         compat[CPU_SKX] = Set(CPU_SKX, CPU_x86_64, CPU_Bonnell, CPU_Penryn, CPU_Core2, CPU_Nehalem, CPU_Silvermont,
469                               CPU_SandyBridge, CPU_IvyBridge, CPU_Haswell, CPU_Broadwell, CPU_None);
470 #if ISPC_LLVM_VERSION >= ISPC_LLVM_12_0
471         compat[CPU_SPR] =
472             Set(CPU_SPR, CPU_x86_64, CPU_Bonnell, CPU_Penryn, CPU_Core2, CPU_Nehalem, CPU_Silvermont, CPU_SandyBridge,
473                 CPU_IvyBridge, CPU_Haswell, CPU_Broadwell, CPU_SKX, CPU_ICL, CPU_ICX, CPU_TGL, CPU_ADL, CPU_None);
474         compat[CPU_ADL] = Set(CPU_ADL, CPU_x86_64, CPU_Bonnell, CPU_Penryn, CPU_Core2, CPU_Nehalem, CPU_Silvermont,
475                               CPU_SandyBridge, CPU_IvyBridge, CPU_Haswell, CPU_Broadwell, CPU_None);
476 #endif
477         compat[CPU_TGL] =
478             Set(CPU_TGL, CPU_x86_64, CPU_Bonnell, CPU_Penryn, CPU_Core2, CPU_Nehalem, CPU_Silvermont, CPU_SandyBridge,
479                 CPU_IvyBridge, CPU_Haswell, CPU_Broadwell, CPU_SKX, CPU_ICL, CPU_ICX, CPU_None);
480         compat[CPU_ICX] = Set(CPU_ICX, CPU_x86_64, CPU_Bonnell, CPU_Penryn, CPU_Core2, CPU_Nehalem, CPU_Silvermont,
481                               CPU_SandyBridge, CPU_IvyBridge, CPU_Haswell, CPU_Broadwell, CPU_SKX, CPU_ICL, CPU_None);
482 
483         compat[CPU_ICL] = Set(CPU_ICL, CPU_x86_64, CPU_Bonnell, CPU_Penryn, CPU_Core2, CPU_Nehalem, CPU_Silvermont,
484                               CPU_SandyBridge, CPU_IvyBridge, CPU_Haswell, CPU_Broadwell, CPU_SKX, CPU_None);
485 
486         compat[CPU_Broadwell] = Set(CPU_x86_64, CPU_Bonnell, CPU_Penryn, CPU_Core2, CPU_Nehalem, CPU_Silvermont,
487                                     CPU_SandyBridge, CPU_IvyBridge, CPU_Haswell, CPU_Broadwell, CPU_None);
488         compat[CPU_Haswell] = Set(CPU_x86_64, CPU_Bonnell, CPU_Penryn, CPU_Core2, CPU_Nehalem, CPU_Silvermont,
489                                   CPU_SandyBridge, CPU_IvyBridge, CPU_Haswell, CPU_Broadwell, CPU_None);
490         compat[CPU_IvyBridge] = Set(CPU_x86_64, CPU_Bonnell, CPU_Penryn, CPU_Core2, CPU_Nehalem, CPU_Silvermont,
491                                     CPU_SandyBridge, CPU_IvyBridge, CPU_None);
492         compat[CPU_SandyBridge] =
493             Set(CPU_x86_64, CPU_Bonnell, CPU_Penryn, CPU_Core2, CPU_Nehalem, CPU_Silvermont, CPU_SandyBridge, CPU_None);
494         compat[CPU_PS4] = Set(CPU_x86_64, CPU_Bonnell, CPU_Penryn, CPU_Core2, CPU_Nehalem, CPU_Silvermont,
495                               CPU_SandyBridge, CPU_PS4, CPU_None);
496         compat[CPU_Nehalem] =
497             Set(CPU_x86_64, CPU_Bonnell, CPU_Penryn, CPU_Core2, CPU_Nehalem, CPU_Silvermont, CPU_None);
498         compat[CPU_Penryn] = Set(CPU_x86_64, CPU_Bonnell, CPU_Penryn, CPU_Core2, CPU_Nehalem, CPU_Silvermont, CPU_None);
499         compat[CPU_Core2] = Set(CPU_x86_64, CPU_Bonnell, CPU_Core2, CPU_None);
500         compat[CPU_Bonnell] = Set(CPU_x86_64, CPU_Bonnell, CPU_Core2, CPU_None);
501 
502         compat[CPU_x86_64] = Set(CPU_x86_64, CPU_None);
503 
504 #ifdef ISPC_ARM_ENABLED
505         compat[CPU_CortexA15] = Set(CPU_CortexA9, CPU_CortexA15, CPU_None);
506         compat[CPU_CortexA9] = Set(CPU_CortexA9, CPU_None);
507         compat[CPU_CortexA35] = Set(CPU_CortexA35, CPU_None);
508         compat[CPU_CortexA53] = Set(CPU_CortexA53, CPU_None);
509         compat[CPU_CortexA57] = Set(CPU_CortexA57, CPU_None);
510         compat[CPU_AppleA7] = Set(CPU_AppleA7, CPU_None);
511         compat[CPU_AppleA10] = Set(CPU_AppleA10, CPU_None);
512         compat[CPU_AppleA11] = Set(CPU_AppleA11, CPU_None);
513         compat[CPU_AppleA12] = Set(CPU_AppleA12, CPU_None);
514         compat[CPU_AppleA13] = Set(CPU_AppleA13, CPU_None);
515 #if ISPC_LLVM_VERSION >= ISPC_LLVM_12_0
516         compat[CPU_AppleA14] = Set(CPU_AppleA14, CPU_None);
517 #endif
518 #endif
519 
520 #ifdef ISPC_GENX_ENABLED
521         compat[CPU_GENX] = Set(CPU_GENX, CPU_None);
522         compat[CPU_GENX_TGLLP] = Set(CPU_GENX_TGLLP, CPU_GENX, CPU_None);
523 #endif
524     }
525 
HumanReadableListOfNames()526     std::string HumanReadableListOfNames() {
527         std::stringstream CPUs;
528         for (int i = CPU_x86_64; i < sizeofCPUtype; i++) {
529             CPUs << names[i][0];
530             if (names[i].size() > 1) {
531                 CPUs << " (synonyms: " << names[i][1];
532                 for (int j = 2, je = names[i].size(); j < je; j++)
533                     CPUs << ", " << names[i][j];
534                 CPUs << ")";
535             }
536             if (i < sizeofCPUtype - 1)
537                 CPUs << ", ";
538         }
539         return CPUs.str();
540     }
541 
GetDefaultNameFromType(CPUtype type)542     std::string &GetDefaultNameFromType(CPUtype type) {
543         Assert((type >= CPU_None) && (type < sizeofCPUtype));
544         return names[type][0];
545     }
546 
GetTypeFromName(std::string name)547     CPUtype GetTypeFromName(std::string name) {
548         CPUtype retn = CPU_None;
549 
550         for (int i = 1; (retn == CPU_None) && (i < sizeofCPUtype); i++)
551             for (int j = 0, je = names[i].size(); (retn == CPU_None) && (j < je); j++)
552                 if (!name.compare(names[i][j]))
553                     retn = (CPUtype)i;
554         return retn;
555     }
556 
BackwardCompatible(CPUtype what,CPUtype with)557     bool BackwardCompatible(CPUtype what, CPUtype with) {
558         Assert((what > CPU_None) && (what < sizeofCPUtype));
559         Assert((with > CPU_None) && (with < sizeofCPUtype));
560         return compat[what].find(with) != compat[what].end();
561     }
562 };
563 
Target(Arch arch,const char * cpu,ISPCTarget ispc_target,bool pic,bool printTarget)564 Target::Target(Arch arch, const char *cpu, ISPCTarget ispc_target, bool pic, bool printTarget)
565     : m_target(NULL), m_targetMachine(NULL), m_dataLayout(NULL), m_valid(false), m_ispc_target(ispc_target),
566       m_isa(SSE2), m_arch(Arch::none), m_is32Bit(true), m_cpu(""), m_attributes(""), m_tf_attributes(NULL),
567       m_nativeVectorWidth(-1), m_nativeVectorAlignment(-1), m_dataTypeWidth(-1), m_vectorWidth(-1), m_generatePIC(pic),
568       m_maskingIsFree(false), m_maskBitCount(-1), m_hasHalf(false), m_hasRand(false), m_hasGather(false),
569       m_hasScatter(false), m_hasTranscendentals(false), m_hasTrigonometry(false), m_hasRsqrtd(false), m_hasRcpd(false),
570       m_hasVecPrefetch(false), m_hasSaturatingArithmetic(false), m_hasFp64Support(true),
571       m_warnFtoU32IsExpensive(false) {
572     CPUtype CPUID = CPU_None, CPUfromISA = CPU_None;
573     AllCPUs a;
574     std::string featuresString;
575 
576     if (cpu) {
577         CPUID = a.GetTypeFromName(cpu);
578         if (CPUID == CPU_None) {
579             Error(SourcePos(),
580                   "Error: CPU type \"%s\" unknown. Supported"
581                   " CPUs: %s.",
582                   cpu, a.HumanReadableListOfNames().c_str());
583             return;
584         }
585     }
586 
587     if (m_ispc_target == ISPCTarget::none) {
588         // If a CPU was specified explicitly, try to pick the best
589         // possible ISA based on that.
590         switch (CPUID) {
591         case CPU_None: {
592             // No CPU and no ISA, so use system info to figure out
593             // what this CPU supports.
594             m_ispc_target = lGetSystemISA();
595             std::string target_string = ISPCTargetToString(m_ispc_target);
596             Warning(SourcePos(),
597                     "No --target specified on command-line."
598                     " Using default system target \"%s\".",
599                     target_string.c_str());
600             break;
601         }
602 
603 #ifdef ISPC_ARM_ENABLED
604         case CPU_CortexA9:
605         case CPU_CortexA15:
606         case CPU_CortexA35:
607         case CPU_CortexA53:
608         case CPU_CortexA57:
609         case CPU_AppleA7:
610         case CPU_AppleA10:
611         case CPU_AppleA11:
612         case CPU_AppleA12:
613         case CPU_AppleA13:
614 #if ISPC_LLVM_VERSION >= ISPC_LLVM_12_0
615         case CPU_AppleA14:
616 #endif
617             m_ispc_target = ISPCTarget::neon_i32x4;
618             break;
619 #endif
620 
621 #ifdef ISPC_GENX_ENABLED
622         case CPU_GENX:
623             m_ispc_target = ISPCTarget::genx_x16;
624             break;
625         case CPU_GENX_TGLLP:
626             m_ispc_target = ISPCTarget::genx_x16;
627             break;
628 #endif
629 
630         case CPU_KNL:
631             m_ispc_target = ISPCTarget::avx512knl_i32x16;
632             break;
633 
634 #if ISPC_LLVM_VERSION >= ISPC_LLVM_12_0
635         case CPU_SPR:
636 #endif
637         case CPU_TGL:
638         case CPU_ICX:
639         case CPU_ICL:
640         case CPU_SKX:
641             m_ispc_target = ISPCTarget::avx512skx_i32x16;
642             break;
643 
644 #if ISPC_LLVM_VERSION >= ISPC_LLVM_12_0
645         case CPU_ADL:
646 #endif
647         case CPU_Broadwell:
648         case CPU_Haswell:
649             m_ispc_target = ISPCTarget::avx2_i32x8;
650             break;
651 
652         case CPU_IvyBridge:
653         case CPU_SandyBridge:
654             m_ispc_target = ISPCTarget::avx1_i32x8;
655             break;
656 
657             // Penryn is here because ISPC does not use SSE 4.2
658         case CPU_Penryn:
659         case CPU_Nehalem:
660         case CPU_Silvermont:
661             m_ispc_target = ISPCTarget::sse4_i32x4;
662             break;
663 
664         case CPU_PS4:
665             m_ispc_target = ISPCTarget::avx1_i32x4;
666             break;
667 
668         default:
669             m_ispc_target = ISPCTarget::sse2_i32x4;
670             break;
671         }
672         if (CPUID != CPU_None) {
673             std::string target_string = ISPCTargetToString(m_ispc_target);
674             Warning(SourcePos(),
675                     "No --target specified on command-line."
676                     " Using ISA \"%s\" based on specified CPU \"%s\".",
677                     target_string.c_str(), cpu);
678         }
679     }
680 
681     if (m_ispc_target == ISPCTarget::host) {
682         m_ispc_target = lGetSystemISA();
683     }
684 
685     if (arch == Arch::none) {
686 #ifdef ISPC_ARM_ENABLED
687         if (ISPCTargetIsNeon(m_ispc_target)) {
688 #if defined(__arm__)
689             arch = Arch::arm;
690 #else
691             arch = Arch::aarch64;
692 #endif
693         } else
694 #endif
695 #if ISPC_GENX_ENABLED
696             if (ISPCTargetIsGen(m_ispc_target)) {
697             arch = Arch::genx64;
698         } else
699 #endif
700             arch = Arch::x86_64;
701     }
702 
703     bool error = false;
704     // Make sure the target architecture is a known one; print an error
705     // with the valid ones otherwise.
706     for (llvm::TargetRegistry::iterator iter = llvm::TargetRegistry::targets().begin();
707          iter != llvm::TargetRegistry::targets().end(); ++iter) {
708         if (ArchToString(arch) == iter->getName()) {
709             this->m_target = &*iter;
710             break;
711         }
712     }
713     // For gen target we do not need to create target/targetMachine
714     if (this->m_target == NULL && !ISPCTargetIsGen(m_ispc_target)) {
715         std::string error_message;
716         error_message = "Invalid architecture \"";
717         error_message += ArchToString(arch);
718         error_message += "\"\nOptions: ";
719         llvm::TargetRegistry::iterator iter;
720         const char *separator = "";
721         for (iter = llvm::TargetRegistry::targets().begin(); iter != llvm::TargetRegistry::targets().end(); ++iter) {
722             error_message += separator;
723             error_message += iter->getName();
724             separator = ", ";
725         }
726         error_message += ".";
727         Error(SourcePos(), "%s", error_message.c_str());
728         error = true;
729     } else {
730         this->m_arch = arch;
731     }
732 
733     // Ensure that we have a valid target/arch combination.
734     if (!lIsTargetValidforArch(m_ispc_target, arch)) {
735         std::string str_arch = ArchToString(arch);
736         std::string target_string = ISPCTargetToString(m_ispc_target);
737         Error(SourcePos(), "arch = %s and target = %s is not a valid combination.", str_arch.c_str(),
738               target_string.c_str());
739         return;
740     }
741 #ifdef ISPC_GENX_ENABLED
742     if ((ISPCTargetIsGen(m_ispc_target)) && (CPUID == CPU_GENX_TGLLP)) {
743         m_hasFp64Support = false;
744     }
745     // In case of gen target addressing should correspond to host addressing. Otherwise SVM pointers will not work.
746     if (arch == Arch::genx32) {
747         g->opt.force32BitAddressing = true;
748     } else if (arch == Arch::genx64) {
749         g->opt.force32BitAddressing = false;
750     }
751 #endif
752     // Check default LLVM generated targets
753     bool unsupported_target = false;
754     switch (m_ispc_target) {
755     case ISPCTarget::sse2_i32x4:
756         this->m_isa = Target::SSE2;
757         this->m_nativeVectorWidth = 4;
758         this->m_nativeVectorAlignment = 16;
759         this->m_dataTypeWidth = 32;
760         this->m_vectorWidth = 4;
761         this->m_maskingIsFree = false;
762         this->m_maskBitCount = 32;
763         this->m_warnFtoU32IsExpensive = true;
764         CPUfromISA = CPU_x86_64;
765         break;
766     case ISPCTarget::sse2_i32x8:
767         this->m_isa = Target::SSE2;
768         this->m_nativeVectorWidth = 4;
769         this->m_nativeVectorAlignment = 16;
770         this->m_dataTypeWidth = 32;
771         this->m_vectorWidth = 8;
772         this->m_maskingIsFree = false;
773         this->m_maskBitCount = 32;
774         this->m_warnFtoU32IsExpensive = true;
775         CPUfromISA = CPU_Core2;
776         break;
777     case ISPCTarget::sse4_i8x16:
778         this->m_isa = Target::SSE4;
779         this->m_nativeVectorWidth = 16;
780         this->m_nativeVectorAlignment = 16;
781         this->m_dataTypeWidth = 8;
782         this->m_vectorWidth = 16;
783         this->m_maskingIsFree = false;
784         this->m_maskBitCount = 8;
785         this->m_warnFtoU32IsExpensive = true;
786         CPUfromISA = CPU_Nehalem;
787         break;
788     case ISPCTarget::sse4_i16x8:
789         this->m_isa = Target::SSE4;
790         this->m_nativeVectorWidth = 8;
791         this->m_nativeVectorAlignment = 16;
792         this->m_dataTypeWidth = 16;
793         this->m_vectorWidth = 8;
794         this->m_maskingIsFree = false;
795         this->m_maskBitCount = 16;
796         this->m_warnFtoU32IsExpensive = true;
797         CPUfromISA = CPU_Nehalem;
798         break;
799     case ISPCTarget::sse4_i32x4:
800         this->m_isa = Target::SSE4;
801         this->m_nativeVectorWidth = 4;
802         this->m_nativeVectorAlignment = 16;
803         this->m_dataTypeWidth = 32;
804         this->m_vectorWidth = 4;
805         this->m_maskingIsFree = false;
806         this->m_maskBitCount = 32;
807         this->m_warnFtoU32IsExpensive = true;
808         CPUfromISA = CPU_Nehalem;
809         break;
810     case ISPCTarget::sse4_i32x8:
811         this->m_isa = Target::SSE4;
812         this->m_nativeVectorWidth = 4;
813         this->m_nativeVectorAlignment = 16;
814         this->m_dataTypeWidth = 32;
815         this->m_vectorWidth = 8;
816         this->m_maskingIsFree = false;
817         this->m_maskBitCount = 32;
818         this->m_warnFtoU32IsExpensive = true;
819         CPUfromISA = CPU_Nehalem;
820         break;
821     case ISPCTarget::avx1_i32x4:
822         this->m_isa = Target::AVX;
823         this->m_nativeVectorWidth = 8;
824         this->m_nativeVectorAlignment = 32;
825         this->m_dataTypeWidth = 32;
826         this->m_vectorWidth = 4;
827         this->m_maskingIsFree = false;
828         this->m_maskBitCount = 32;
829         this->m_warnFtoU32IsExpensive = true;
830         CPUfromISA = CPU_SandyBridge;
831         break;
832     case ISPCTarget::avx1_i32x8:
833         this->m_isa = Target::AVX;
834         this->m_nativeVectorWidth = 8;
835         this->m_nativeVectorAlignment = 32;
836         this->m_dataTypeWidth = 32;
837         this->m_vectorWidth = 8;
838         this->m_maskingIsFree = false;
839         this->m_maskBitCount = 32;
840         this->m_warnFtoU32IsExpensive = true;
841         CPUfromISA = CPU_SandyBridge;
842         break;
843     case ISPCTarget::avx1_i32x16:
844         this->m_isa = Target::AVX;
845         this->m_nativeVectorWidth = 8;
846         this->m_nativeVectorAlignment = 32;
847         this->m_dataTypeWidth = 32;
848         this->m_vectorWidth = 16;
849         this->m_maskingIsFree = false;
850         this->m_maskBitCount = 32;
851         this->m_warnFtoU32IsExpensive = true;
852         CPUfromISA = CPU_SandyBridge;
853         break;
854     case ISPCTarget::avx1_i64x4:
855         this->m_isa = Target::AVX;
856         this->m_nativeVectorWidth = 8; /* native vector width in terms of floats */
857         this->m_nativeVectorAlignment = 32;
858         this->m_dataTypeWidth = 64;
859         this->m_vectorWidth = 4;
860         this->m_maskingIsFree = false;
861         this->m_maskBitCount = 64;
862         this->m_warnFtoU32IsExpensive = true;
863         CPUfromISA = CPU_SandyBridge;
864         break;
865     case ISPCTarget::avx2_i8x32:
866         this->m_isa = Target::AVX2;
867         this->m_nativeVectorWidth = 32;
868         this->m_nativeVectorAlignment = 32;
869         this->m_dataTypeWidth = 8;
870         this->m_vectorWidth = 32;
871         this->m_maskingIsFree = false;
872         this->m_maskBitCount = 8;
873         this->m_hasHalf = true;
874         this->m_hasRand = true;
875         this->m_hasGather = true;
876         this->m_warnFtoU32IsExpensive = true;
877         CPUfromISA = CPU_Haswell;
878         break;
879     case ISPCTarget::avx2_i16x16:
880         this->m_isa = Target::AVX2;
881         this->m_nativeVectorWidth = 16;
882         this->m_nativeVectorAlignment = 32;
883         this->m_dataTypeWidth = 16;
884         this->m_vectorWidth = 16;
885         this->m_maskingIsFree = false;
886         this->m_maskBitCount = 16;
887         this->m_hasHalf = true;
888         this->m_hasRand = true;
889         this->m_hasGather = true;
890         this->m_warnFtoU32IsExpensive = true;
891         CPUfromISA = CPU_Haswell;
892         break;
893     case ISPCTarget::avx2_i32x4:
894         this->m_isa = Target::AVX2;
895         this->m_nativeVectorWidth = 8;
896         this->m_nativeVectorAlignment = 32;
897         this->m_dataTypeWidth = 32;
898         this->m_vectorWidth = 4;
899         this->m_maskingIsFree = false;
900         this->m_maskBitCount = 32;
901         this->m_hasHalf = true;
902         this->m_hasRand = true;
903         this->m_hasGather = true;
904         this->m_warnFtoU32IsExpensive = true;
905         CPUfromISA = CPU_Haswell;
906         break;
907     case ISPCTarget::avx2_i32x8:
908         this->m_isa = Target::AVX2;
909         this->m_nativeVectorWidth = 8;
910         this->m_nativeVectorAlignment = 32;
911         this->m_dataTypeWidth = 32;
912         this->m_vectorWidth = 8;
913         this->m_maskingIsFree = false;
914         this->m_maskBitCount = 32;
915         this->m_hasHalf = true;
916         this->m_hasRand = true;
917         this->m_hasGather = true;
918         this->m_warnFtoU32IsExpensive = true;
919         CPUfromISA = CPU_Haswell;
920         break;
921     case ISPCTarget::avx2_i32x16:
922         this->m_isa = Target::AVX2;
923         this->m_nativeVectorWidth = 16;
924         this->m_nativeVectorAlignment = 32;
925         this->m_dataTypeWidth = 32;
926         this->m_vectorWidth = 16;
927         this->m_maskingIsFree = false;
928         this->m_maskBitCount = 32;
929         this->m_hasHalf = true;
930         this->m_hasRand = true;
931         this->m_hasGather = true;
932         this->m_warnFtoU32IsExpensive = true;
933         CPUfromISA = CPU_Haswell;
934         break;
935     case ISPCTarget::avx2_i64x4:
936         this->m_isa = Target::AVX2;
937         this->m_nativeVectorWidth = 8; /* native vector width in terms of floats */
938         this->m_nativeVectorAlignment = 32;
939         this->m_dataTypeWidth = 64;
940         this->m_vectorWidth = 4;
941         this->m_maskingIsFree = false;
942         this->m_maskBitCount = 64;
943         this->m_hasHalf = true;
944         this->m_hasRand = true;
945         this->m_hasGather = true;
946         this->m_warnFtoU32IsExpensive = true;
947         CPUfromISA = CPU_Haswell;
948         break;
949     case ISPCTarget::avx512knl_i32x16:
950         this->m_isa = Target::KNL_AVX512;
951         this->m_nativeVectorWidth = 16;
952         this->m_nativeVectorAlignment = 64;
953         this->m_dataTypeWidth = 32;
954         this->m_vectorWidth = 16;
955         this->m_maskingIsFree = true;
956         this->m_maskBitCount = 1;
957         this->m_hasHalf = true;
958         this->m_hasRand = true;
959         this->m_hasGather = this->m_hasScatter = true;
960         this->m_hasTranscendentals = false;
961         // For MIC it is set to true due to performance reasons. The option should be tested.
962         this->m_hasTrigonometry = false;
963         this->m_hasRsqrtd = this->m_hasRcpd = false;
964         this->m_hasVecPrefetch = false;
965         CPUfromISA = CPU_KNL;
966         break;
967     case ISPCTarget::avx512skx_i32x8:
968         this->m_isa = Target::SKX_AVX512;
969         this->m_nativeVectorWidth = 16;
970         this->m_nativeVectorAlignment = 64;
971         this->m_dataTypeWidth = 32;
972         this->m_vectorWidth = 8;
973         this->m_maskingIsFree = true;
974         this->m_maskBitCount = 1;
975         this->m_hasHalf = true;
976         this->m_hasRand = true;
977         this->m_hasGather = this->m_hasScatter = true;
978         this->m_hasTranscendentals = false;
979         this->m_hasTrigonometry = false;
980         this->m_hasRsqrtd = this->m_hasRcpd = false;
981         this->m_hasVecPrefetch = false;
982         CPUfromISA = CPU_SKX;
983         this->m_funcAttributes.push_back(std::make_pair("prefer-vector-width", "256"));
984         this->m_funcAttributes.push_back(std::make_pair("min-legal-vector-width", "256"));
985         break;
986     case ISPCTarget::avx512skx_i32x16:
987         this->m_isa = Target::SKX_AVX512;
988         this->m_nativeVectorWidth = 16;
989         this->m_nativeVectorAlignment = 64;
990         this->m_dataTypeWidth = 32;
991         this->m_vectorWidth = 16;
992         this->m_maskingIsFree = true;
993         this->m_maskBitCount = 1;
994         this->m_hasHalf = true;
995         this->m_hasRand = true;
996         this->m_hasGather = this->m_hasScatter = true;
997         this->m_hasTranscendentals = false;
998         this->m_hasTrigonometry = false;
999         this->m_hasRsqrtd = this->m_hasRcpd = false;
1000         this->m_hasVecPrefetch = false;
1001         CPUfromISA = CPU_SKX;
1002         if (g->opt.disableZMM) {
1003             this->m_funcAttributes.push_back(std::make_pair("prefer-vector-width", "256"));
1004             this->m_funcAttributes.push_back(std::make_pair("min-legal-vector-width", "256"));
1005         } else {
1006             this->m_funcAttributes.push_back(std::make_pair("prefer-vector-width", "512"));
1007             this->m_funcAttributes.push_back(std::make_pair("min-legal-vector-width", "512"));
1008         }
1009         break;
1010     case ISPCTarget::avx512skx_i8x64:
1011         // This target is enabled only for LLVM 10.0 and later
1012         // because LLVM requires a number of fixes, which are
1013         // committed to LLVM 11.0 and can be applied to 10.0, but not
1014         // earlier versions.
1015         this->m_isa = Target::SKX_AVX512;
1016         this->m_nativeVectorWidth = 64;
1017         this->m_nativeVectorAlignment = 64;
1018         this->m_dataTypeWidth = 8;
1019         this->m_vectorWidth = 64;
1020         this->m_maskingIsFree = true;
1021         this->m_maskBitCount = 1;
1022         this->m_hasHalf = true;
1023         this->m_hasRand = true;
1024         this->m_hasGather = this->m_hasScatter = true;
1025         this->m_hasTranscendentals = false;
1026         this->m_hasTrigonometry = false;
1027         this->m_hasRsqrtd = this->m_hasRcpd = false;
1028         this->m_hasVecPrefetch = false;
1029         CPUfromISA = CPU_SKX;
1030         break;
1031     case ISPCTarget::avx512skx_i16x32:
1032         // This target is enabled only for LLVM 10.0 and later
1033         // because LLVM requires a number of fixes, which are
1034         // committed to LLVM 11.0 and can be applied to 10.0, but not
1035         // earlier versions.
1036         this->m_isa = Target::SKX_AVX512;
1037         this->m_nativeVectorWidth = 64;
1038         this->m_nativeVectorAlignment = 64;
1039         this->m_dataTypeWidth = 16;
1040         this->m_vectorWidth = 32;
1041         this->m_maskingIsFree = true;
1042         this->m_maskBitCount = 1;
1043         this->m_hasHalf = true;
1044         this->m_hasRand = true;
1045         this->m_hasGather = this->m_hasScatter = true;
1046         this->m_hasTranscendentals = false;
1047         this->m_hasTrigonometry = false;
1048         this->m_hasRsqrtd = this->m_hasRcpd = false;
1049         this->m_hasVecPrefetch = false;
1050         CPUfromISA = CPU_SKX;
1051         break;
1052 #ifdef ISPC_ARM_ENABLED
1053     case ISPCTarget::neon_i8x16:
1054         this->m_isa = Target::NEON;
1055         this->m_nativeVectorWidth = 16;
1056         this->m_nativeVectorAlignment = 16;
1057         this->m_dataTypeWidth = 8;
1058         this->m_vectorWidth = 16;
1059         this->m_hasHalf = true; // ??
1060         // https://github.com/ispc/ispc/issues/2052
1061         // AArch64 disables Coherent Control Flow optimization because of a bug in
1062         // LLVM aarch64 back-end that reduces the efficiency of simplifyCFG.
1063         // Branches added by CCF can only be removed after the back-end formed
1064         // fused-multiply-adds.  This reduces the quality of code as most of scalar
1065         // optimizations will not apply.
1066         // FIXME: Consider turning this optimization back on after
1067         // https://reviews.llvm.org/D100963 gets committed to LLVM-13.
1068         // This note applies to all NEON targets below.
1069         this->m_maskingIsFree = (arch == Arch::aarch64);
1070         this->m_maskBitCount = 8;
1071         break;
1072     case ISPCTarget::neon_i16x8:
1073         this->m_isa = Target::NEON;
1074         this->m_nativeVectorWidth = 8;
1075         this->m_nativeVectorAlignment = 16;
1076         this->m_dataTypeWidth = 16;
1077         this->m_vectorWidth = 8;
1078         this->m_hasHalf = true; // ??
1079         this->m_maskingIsFree = (arch == Arch::aarch64);
1080         this->m_maskBitCount = 16;
1081         break;
1082     case ISPCTarget::neon_i32x4:
1083         this->m_isa = Target::NEON;
1084         this->m_nativeVectorWidth = 4;
1085         this->m_nativeVectorAlignment = 16;
1086         this->m_dataTypeWidth = 32;
1087         this->m_vectorWidth = 4;
1088         this->m_hasHalf = true; // ??
1089         this->m_maskingIsFree = (arch == Arch::aarch64);
1090         this->m_maskBitCount = 32;
1091         break;
1092     case ISPCTarget::neon_i32x8:
1093         this->m_isa = Target::NEON;
1094         this->m_nativeVectorWidth = 4;
1095         this->m_nativeVectorAlignment = 16;
1096         this->m_dataTypeWidth = 32;
1097         this->m_vectorWidth = 8;
1098         this->m_hasHalf = true; // ??
1099         this->m_maskingIsFree = (arch == Arch::aarch64);
1100         this->m_maskBitCount = 32;
1101         break;
1102 #else
1103     case ISPCTarget::neon_i8x16:
1104     case ISPCTarget::neon_i16x8:
1105     case ISPCTarget::neon_i32x4:
1106     case ISPCTarget::neon_i32x8:
1107         unsupported_target = true;
1108         break;
1109 #endif
1110 #ifdef ISPC_WASM_ENABLED
1111     case ISPCTarget::wasm_i32x4:
1112         this->m_isa = Target::WASM;
1113         this->m_nativeVectorWidth = 4;
1114         this->m_nativeVectorAlignment = 16;
1115         this->m_dataTypeWidth = 32;
1116         this->m_vectorWidth = 4;
1117         this->m_hasHalf = false;
1118         this->m_maskingIsFree = false;
1119         this->m_maskBitCount = 32;
1120         this->m_hasTranscendentals = false;
1121         this->m_hasTrigonometry = false;
1122         this->m_hasRcpd = false;
1123         this->m_hasRsqrtd = false;
1124         this->m_hasScatter = false;
1125         this->m_hasGather = false;
1126         this->m_hasVecPrefetch = false;
1127         break;
1128 #else
1129     case ISPCTarget::wasm_i32x4:
1130         unsupported_target = true;
1131         break;
1132 #endif
1133 #ifdef ISPC_GENX_ENABLED
1134     case ISPCTarget::genx_x8:
1135         this->m_isa = Target::GENX;
1136         this->m_nativeVectorWidth = 8;
1137         this->m_nativeVectorAlignment = 64;
1138         this->m_vectorWidth = 8;
1139         this->m_dataTypeWidth = 32;
1140         this->m_hasHalf = true;
1141         this->m_maskingIsFree = true;
1142         this->m_maskBitCount = 1;
1143         this->m_hasSaturatingArithmetic = true;
1144         this->m_hasTranscendentals = true;
1145         this->m_hasTrigonometry = true;
1146         this->m_hasGather = this->m_hasScatter = true;
1147         CPUfromISA = CPU_GENX;
1148         break;
1149     case ISPCTarget::genx_x16:
1150         this->m_isa = Target::GENX;
1151         this->m_nativeVectorWidth = 16;
1152         this->m_nativeVectorAlignment = 64;
1153         this->m_vectorWidth = 16;
1154         this->m_dataTypeWidth = 32;
1155         this->m_hasHalf = true;
1156         this->m_maskingIsFree = true;
1157         this->m_maskBitCount = 1;
1158         this->m_hasSaturatingArithmetic = true;
1159         this->m_hasTranscendentals = true;
1160         this->m_hasTrigonometry = true;
1161         this->m_hasGather = this->m_hasScatter = true;
1162         CPUfromISA = CPU_GENX;
1163         break;
1164 #else
1165     case ISPCTarget::genx_x8:
1166     case ISPCTarget::genx_x16:
1167         unsupported_target = true;
1168         break;
1169 #endif
1170     case ISPCTarget::none:
1171     case ISPCTarget::host:
1172     case ISPCTarget::error:
1173         unsupported_target = true;
1174         break;
1175     }
1176 
1177     if (unsupported_target) {
1178         // Hitting one of unsupported targets is internal error.
1179         // Proper reporting about incorrect targets is done during options parsing.
1180         std::string target_string = "Problem with target (" + ISPCTargetToString(m_ispc_target) + ")";
1181         FATAL(target_string.c_str());
1182     }
1183 
1184 #if defined(ISPC_ARM_ENABLED)
1185     if ((CPUID == CPU_None) && ISPCTargetIsNeon(m_ispc_target)) {
1186         if (arch == Arch::arm) {
1187             CPUID = CPU_CortexA9;
1188         } else if (arch == Arch::aarch64) {
1189             if (g->target_os == TargetOS::ios) {
1190                 CPUID = CPU_AppleA7;
1191             } else if (g->target_os == TargetOS::macos) {
                // Open source LLVM doesn't have a definition for the M1 CPU, so use the latest iPhone CPU.
1193 #if ISPC_LLVM_VERSION >= ISPC_LLVM_12_0
1194                 CPUID = CPU_AppleA14;
1195 #else
1196                 CPUID = CPU_AppleA13;
1197 #endif
1198             } else {
1199                 CPUID = CPU_CortexA35;
1200             }
1201         } else {
1202             UNREACHABLE();
1203         }
1204     }
1205 #endif
1206 
1207     if (CPUID == CPU_None) {
1208         cpu = a.GetDefaultNameFromType(CPUfromISA).c_str();
1209     } else {
1210         if ((CPUfromISA != CPU_None) && !a.BackwardCompatible(CPUID, CPUfromISA)) {
1211             std::string target_string = ISPCTargetToString(m_ispc_target);
1212             Error(SourcePos(),
1213                   "The requested CPU (%s) is incompatible"
1214                   " with the CPU required for %s target (%s)",
1215                   cpu, target_string.c_str(), a.GetDefaultNameFromType(CPUfromISA).c_str());
1216             return;
1217         }
1218         cpu = a.GetDefaultNameFromType(CPUID).c_str();
1219     }
1220     this->m_cpu = cpu;
1221 
1222     if (!error) {
1223         // Create TargetMachine
1224         std::string triple = GetTripleString();
1225 
        // The last validity check to ensure that support for this target was enabled in the build.
1227         if (!g->target_registry->isSupported(m_ispc_target, g->target_os, arch)) {
1228             std::string target_string = ISPCTargetToString(m_ispc_target);
1229             std::string arch_str = ArchToString(arch);
1230             std::string os_str = OSToString(g->target_os);
1231             Error(SourcePos(), "%s target for %s on %s is not supported in current build.", target_string.c_str(),
1232                   arch_str.c_str(), os_str.c_str());
1233             return;
1234         }
1235 
1236         llvm::Optional<llvm::Reloc::Model> relocModel;
1237         if (m_generatePIC) {
1238             relocModel = llvm::Reloc::PIC_;
1239         }
1240         llvm::TargetOptions options;
1241 #ifdef ISPC_ARM_ENABLED
1242         options.FloatABIType = llvm::FloatABI::Hard;
1243         if (arch == Arch::arm) {
1244             if (g->target_os == TargetOS::custom_linux) {
1245                 this->m_funcAttributes.push_back(std::make_pair("target-features", "+crypto,+fp-armv8,+neon,+sha2"));
1246             } else {
1247                 this->m_funcAttributes.push_back(std::make_pair("target-features", "+neon,+fp16"));
1248             }
1249             featuresString = "+neon,+fp16";
1250         } else if (arch == Arch::aarch64) {
1251             if (g->target_os == TargetOS::custom_linux) {
1252                 this->m_funcAttributes.push_back(
1253                     std::make_pair("target-features", "+aes,+crc,+crypto,+fp-armv8,+neon,+sha2"));
1254             } else {
1255                 this->m_funcAttributes.push_back(std::make_pair("target-features", "+neon"));
1256             }
1257             featuresString = "+neon";
1258         }
1259 #endif
1260 
1261         // Support 'i64' and 'double' types in cm
1262         if (isGenXTarget())
1263             featuresString += "+longlong";
1264 
1265         if (g->opt.disableFMA == false)
1266             options.AllowFPOpFusion = llvm::FPOpFusion::Fast;
1267 
1268         // For gen target we do not need to create target/targetMachine
1269         if (!isGenXTarget()) {
1270             m_targetMachine = m_target->createTargetMachine(triple, m_cpu, featuresString, options, relocModel);
1271             Assert(m_targetMachine != NULL);
1272 
1273             // Set Optimization level for llvm codegen based on Optimization level
1274             // requested by user via ISPC Optimization Flag. Mapping is :
1275             // ISPC O0 -> Codegen O0
1276             // ISPC O1,O2,O3,default -> Codegen O3
1277             llvm::CodeGenOpt::Level cOptLevel = llvm::CodeGenOpt::Level::Aggressive;
1278             switch (g->codegenOptLevel) {
1279             case Globals::CodegenOptLevel::None:
1280                 cOptLevel = llvm::CodeGenOpt::Level::None;
1281                 break;
1282 
1283             case Globals::CodegenOptLevel::Aggressive:
1284                 cOptLevel = llvm::CodeGenOpt::Level::Aggressive;
1285                 break;
1286             }
1287             m_targetMachine->setOptLevel(cOptLevel);
1288 
1289             m_targetMachine->Options.MCOptions.AsmVerbose = true;
1290 
1291             // Change default version of generated DWARF.
1292             if (g->generateDWARFVersion != 0) {
1293                 m_targetMachine->Options.MCOptions.DwarfVersion = g->generateDWARFVersion;
1294             }
1295         }
1296         // Initialize TargetData/DataLayout in 3 steps.
1297         // 1. Get default data layout first
1298         std::string dl_string;
1299         if (m_targetMachine != NULL)
1300             dl_string = m_targetMachine->createDataLayout().getStringRepresentation();
1301         if (isGenXTarget())
1302             dl_string = m_arch == Arch::genx64 ? "e-p:64:64-i64:64-n8:16:32" : "e-p:32:32-i64:64-n8:16:32";
1303 
1304         // 2. Finally set member data
1305         m_dataLayout = new llvm::DataLayout(dl_string);
1306 
1307         // Set is32Bit
1308         // This indicates if we are compiling for 32 bit platform and can assume 32 bit runtime.
1309 
1310         this->m_is32Bit = (getDataLayout()->getPointerSize() == 4);
1311 
1312         // TO-DO : Revisit addition of "target-features" and "target-cpu" for ARM support.
1313         llvm::AttrBuilder fattrBuilder;
1314 #ifdef ISPC_ARM_ENABLED
1315         if (m_isa == Target::NEON)
1316             fattrBuilder.addAttribute("target-cpu", this->m_cpu);
1317 #endif
1318         for (auto const &f_attr : m_funcAttributes)
1319             fattrBuilder.addAttribute(f_attr.first, f_attr.second);
1320         this->m_tf_attributes = new llvm::AttrBuilder(fattrBuilder);
1321 
1322         Assert(this->m_vectorWidth <= ISPC_MAX_NVEC);
1323     }
1324 
1325     m_valid = !error;
1326 
1327     if (printTarget) {
1328         if (!isGenXTarget()) {
1329             printf("Target Triple: %s\n", m_targetMachine->getTargetTriple().str().c_str());
1330             printf("Target CPU: %s\n", m_targetMachine->getTargetCPU().str().c_str());
1331             printf("Target Feature String: %s\n", m_targetMachine->getTargetFeatureString().str().c_str());
1332         } else {
1333             printf("Target Triple: %s\n", this->GetTripleString().c_str());
1334             printf("Target GPU: %s\n", this->getCPU().c_str());
1335             printf("Target Feature String: %s\n", featuresString.c_str());
1336         }
1337     }
1338 
1339     return;
1340 }
1341 
checkIntrinsticSupport(llvm::StringRef name,SourcePos pos)1342 bool Target::checkIntrinsticSupport(llvm::StringRef name, SourcePos pos) {
1343     if (name.consume_front("llvm.") == false) {
1344         return false;
1345     }
1346     // x86 specific intrinsics are verified using 'CPUFeatures'.
1347     // TODO: Add relevant information to 'CPUFeatures' for non x86 targets.
1348     if (name.consume_front("x86.") == true) {
1349         if (!ISPCTargetIsX86(m_ispc_target)) {
1350             Error(pos, "LLVM intrinsic \"%s\" supported only on \"x86\" target architecture.", name.data());
1351             return false;
1352         }
1353         AllCPUs a;
1354         std::string featureName = name.substr(0, name.find('.')).str();
1355         if (CPUFeatures[a.GetTypeFromName(this->getCPU())].count(featureName) == 0) {
1356             Error(pos, "Target specfic LLVM intrinsic \"%s\" not supported on \"%s\" CPU.", name.data(),
1357                   this->getCPU().c_str());
1358             return false;
1359         }
1360     } else if (name.consume_front("arm.") == true) {
1361         if (m_arch != Arch::arm) {
1362             Error(pos, "LLVM intrinsic \"%s\" supported only on \"arm\" target architecture.", name.data());
1363             return false;
1364         }
1365         // TODO: Check 'CPUFeatures'.
1366     } else if (name.consume_front("aarch64.") == true) {
1367         if (m_arch != Arch::aarch64) {
1368             Error(pos, "LLVM intrinsic \"%s\" supported only on \"aarch64\" target architecture.", name.data());
1369             return false;
1370         }
1371         // TODO: Check 'CPUFeatures'.
1372     } else if (name.consume_front("wasm.") == true) {
1373         // TODO: Add Condition in future if relevant.
1374         // For now, returning 'true'.
1375         return true;
1376     }
1377     return true;
1378 }
1379 
SupportedCPUs()1380 std::string Target::SupportedCPUs() {
1381     AllCPUs a;
1382     return a.HumanReadableListOfNames();
1383 }
1384 
GetTripleString() const1385 std::string Target::GetTripleString() const {
1386     llvm::Triple triple;
1387     switch (g->target_os) {
1388     case TargetOS::windows:
1389         if (m_arch == Arch::x86) {
1390             triple.setArchName("i686");
1391         } else if (m_arch == Arch::x86_64) {
1392             triple.setArchName("x86_64");
1393         } else if (m_arch == Arch::arm) {
1394             Error(SourcePos(), "Arm is not supported on Windows.");
1395             exit(1);
1396         } else if (m_arch == Arch::aarch64) {
1397             Error(SourcePos(), "Aarch64 is not supported on Windows.");
1398             exit(1);
1399         } else if (m_arch == Arch::genx32) {
1400             triple.setArchName("spir");
1401         } else if (m_arch == Arch::genx64) {
1402             triple.setArchName("spir64");
1403         } else {
1404             Error(SourcePos(), "Unknown arch.");
1405             exit(1);
1406         }
1407 #ifdef ISPC_GENX_ENABLED
1408         if (m_arch == Arch::genx32 || m_arch == Arch::genx64) {
1409             //"spir64-unknown-unknown"
1410             triple.setVendor(llvm::Triple::VendorType::UnknownVendor);
1411             triple.setOS(llvm::Triple::OSType::UnknownOS);
1412             return triple.str();
1413         }
1414 #endif
1415         //"x86_64-pc-windows-msvc"
1416         triple.setVendor(llvm::Triple::VendorType::PC);
1417         triple.setOS(llvm::Triple::OSType::Win32);
1418         triple.setEnvironment(llvm::Triple::EnvironmentType::MSVC);
1419         break;
1420     case TargetOS::custom_linux:
1421     case TargetOS::linux:
1422         if (m_arch == Arch::x86) {
1423             triple.setArchName("i686");
1424         } else if (m_arch == Arch::x86_64) {
1425             triple.setArchName("x86_64");
1426         } else if (m_arch == Arch::arm) {
1427             triple.setArchName("armv7");
1428         } else if (m_arch == Arch::aarch64) {
1429             triple.setArchName("aarch64");
1430         } else if (m_arch == Arch::genx32) {
1431             triple.setArchName("spir");
1432         } else if (m_arch == Arch::genx64) {
1433             triple.setArchName("spir64");
1434         } else {
1435             Error(SourcePos(), "Unknown arch.");
1436             exit(1);
1437         }
1438 #ifdef ISPC_GENX_ENABLED
1439         if (m_arch == Arch::genx32 || m_arch == Arch::genx64) {
1440             //"spir64-unknown-unknown"
1441             triple.setVendor(llvm::Triple::VendorType::UnknownVendor);
1442             triple.setOS(llvm::Triple::OSType::UnknownOS);
1443             return triple.str();
1444         }
1445 #endif
1446         triple.setVendor(llvm::Triple::VendorType::UnknownVendor);
1447         triple.setOS(llvm::Triple::OSType::Linux);
1448         if (m_arch == Arch::x86 || m_arch == Arch::x86_64 || m_arch == Arch::aarch64 || m_arch == Arch::genx32 ||
1449             m_arch == Arch::genx64) {
1450             triple.setEnvironment(llvm::Triple::EnvironmentType::GNU);
1451         } else if (m_arch == Arch::arm) {
1452             triple.setEnvironment(llvm::Triple::EnvironmentType::GNUEABIHF);
1453         } else {
1454             Error(SourcePos(), "Unknown arch.");
1455             exit(1);
1456         }
1457         break;
1458     case TargetOS::freebsd:
1459         if (m_arch == Arch::x86) {
1460             triple.setArchName("i686");
1461         } else if (m_arch == Arch::x86_64) {
1462             triple.setArchName("amd64");
1463         } else if (m_arch == Arch::arm) {
1464             triple.setArchName("armv7");
1465         } else if (m_arch == Arch::aarch64) {
1466             triple.setArchName("aarch64");
1467         } else if (m_arch == Arch::genx32) {
1468             triple.setArchName("spir");
1469         } else if (m_arch == Arch::genx64) {
1470             triple.setArchName("spir64");
1471         } else {
1472             Error(SourcePos(), "Unknown arch.");
1473             exit(1);
1474         }
1475 #ifdef ISPC_GENX_ENABLED
1476         if (m_arch == Arch::genx32 || m_arch == Arch::genx64) {
1477             //"spir64-unknown-unknown"
1478             triple.setVendor(llvm::Triple::VendorType::UnknownVendor);
1479             triple.setOS(llvm::Triple::OSType::UnknownOS);
1480             return triple.str();
1481         }
1482 #endif
1483         triple.setVendor(llvm::Triple::VendorType::UnknownVendor);
1484         triple.setOS(llvm::Triple::OSType::FreeBSD);
1485         break;
1486     case TargetOS::dragonfly:
1487         if (m_arch == Arch::x86) {
1488             triple.setArchName("i686");
1489         } else if (m_arch == Arch::x86_64) {
1490             triple.setArchName("amd64");
1491         } else if (m_arch == Arch::arm) {
1492             triple.setArchName("armv7");
1493         } else if (m_arch == Arch::aarch64) {
1494             triple.setArchName("aarch64");
1495         } else {
1496             Error(SourcePos(), "Unknown arch.");
1497             exit(1);
1498         }
1499         triple.setVendor(llvm::Triple::VendorType::UnknownVendor);
1500         triple.setOS(llvm::Triple::OSType::DragonFly);
1501         break;
1502     case TargetOS::macos:
1503         // asserts
1504         if (m_arch == Arch::x86_64) {
1505             triple.setArchName("x86_64");
1506         } else if (m_arch == Arch::aarch64) {
1507             triple.setArchName("arm64");
1508         } else {
1509             Error(SourcePos(), "macOS target supports only x86_64 and aarch64.");
1510             exit(1);
1511         }
1512         triple.setVendor(llvm::Triple::VendorType::Apple);
1513         triple.setOS(llvm::Triple::OSType::MacOSX);
1514         break;
1515     case TargetOS::android:
1516         if (m_arch == Arch::x86) {
1517             triple.setArchName("i686");
1518         } else if (m_arch == Arch::x86_64) {
1519             triple.setArchName("x86_64");
1520         } else if (m_arch == Arch::arm) {
1521             triple.setArchName("armv7");
1522         } else if (m_arch == Arch::aarch64) {
1523             triple.setArchName("aarch64");
1524         } else {
1525             Error(SourcePos(), "Unknown arch.");
1526             exit(1);
1527         }
1528         triple.setVendor(llvm::Triple::VendorType::UnknownVendor);
1529         triple.setOS(llvm::Triple::OSType::Linux);
1530         triple.setEnvironment(llvm::Triple::EnvironmentType::Android);
1531         break;
1532     case TargetOS::ios:
1533         if (m_arch != Arch::aarch64) {
1534             Error(SourcePos(), "iOS target supports only aarch64.");
1535             exit(1);
1536         }
1537         // Note, for iOS arch need to be set to "arm64", instead of "aarch64".
1538         // Internet say this is for historical reasons.
1539         // "arm64-apple-ios"
1540         triple.setArchName("arm64");
1541         triple.setVendor(llvm::Triple::VendorType::Apple);
1542         triple.setOS(llvm::Triple::OSType::IOS);
1543         break;
1544     case TargetOS::ps4:
1545         if (m_arch != Arch::x86_64) {
1546             Error(SourcePos(), "PS4 target supports only x86_64.");
1547             exit(1);
1548         }
1549         // "x86_64-scei-ps4"
1550         triple.setArch(llvm::Triple::ArchType::x86_64);
1551         triple.setVendor(llvm::Triple::VendorType::SCEI);
1552         triple.setOS(llvm::Triple::OSType::PS4);
1553         break;
1554     case TargetOS::web:
1555         if (m_arch != Arch::wasm32) {
1556             Error(SourcePos(), "Web target supports only wasm32.");
1557             exit(1);
1558         }
1559         triple.setArch(llvm::Triple::ArchType::wasm32);
1560         triple.setVendor(llvm::Triple::VendorType::UnknownVendor);
1561         triple.setOS(llvm::Triple::OSType::UnknownOS);
1562         break;
1563     case TargetOS::error:
1564         Error(SourcePos(), "Invalid target OS.");
1565         exit(1);
1566     }
1567 
1568     return triple.str();
1569 }
1570 
1571 // This function returns string representation of ISA for the purpose of
1572 // mangling. And may return any unique string, preferably short, like
1573 // sse4, avx and etc.
ISAToString(ISA isa)1574 const char *Target::ISAToString(ISA isa) {
1575     switch (isa) {
1576 #ifdef ISPC_ARM_ENABLED
1577     case Target::NEON:
1578         return "neon";
1579 #endif
1580 #ifdef ISPC_WASM_ENABLED
1581     case Target::WASM:
1582         return "wasm";
1583 #endif
1584     case Target::SSE2:
1585         return "sse2";
1586     case Target::SSE4:
1587         return "sse4";
1588     case Target::AVX:
1589         return "avx";
1590     case Target::AVX2:
1591         return "avx2";
1592     case Target::KNL_AVX512:
1593         return "avx512knl";
1594     case Target::SKX_AVX512:
1595         return "avx512skx";
1596 #ifdef ISPC_GENX_ENABLED
1597     case Target::GENX:
1598         return "genx";
1599 #endif
1600     default:
1601         FATAL("Unhandled target in ISAToString()");
1602     }
1603     return "";
1604 }
1605 
GetISAString() const1606 const char *Target::GetISAString() const { return ISAToString(m_isa); }
1607 
1608 // This function returns string representation of default target corresponding
1609 // to ISA. I.e. for SSE4 it's sse4-i32x4, for AVX2 it's avx2-i32x8. This
1610 // string may be used to initialize Target.
ISAToTargetString(ISA isa)1611 const char *Target::ISAToTargetString(ISA isa) {
1612     switch (isa) {
1613 #ifdef ISPC_ARM_ENABLED
1614     case Target::NEON:
1615         return "neon-i32x4";
1616 #endif
1617 #ifdef ISPC_WASM_ENABLED
1618     case Target::WASM:
1619         return "wasm-i32x4";
1620 #endif
1621 #ifdef ISPC_GENX_ENABLED
1622     case Target::GENX:
1623         return "genx-x16";
1624 #endif
1625     case Target::SSE2:
1626         return "sse2-i32x4";
1627     case Target::SSE4:
1628         return "sse4-i32x4";
1629     case Target::AVX:
1630         return "avx1-i32x8";
1631     case Target::AVX2:
1632         return "avx2-i32x8";
1633     case Target::KNL_AVX512:
1634         return "avx512knl-i32x16";
1635     case Target::SKX_AVX512:
1636         return "avx512skx-i32x16";
1637     default:
1638         FATAL("Unhandled target in ISAToTargetString()");
1639     }
1640     return "";
1641 }
1642 
GetISATargetString() const1643 const char *Target::GetISATargetString() const { return ISAToTargetString(m_isa); }
1644 
SizeOf(llvm::Type * type,llvm::BasicBlock * insertAtEnd)1645 llvm::Value *Target::SizeOf(llvm::Type *type, llvm::BasicBlock *insertAtEnd) {
1646     uint64_t byteSize = getDataLayout()->getTypeStoreSize(type);
1647     if (m_is32Bit || g->opt.force32BitAddressing)
1648         return LLVMInt32((int32_t)byteSize);
1649     else
1650         return LLVMInt64(byteSize);
1651 }
1652 
StructOffset(llvm::Type * type,int element,llvm::BasicBlock * insertAtEnd)1653 llvm::Value *Target::StructOffset(llvm::Type *type, int element, llvm::BasicBlock *insertAtEnd) {
1654     llvm::StructType *structType = llvm::dyn_cast<llvm::StructType>(type);
1655     if (structType == NULL || structType->isSized() == false) {
1656         Assert(m->errorCount > 0);
1657         return NULL;
1658     }
1659 
1660     const llvm::StructLayout *sl = getDataLayout()->getStructLayout(structType);
1661     Assert(sl != NULL);
1662 
1663     uint64_t offset = sl->getElementOffset(element);
1664     if (m_is32Bit || g->opt.force32BitAddressing)
1665         return LLVMInt32((int32_t)offset);
1666     else
1667         return LLVMInt64(offset);
1668 }
1669 
markFuncWithTargetAttr(llvm::Function * func)1670 void Target::markFuncWithTargetAttr(llvm::Function *func) {
1671     if (m_tf_attributes) {
1672         func->addAttributes(llvm::AttributeList::FunctionIndex, *m_tf_attributes);
1673     }
1674 }
1675 
markFuncWithCallingConv(llvm::Function * func)1676 void Target::markFuncWithCallingConv(llvm::Function *func) {
1677     assert(g->calling_conv != CallingConv::uninitialized);
1678     if (g->calling_conv == CallingConv::x86_vectorcall) {
1679         func->setCallingConv(llvm::CallingConv::X86_VectorCall);
1680         // Add x86 vectorcall changes as a separate commit.
1681         /*
1682         // We have to jump through some hoops for x86.
1683         // In LLVM IR for x86, arguments which are to be passed in registers
1684         // have to marked with 'InReg' attribue.
1685         // Rules(Ref : https://docs.microsoft.com/en-us/cpp/cpp/vectorcall?view=vs-2019 )
1686         // Definitions:
1687         // Integer Type : it fits in the native register size of the processor for example,
1688         // 4 bytes on an x86 machine.Integer types include pointer, reference, and struct or union types of 4 bytes or
1689         less.
1690         // Vector Type : either a floating - point type for example, a float or double or an SIMD vector type for
1691         // example, __m128 or __m256.
1692         // Rules for x86: Integer Type : The first two integer type arguments found in the
1693         // parameter list from left to right are placed in ECX and EDX, respectively.
1694         // Vector Type : The first six vector type arguments in order from left to right are passed by value in SSE
1695         vector registers 0 to 5.
1696         //The seventh and subsequent vector type arguments are passed on the stack by reference to memory allocated by
1697         the caller.
1698         // Observations from Clang(Is there somewhere these rules are mentioned??)
1699         // Integer Type : After first Integer Type greater than 32 bit, other integer types NOT passed in reg.
1700         // Vector Type : After 6 Vector Type args, if 2 Integer Type registers are not yet used, VectorType args
1701         // passed by reference via register - TO DO
1702 
1703         if (m_arch == Arch::x86) {
1704             llvm::Function::arg_iterator argIter = func->arg_begin();
1705             llvm::FunctionType *fType = func->getFunctionType();
1706             int numArgsVecInReg = 0;
1707             int numArgsIntInReg = 0;
1708             for (; argIter != func->arg_end(); ++argIter) {
1709                 llvm::Type *argType = fType->getParamType(argIter->getArgNo());
1710                 if (argType->isIntegerTy() || argType->isStructTy() || argType->isPointerTy()) {
1711                     if (((argType->isIntegerTy()) || (argType->isStructTy())) &&
1712                         (g->target->getDataLayout()->getTypeSizeInBits(argType) > 32)) {
1713                         numArgsIntInReg = 2;
1714                         continue;
1715                     }
1716 
1717                     numArgsIntInReg++;
1718                     argIter->addAttr(llvm::Attribute::InReg);
1719                     continue;
1720                 }
1721                 if (((llvm::dyn_cast<llvm::VectorType>(argType) != NULL) || argType->isFloatTy() ||
1722                      argType->isDoubleTy())) {
1723                     numArgsVecInReg++;
1724                     argIter->addAttr(llvm::Attribute::InReg);
1725                 }
1726 
1727                 if ((numArgsIntInReg == 2) && (numArgsVecInReg == 6))
1728                     break;
1729             }
1730         }*/
1731     }
1732 }
1733 
1734 #ifdef ISPC_GENX_ENABLED
getGenxPlatform() const1735 Target::GENX_PLATFORM Target::getGenxPlatform() const {
1736     AllCPUs a;
1737     switch (a.GetTypeFromName(m_cpu)) {
1738     case CPU_GENX:
1739         return GENX_PLATFORM::GENX_GEN9;
1740     case CPU_GENX_TGLLP:
1741         return GENX_PLATFORM::GENX_TGLLP;
1742     default:
1743         return GENX_PLATFORM::GENX_GEN9;
1744     }
1745     return GENX_PLATFORM::GENX_GEN9;
1746 }
1747 
getGenxGrfSize() const1748 uint32_t Target::getGenxGrfSize() const {
1749     switch (getGenxPlatform()) {
1750     case GENX_GEN9:
1751     case GENX_TGLLP:
1752         return 32;
1753     default:
1754         return 32;
1755     }
1756     return 32;
1757 }
1758 
hasGenxPrefetch() const1759 bool Target::hasGenxPrefetch() const {
1760     switch (getGenxPlatform()) {
1761     case GENX_GEN9:
1762     case GENX_TGLLP:
1763         return false;
1764     default:
1765         return true;
1766     }
1767     return true;
1768 }
1769 #endif
1770 
1771 ///////////////////////////////////////////////////////////////////////////
1772 // Opt
1773 
Opt()1774 Opt::Opt() {
1775     level = 1;
1776     fastMath = false;
1777     fastMaskedVload = false;
1778     force32BitAddressing = true;
1779     unrollLoops = true;
1780     disableAsserts = false;
1781     disableFMA = false;
1782     forceAlignedMemory = false;
1783     disableMaskAllOnOptimizations = false;
1784     disableHandlePseudoMemoryOps = false;
1785     disableBlendedMaskedStores = false;
1786     disableCoherentControlFlow = false;
1787     disableUniformControlFlow = false;
1788     disableGatherScatterOptimizations = false;
1789     disableMaskedStoreToStore = false;
1790     disableGatherScatterFlattening = false;
1791     disableUniformMemoryOptimizations = false;
1792     disableCoalescing = false;
1793     disableZMM = false;
1794 #ifdef ISPC_GENX_ENABLED
1795     disableGenXGatherCoalescing = false;
1796     enableForeachInsideVarying = false;
1797     emitGenXHardwareMask = false;
1798     enableGenXUnsafeMaskedLoad = false;
1799 #endif
1800 }
1801 
1802 ///////////////////////////////////////////////////////////////////////////
1803 // Globals
1804 
Globals()1805 Globals::Globals() {
1806     target_registry = TargetLibRegistry::getTargetLibRegistry();
1807 
1808     mathLib = Globals::Math_ISPC;
1809     codegenOptLevel = Globals::Aggressive;
1810 
1811     includeStdlib = true;
1812     runCPP = true;
1813     debugPrint = false;
1814     dumpFile = false;
1815     printTarget = false;
1816     NoOmitFramePointer = false;
1817     debugIR = -1;
1818     disableWarnings = false;
1819     warningsAsErrors = false;
1820     quiet = false;
1821     forceColoredOutput = false;
1822     disableLineWrap = false;
1823     emitPerfWarnings = true;
1824     emitInstrumentation = false;
1825     noPragmaOnce = false;
1826     generateDebuggingSymbols = false;
1827     generateDWARFVersion = 3;
1828     enableFuzzTest = false;
1829     enableLLVMIntrinsics = false;
1830     fuzzTestSeed = -1;
1831     mangleFunctionsWithTarget = false;
1832     isMultiTargetCompilation = false;
1833     errorLimit = -1;
1834 
1835     enableTimeTrace = false;
1836     // set default granularity to 500.
1837     timeTraceGranularity = 500;
1838     target = NULL;
1839     ctx = new llvm::LLVMContext;
1840 
1841 #ifdef ISPC_HOST_IS_WINDOWS
1842     _getcwd(currentDirectory, sizeof(currentDirectory));
1843 #else
1844     if (getcwd(currentDirectory, sizeof(currentDirectory)) == NULL)
1845         FATAL("Current directory path is too long!");
1846 #endif
1847     forceAlignment = -1;
1848     dllExport = false;
1849 
1850     // Target OS defaults to host OS.
1851     target_os = GetHostOS();
1852 
1853     // Set calling convention to 'uninitialized'.
1854     // This needs to be set once target OS is decided.
1855     calling_conv = CallingConv::uninitialized;
1856 }
1857 
1858 ///////////////////////////////////////////////////////////////////////////
1859 // SourcePos
1860 
SourcePos(const char * n,int fl,int fc,int ll,int lc)1861 SourcePos::SourcePos(const char *n, int fl, int fc, int ll, int lc) {
1862     name = n;
1863     if (name == NULL) {
1864         if (m != NULL)
1865             name = m->module->getModuleIdentifier().c_str();
1866         else
1867             name = "(unknown)";
1868     }
1869     first_line = fl;
1870     first_column = fc;
1871     last_line = ll != 0 ? ll : fl;
1872     last_column = lc != 0 ? lc : fc;
1873 }
1874 
1875 llvm::DIFile *
1876 // llvm::MDFile*
GetDIFile() const1877 SourcePos::GetDIFile() const {
1878     std::string directory, filename;
1879     GetDirectoryAndFileName(g->currentDirectory, name, &directory, &filename);
1880     llvm::DIFile *ret = m->diBuilder->createFile(filename, directory);
1881     return ret;
1882 }
1883 
GetDINamespace() const1884 llvm::DINamespace *SourcePos::GetDINamespace() const {
1885     llvm::DIScope *discope = GetDIFile();
1886     llvm::DINamespace *ret = m->diBuilder->createNameSpace(discope, "ispc", true);
1887     return ret;
1888 }
1889 
Print() const1890 void SourcePos::Print() const {
1891     printf(" @ [%s:%d.%d - %d.%d] ", name, first_line, first_column, last_line, last_column);
1892 }
1893 
operator ==(const SourcePos & p2) const1894 bool SourcePos::operator==(const SourcePos &p2) const {
1895     return (!strcmp(name, p2.name) && first_line == p2.first_line && first_column == p2.first_column &&
1896             last_line == p2.last_line && last_column == p2.last_column);
1897 }
1898 
// Merges two source positions. If they refer to different file names, p1 is
// returned unchanged. Otherwise the result keeps p1's name and takes the
// minimum of the first line/column values and the maximum of the last
// line/column values; note that lines and columns are combined
// independently of each other.
SourcePos ispc::Union(const SourcePos &p1, const SourcePos &p2) {
    const bool sameFile = (strcmp(p1.name, p2.name) == 0);
    if (!sameFile)
        return p1;

    SourcePos merged;
    merged.name = p1.name;

    merged.first_line = std::min(p1.first_line, p2.first_line);
    merged.last_line = std::max(p1.last_line, p2.last_line);

    merged.first_column = std::min(p1.first_column, p2.first_column);
    merged.last_column = std::max(p1.last_column, p2.last_column);

    return merged;
}
1911