1 // AsmJit - Machine code generation for C++
2 //
3 //  * Official AsmJit Home Page: https://asmjit.com
4 //  * Official Github Repository: https://github.com/asmjit/asmjit
5 //
6 // Copyright (c) 2008-2020 The AsmJit Authors
7 //
8 // This software is provided 'as-is', without any express or implied
9 // warranty. In no event will the authors be held liable for any damages
10 // arising from the use of this software.
11 //
12 // Permission is granted to anyone to use this software for any purpose,
13 // including commercial applications, and to alter it and redistribute it
14 // freely, subject to the following restrictions:
15 //
16 // 1. The origin of this software must not be misrepresented; you must not
17 //    claim that you wrote the original software. If you use this software
18 //    in a product, an acknowledgment in the product documentation would be
19 //    appreciated but is not required.
20 // 2. Altered source versions must be plainly marked as such, and must not be
21 //    misrepresented as being the original software.
22 // 3. This notice may not be removed or altered from any source distribution.
23 
24 #include "../core/api-build_p.h"
25 #if defined(ASMJIT_BUILD_X86) && ASMJIT_ARCH_X86
26 
27 #include "../core/cpuinfo.h"
28 #include "../core/support.h"
29 #include "../x86/x86features.h"
30 
31 // Required by `__cpuidex()` and `_xgetbv()`.
32 #if defined(_MSC_VER)
33   #include <intrin.h>
34 #endif
35 
36 ASMJIT_BEGIN_SUB_NAMESPACE(x86)
37 
38 // ============================================================================
39 // [asmjit::x86::Features - Detect]
40 // ============================================================================
41 
// Register values produced by a single `cpuid` query.
//
// The field order matters: `cpuidQuery()` hands this struct to `__cpuidex()`
// as an array of four ints, so the members must stay EAX, EBX, ECX, EDX.
struct cpuid_t {
  uint32_t eax;
  uint32_t ebx;
  uint32_t ecx;
  uint32_t edx;
};
// Register values produced by a single `xgetbv` query (low half in EAX,
// high half in EDX).
struct xgetbv_t {
  uint32_t eax;
  uint32_t edx;
};
44 
45 // Executes `cpuid` instruction.
cpuidQuery(cpuid_t * out,uint32_t inEax,uint32_t inEcx=0)46 static inline void cpuidQuery(cpuid_t* out, uint32_t inEax, uint32_t inEcx = 0) noexcept {
47 #if defined(_MSC_VER)
48   __cpuidex(reinterpret_cast<int*>(out), inEax, inEcx);
49 #elif defined(__GNUC__) && ASMJIT_ARCH_X86 == 32
50   __asm__ __volatile__(
51     "mov %%ebx, %%edi\n"
52     "cpuid\n"
53     "xchg %%edi, %%ebx\n" : "=a"(out->eax), "=D"(out->ebx), "=c"(out->ecx), "=d"(out->edx) : "a"(inEax), "c"(inEcx));
54 #elif defined(__GNUC__) && ASMJIT_ARCH_X86 == 64
55   __asm__ __volatile__(
56     "mov %%rbx, %%rdi\n"
57     "cpuid\n"
58     "xchg %%rdi, %%rbx\n" : "=a"(out->eax), "=D"(out->ebx), "=c"(out->ecx), "=d"(out->edx) : "a"(inEax), "c"(inEcx));
59 #else
60   #error "[asmjit] x86::cpuidQuery() - Unsupported compiler."
61 #endif
62 }
63 
64 // Executes 'xgetbv' instruction.
xgetbvQuery(xgetbv_t * out,uint32_t inEcx)65 static inline void xgetbvQuery(xgetbv_t* out, uint32_t inEcx) noexcept {
66 #if defined(_MSC_VER)
67   uint64_t value = _xgetbv(inEcx);
68   out->eax = uint32_t(value & 0xFFFFFFFFu);
69   out->edx = uint32_t(value >> 32);
70 #elif defined(__GNUC__)
71   uint32_t outEax;
72   uint32_t outEdx;
73 
74   // Replaced, because the world is not perfect:
75   //   __asm__ __volatile__("xgetbv" : "=a"(outEax), "=d"(outEdx) : "c"(inEcx));
76   __asm__ __volatile__(".byte 0x0F, 0x01, 0xD0" : "=a"(outEax), "=d"(outEdx) : "c"(inEcx));
77 
78   out->eax = outEax;
79   out->edx = outEdx;
80 #else
81   out->eax = 0;
82   out->edx = 0;
83 #endif
84 }
85 
86 // Map a 12-byte vendor string returned by `cpuid` into a `CpuInfo::Vendor` ID.
simplifyCpuVendor(CpuInfo & cpu,uint32_t d0,uint32_t d1,uint32_t d2)87 static inline void simplifyCpuVendor(CpuInfo& cpu, uint32_t d0, uint32_t d1, uint32_t d2) noexcept {
88   struct Vendor {
89     char normalized[8];
90     union { char text[12]; uint32_t d[3]; };
91   };
92 
93   static const Vendor table[] = {
94     { { 'A', 'M', 'D'                     }, {{ 'A', 'u', 't', 'h', 'e', 'n', 't', 'i', 'c', 'A', 'M', 'D' }} },
95     { { 'I', 'N', 'T', 'E', 'L'           }, {{ 'G', 'e', 'n', 'u', 'i', 'n', 'e', 'I', 'n', 't', 'e', 'l' }} },
96     { { 'V', 'I', 'A'                     }, {{ 'C', 'e', 'n', 't', 'a', 'u', 'r', 'H', 'a', 'u', 'l', 's' }} },
97     { { 'V', 'I', 'A'                     }, {{ 'V', 'I', 'A',  0 , 'V', 'I', 'A',  0 , 'V', 'I', 'A',  0  }} },
98     { { 'U', 'N', 'K', 'N', 'O', 'W', 'N' }, {{ 0                                                          }} }
99   };
100 
101   uint32_t i;
102   for (i = 0; i < ASMJIT_ARRAY_SIZE(table) - 1; i++)
103     if (table[i].d[0] == d0 && table[i].d[1] == d1 && table[i].d[2] == d2)
104       break;
105   memcpy(cpu._vendor.str, table[i].normalized, 8);
106 }
107 
// Simplifies a null-terminated CPU brand string in place by removing
// unnecessary spaces:
//
//   - leading and trailing spaces are dropped,
//   - runs of spaces collapse into a single space,
//   - spaces directly before or after '@' are dropped.
//
// Every byte of the original string that is not part of the result is
// overwritten with '\0', so nothing stale remains after the new terminator.
static inline void simplifyCpuBrand(char* s) noexcept {
  char* const begin = s;
  char* d = s;

  char c = s[0];
  char prev = 0;

  // Always clear the character that was just read to ensure that the result
  // doesn't contain garbage after a new null terminator is placed at the end.
  s[0] = '\0';

  while (c) {
    // `c` is not the terminator, so `s[1]` is still inside the string (at worst
    // it is the terminator). The write cursor `d` never passes `s`, so this
    // lookahead byte has not been overwritten yet.
    const char next = s[1];

    const bool redundantSpace =
      c == ' ' && (d == begin    ||  // Leading space.
                   prev == '@'   ||  // Space right after '@'.
                   next == ' '   ||  // Not the last space of a run.
                   next == '@'   ||  // Space right before '@'.
                   next == '\0');    // Trailing space.

    if (!redundantSpace) {
      *d++ = c;
      prev = c;
    }

    c = *++s;
    s[0] = '\0';
  }

  d[0] = '\0';
}
133 
detectCpu(CpuInfo & cpu)134 ASMJIT_FAVOR_SIZE void detectCpu(CpuInfo& cpu) noexcept {
135   using Support::bitTest;
136 
137   cpuid_t regs;
138   xgetbv_t xcr0 { 0, 0 };
139   Features& features = cpu._features.as<Features>();
140 
141   cpu.reset();
142   cpu._arch = Environment::kArchHost;
143   cpu._subArch = Environment::kSubArchUnknown;
144   cpu._reserved = 0;
145   cpu._maxLogicalProcessors = 1;
146   features.add(Features::kI486);
147 
148   // --------------------------------------------------------------------------
149   // [CPUID EAX=0x0]
150   // --------------------------------------------------------------------------
151 
152   // Get vendor string/id.
153   cpuidQuery(&regs, 0x0);
154 
155   uint32_t maxId = regs.eax;
156   simplifyCpuVendor(cpu, regs.ebx, regs.edx, regs.ecx);
157 
158   // --------------------------------------------------------------------------
159   // [CPUID EAX=0x1]
160   // --------------------------------------------------------------------------
161 
162   if (maxId >= 0x1) {
163     // Get feature flags in ECX/EDX and family/model in EAX.
164     cpuidQuery(&regs, 0x1);
165 
166     // Fill family and model fields.
167     uint32_t modelId  = (regs.eax >> 4) & 0x0F;
168     uint32_t familyId = (regs.eax >> 8) & 0x0F;
169 
170     // Use extended family and model fields.
171     if (familyId == 0x06u || familyId == 0x0Fu)
172       modelId += (((regs.eax >> 16) & 0x0Fu) << 4);
173 
174     if (familyId == 0x0Fu)
175       familyId += (((regs.eax >> 20) & 0xFFu) << 4);
176 
177     cpu._modelId              = modelId;
178     cpu._familyId             = familyId;
179     cpu._brandId              = ((regs.ebx      ) & 0xFF);
180     cpu._processorType        = ((regs.eax >> 12) & 0x03);
181     cpu._maxLogicalProcessors = ((regs.ebx >> 16) & 0xFF);
182     cpu._stepping             = ((regs.eax      ) & 0x0F);
183     cpu._cacheLineSize        = ((regs.ebx >>  8) & 0xFF) * 8;
184 
185     if (bitTest(regs.ecx,  0)) features.add(Features::kSSE3);
186     if (bitTest(regs.ecx,  1)) features.add(Features::kPCLMULQDQ);
187     if (bitTest(regs.ecx,  3)) features.add(Features::kMONITOR);
188     if (bitTest(regs.ecx,  5)) features.add(Features::kVMX);
189     if (bitTest(regs.ecx,  6)) features.add(Features::kSMX);
190     if (bitTest(regs.ecx,  9)) features.add(Features::kSSSE3);
191     if (bitTest(regs.ecx, 13)) features.add(Features::kCMPXCHG16B);
192     if (bitTest(regs.ecx, 19)) features.add(Features::kSSE4_1);
193     if (bitTest(regs.ecx, 20)) features.add(Features::kSSE4_2);
194     if (bitTest(regs.ecx, 22)) features.add(Features::kMOVBE);
195     if (bitTest(regs.ecx, 23)) features.add(Features::kPOPCNT);
196     if (bitTest(regs.ecx, 25)) features.add(Features::kAESNI);
197     if (bitTest(regs.ecx, 26)) features.add(Features::kXSAVE);
198     if (bitTest(regs.ecx, 27)) features.add(Features::kOSXSAVE);
199     if (bitTest(regs.ecx, 30)) features.add(Features::kRDRAND);
200     if (bitTest(regs.edx,  0)) features.add(Features::kFPU);
201     if (bitTest(regs.edx,  4)) features.add(Features::kRDTSC);
202     if (bitTest(regs.edx,  5)) features.add(Features::kMSR);
203     if (bitTest(regs.edx,  8)) features.add(Features::kCMPXCHG8B);
204     if (bitTest(regs.edx, 15)) features.add(Features::kCMOV);
205     if (bitTest(regs.edx, 19)) features.add(Features::kCLFLUSH);
206     if (bitTest(regs.edx, 23)) features.add(Features::kMMX);
207     if (bitTest(regs.edx, 24)) features.add(Features::kFXSR);
208     if (bitTest(regs.edx, 25)) features.add(Features::kSSE, Features::kMMX2);
209     if (bitTest(regs.edx, 26)) features.add(Features::kSSE, Features::kSSE2);
210     if (bitTest(regs.edx, 28)) features.add(Features::kMT);
211 
212     // Get the content of XCR0 if supported by CPU and enabled by OS.
213     if ((regs.ecx & 0x0C000000u) == 0x0C000000u) {
214       xgetbvQuery(&xcr0, 0);
215     }
216 
217     // Detect AVX+.
218     if (bitTest(regs.ecx, 28)) {
219       // - XCR0[2:1] == 11b
220       //   XMM & YMM states need to be enabled by OS.
221       if ((xcr0.eax & 0x00000006u) == 0x00000006u) {
222         features.add(Features::kAVX);
223 
224         if (bitTest(regs.ecx, 12)) features.add(Features::kFMA);
225         if (bitTest(regs.ecx, 29)) features.add(Features::kF16C);
226       }
227     }
228   }
229 
230   // --------------------------------------------------------------------------
231   // [CPUID EAX=0x7]
232   // --------------------------------------------------------------------------
233 
234   // Detect new features if the processor supports CPUID-07.
235   bool maybeMPX = false;
236 
237   if (maxId >= 0x7) {
238     cpuidQuery(&regs, 0x7);
239     uint32_t maxSubLeafId = regs.eax;
240 
241     if (bitTest(regs.ebx,  0)) features.add(Features::kFSGSBASE);
242     if (bitTest(regs.ebx,  3)) features.add(Features::kBMI);
243     if (bitTest(regs.ebx,  4)) features.add(Features::kHLE);
244     if (bitTest(regs.ebx,  7)) features.add(Features::kSMEP);
245     if (bitTest(regs.ebx,  8)) features.add(Features::kBMI2);
246     if (bitTest(regs.ebx,  9)) features.add(Features::kERMS);
247     if (bitTest(regs.ebx, 11)) features.add(Features::kRTM);
248     if (bitTest(regs.ebx, 14)) maybeMPX = true;
249     if (bitTest(regs.ebx, 18)) features.add(Features::kRDSEED);
250     if (bitTest(regs.ebx, 19)) features.add(Features::kADX);
251     if (bitTest(regs.ebx, 20)) features.add(Features::kSMAP);
252     if (bitTest(regs.ebx, 22)) features.add(Features::kPCOMMIT);
253     if (bitTest(regs.ebx, 23)) features.add(Features::kCLFLUSHOPT);
254     if (bitTest(regs.ebx, 24)) features.add(Features::kCLWB);
255     if (bitTest(regs.ebx, 29)) features.add(Features::kSHA);
256     if (bitTest(regs.ecx,  0)) features.add(Features::kPREFETCHWT1);
257     if (bitTest(regs.ecx, 22)) features.add(Features::kRDPID);
258     if (bitTest(regs.ecx, 25)) features.add(Features::kCLDEMOTE);
259     if (bitTest(regs.ecx, 27)) features.add(Features::kMOVDIRI);
260     if (bitTest(regs.ecx, 28)) features.add(Features::kMOVDIR64B);
261     if (bitTest(regs.ecx, 29)) features.add(Features::kENQCMD);
262     if (bitTest(regs.edx, 18)) features.add(Features::kPCONFIG);
263 
264     // Detect 'TSX' - Requires at least one of `HLE` and `RTM` features.
265     if (features.hasHLE() || features.hasRTM())
266       features.add(Features::kTSX);
267 
268     // Detect 'AVX2' - Requires AVX as well.
269     if (bitTest(regs.ebx, 5) && features.hasAVX())
270       features.add(Features::kAVX2);
271 
272     // Detect 'AVX_512'.
273     if (bitTest(regs.ebx, 16)) {
274       // - XCR0[2:1] ==  11b - XMM/YMM states need to be enabled by OS.
275       // - XCR0[7:5] == 111b - Upper 256-bit of ZMM0-XMM15 and ZMM16-ZMM31 need to be enabled by OS.
276       if ((xcr0.eax & 0x000000E6u) == 0x000000E6u) {
277         features.add(Features::kAVX512_F);
278 
279         if (bitTest(regs.ebx, 17)) features.add(Features::kAVX512_DQ);
280         if (bitTest(regs.ebx, 21)) features.add(Features::kAVX512_IFMA);
281         if (bitTest(regs.ebx, 26)) features.add(Features::kAVX512_PFI);
282         if (bitTest(regs.ebx, 27)) features.add(Features::kAVX512_ERI);
283         if (bitTest(regs.ebx, 28)) features.add(Features::kAVX512_CDI);
284         if (bitTest(regs.ebx, 30)) features.add(Features::kAVX512_BW);
285         if (bitTest(regs.ebx, 31)) features.add(Features::kAVX512_VL);
286         if (bitTest(regs.ecx,  1)) features.add(Features::kAVX512_VBMI);
287         if (bitTest(regs.ecx,  5)) features.add(Features::kWAITPKG);
288         if (bitTest(regs.ecx,  6)) features.add(Features::kAVX512_VBMI2);
289         if (bitTest(regs.ecx,  8)) features.add(Features::kGFNI);
290         if (bitTest(regs.ecx,  9)) features.add(Features::kVAES);
291         if (bitTest(regs.ecx, 10)) features.add(Features::kVPCLMULQDQ);
292         if (bitTest(regs.ecx, 11)) features.add(Features::kAVX512_VNNI);
293         if (bitTest(regs.ecx, 12)) features.add(Features::kAVX512_BITALG);
294         if (bitTest(regs.ecx, 14)) features.add(Features::kAVX512_VPOPCNTDQ);
295         if (bitTest(regs.edx,  2)) features.add(Features::kAVX512_4VNNIW);
296         if (bitTest(regs.edx,  3)) features.add(Features::kAVX512_4FMAPS);
297         if (bitTest(regs.edx,  8)) features.add(Features::kAVX512_VP2INTERSECT);
298       }
299     }
300 
301     if (maxSubLeafId >= 1 && features.hasAVX512_F()) {
302       cpuidQuery(&regs, 0x7, 1);
303 
304       if (bitTest(regs.eax, 5)) features.add(Features::kAVX512_BF16);
305     }
306   }
307 
308   // --------------------------------------------------------------------------
309   // [CPUID EAX=0xD]
310   // --------------------------------------------------------------------------
311 
312   if (maxId >= 0xD) {
313     cpuidQuery(&regs, 0xD, 0);
314 
315     // Both CPUID result and XCR0 has to be enabled to have support for MPX.
316     if (((regs.eax & xcr0.eax) & 0x00000018u) == 0x00000018u && maybeMPX)
317       features.add(Features::kMPX);
318 
319     cpuidQuery(&regs, 0xD, 1);
320     if (bitTest(regs.eax, 0)) features.add(Features::kXSAVEOPT);
321     if (bitTest(regs.eax, 1)) features.add(Features::kXSAVEC);
322     if (bitTest(regs.eax, 3)) features.add(Features::kXSAVES);
323   }
324 
325   // --------------------------------------------------------------------------
326   // [CPUID EAX=0x80000000...maxId]
327   // --------------------------------------------------------------------------
328 
329   maxId = 0x80000000u;
330   uint32_t i = maxId;
331 
332   // The highest EAX that we understand.
333   uint32_t kHighestProcessedEAX = 0x80000008u;
334 
335   // Several CPUID calls are required to get the whole branc string. It's easy
336   // to copy one DWORD at a time instead of performing a byte copy.
337   uint32_t* brand = cpu._brand.u32;
338   do {
339     cpuidQuery(&regs, i);
340     switch (i) {
341       case 0x80000000u:
342         maxId = Support::min<uint32_t>(regs.eax, kHighestProcessedEAX);
343         break;
344 
345       case 0x80000001u:
346         if (bitTest(regs.ecx,  0)) features.add(Features::kLAHFSAHF);
347         if (bitTest(regs.ecx,  2)) features.add(Features::kSVM);
348         if (bitTest(regs.ecx,  5)) features.add(Features::kLZCNT);
349         if (bitTest(regs.ecx,  6)) features.add(Features::kSSE4A);
350         if (bitTest(regs.ecx,  7)) features.add(Features::kMSSE);
351         if (bitTest(regs.ecx,  8)) features.add(Features::kPREFETCHW);
352         if (bitTest(regs.ecx, 12)) features.add(Features::kSKINIT);
353         if (bitTest(regs.ecx, 15)) features.add(Features::kLWP);
354         if (bitTest(regs.ecx, 21)) features.add(Features::kTBM);
355         if (bitTest(regs.ecx, 29)) features.add(Features::kMONITORX);
356         if (bitTest(regs.edx, 20)) features.add(Features::kNX);
357         if (bitTest(regs.edx, 21)) features.add(Features::kFXSROPT);
358         if (bitTest(regs.edx, 22)) features.add(Features::kMMX2);
359         if (bitTest(regs.edx, 27)) features.add(Features::kRDTSCP);
360         if (bitTest(regs.edx, 30)) features.add(Features::k3DNOW2, Features::kMMX2);
361         if (bitTest(regs.edx, 31)) features.add(Features::k3DNOW);
362 
363         if (cpu.hasFeature(Features::kAVX)) {
364           if (bitTest(regs.ecx, 11)) features.add(Features::kXOP);
365           if (bitTest(regs.ecx, 16)) features.add(Features::kFMA4);
366         }
367 
368         // These seem to be only supported by AMD.
369         if (cpu.isVendor("AMD")) {
370           if (bitTest(regs.ecx,  4)) features.add(Features::kALTMOVCR8);
371         }
372         break;
373 
374       case 0x80000002u:
375       case 0x80000003u:
376       case 0x80000004u:
377         *brand++ = regs.eax;
378         *brand++ = regs.ebx;
379         *brand++ = regs.ecx;
380         *brand++ = regs.edx;
381 
382         // Go directly to the last one.
383         if (i == 0x80000004u) i = 0x80000008u - 1;
384         break;
385 
386       case 0x80000008u:
387         if (bitTest(regs.ebx,  0)) features.add(Features::kCLZERO);
388         break;
389     }
390   } while (++i <= maxId);
391 
392   // Simplify CPU brand string a bit by removing some unnecessary spaces.
393   simplifyCpuBrand(cpu._brand.str);
394 }
395 
396 ASMJIT_END_SUB_NAMESPACE
397 
398 #endif // ASMJIT_BUILD_X86 && ASMJIT_ARCH_X86
399