1// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file.
2
3// Package cpuid provides information about the CPU running the current program.
4//
5// CPU features are detected on startup, and kept for fast access through the life of the application.
6// Currently x86 / x64 (AMD64) as well as arm64 is supported.
7//
8// You can access the CPU information by accessing the shared CPU variable of the cpuid library.
9//
10// Package home: https://github.com/klauspost/cpuid
11package cpuid
12
13import (
14	"flag"
15	"fmt"
16	"math"
17	"os"
18	"runtime"
19	"strings"
20)
21
// AMD reference: https://www.amd.com/system/files/TechDocs/25481.pdf
// and Processor Programming Reference (PPR)
24
// Vendor is a representation of a CPU vendor.
//
// The values below are assigned sequentially by iota, so inserting or
// reordering entries changes the numeric value of every vendor that follows.
type Vendor int

const (
	// VendorUnknown is the zero value and is what vendorID returns when the
	// vendor string is not present in vendorMapping.
	VendorUnknown Vendor = iota
	Intel
	AMD
	VIA
	Transmeta
	NSC
	KVM  // Kernel-based Virtual Machine
	MSVM // Microsoft Hyper-V or Windows Virtual PC
	VMware
	XenHVM
	Bhyve
	Hygon
	SiS
	RDC

	// NOTE(review): none of the vendors below appear in vendorMapping in this
	// file; presumably they are assigned by non-x86 detection code - confirm.
	Ampere
	ARM
	Broadcom
	Cavium
	DEC
	Fujitsu
	Infineon
	Motorola
	NVIDIA
	AMCC
	Qualcomm
	Marvell

	// lastVendor is one past the final named vendor. Keep it last.
	lastVendor
)
59
60//go:generate stringer -type=FeatureID,Vendor
61
// FeatureID is the ID of a specific cpu feature.
//
// A FeatureID is used as a bit index into flagSet, so the numeric values
// must stay dense and below lastID.
type FeatureID int

const (
	// Keep index -1 as unknown
	UNKNOWN = -1

	// Add features
	//
	// UNKNOWN occupies the first slot of this const block, so iota is 1 on
	// the ADX line: ADX == 1 and every following feature counts up from there.
	ADX                FeatureID = iota // Intel ADX (Multi-Precision Add-Carry Instruction Extensions)
	AESNI                               // Advanced Encryption Standard New Instructions
	AMD3DNOW                            // AMD 3DNOW
	AMD3DNOWEXT                         // AMD 3DNowExt
	AMXBF16                             // Tile computational operations on BFLOAT16 numbers
	AMXINT8                             // Tile computational operations on 8-bit integers
	AMXTILE                             // Tile architecture
	AVX                                 // AVX functions
	AVX2                                // AVX2 functions
	AVX512BF16                          // AVX-512 BFLOAT16 Instructions
	AVX512BITALG                        // AVX-512 Bit Algorithms
	AVX512BW                            // AVX-512 Byte and Word Instructions
	AVX512CD                            // AVX-512 Conflict Detection Instructions
	AVX512DQ                            // AVX-512 Doubleword and Quadword Instructions
	AVX512ER                            // AVX-512 Exponential and Reciprocal Instructions
	AVX512F                             // AVX-512 Foundation
	AVX512IFMA                          // AVX-512 Integer Fused Multiply-Add Instructions
	AVX512PF                            // AVX-512 Prefetch Instructions
	AVX512VBMI                          // AVX-512 Vector Bit Manipulation Instructions
	AVX512VBMI2                         // AVX-512 Vector Bit Manipulation Instructions, Version 2
	AVX512VL                            // AVX-512 Vector Length Extensions
	AVX512VNNI                          // AVX-512 Vector Neural Network Instructions
	AVX512VP2INTERSECT                  // AVX-512 Intersect for D/Q
	AVX512VPOPCNTDQ                     // AVX-512 Vector Population Count Doubleword and Quadword
	AVXSLOW                             // Indicates the CPU performs 2 128 bit operations instead of one.
	BMI1                                // Bit Manipulation Instruction Set 1
	BMI2                                // Bit Manipulation Instruction Set 2
	CLDEMOTE                            // Cache Line Demote
	CLMUL                               // Carry-less Multiplication
	CMOV                                // i686 CMOV
	CX16                                // CMPXCHG16B Instruction
	ENQCMD                              // Enqueue Command
	ERMS                                // Enhanced REP MOVSB/STOSB
	F16C                                // Half-precision floating-point conversion
	FMA3                                // Intel FMA 3. Does not imply AVX.
	FMA4                                // Bulldozer FMA4 functions
	GFNI                                // Galois Field New Instructions
	HLE                                 // Hardware Lock Elision
	HTT                                 // Hyperthreading (enabled)
	HYPERVISOR                          // This bit has been reserved by Intel & AMD for use by hypervisors
	IBPB                                // Indirect Branch Restricted Speculation (IBRS) and Indirect Branch Predictor Barrier (IBPB)
	IBS                                 // Instruction Based Sampling (AMD)
	IBSBRNTRGT                          // Instruction Based Sampling Feature (AMD)
	IBSFETCHSAM                         // Instruction Based Sampling Feature (AMD)
	IBSFFV                              // Instruction Based Sampling Feature (AMD)
	IBSOPCNT                            // Instruction Based Sampling Feature (AMD)
	IBSOPCNTEXT                         // Instruction Based Sampling Feature (AMD)
	IBSOPSAM                            // Instruction Based Sampling Feature (AMD)
	IBSRDWROPCNT                        // Instruction Based Sampling Feature (AMD)
	IBSRIPINVALIDCHK                    // Instruction Based Sampling Feature (AMD)
	LZCNT                               // LZCNT instruction
	MMX                                 // standard MMX
	MMXEXT                              // SSE integer functions or AMD MMX ext
	MOVDIR64B                           // Move 64 Bytes as Direct Store
	MOVDIRI                             // Move Doubleword as Direct Store
	MPX                                 // Intel MPX (Memory Protection Extensions)
	NX                                  // NX (No-Execute) bit
	POPCNT                              // POPCNT instruction
	RDRAND                              // RDRAND instruction is available
	RDSEED                              // RDSEED instruction is available
	RDTSCP                              // RDTSCP Instruction
	RTM                                 // Restricted Transactional Memory
	SERIALIZE                           // Serialize Instruction Execution
	SGX                                 // Software Guard Extensions
	SGXLC                               // Software Guard Extensions Launch Control
	SHA                                 // Intel SHA Extensions
	SSE                                 // SSE functions
	SSE2                                // P4 SSE functions
	SSE3                                // Prescott SSE3 functions
	SSE4                                // Penryn SSE4.1 functions
	SSE42                               // Nehalem SSE4.2 functions
	SSE4A                               // AMD Barcelona microarchitecture SSE4a instructions
	SSSE3                               // Conroe SSSE3 functions
	STIBP                               // Single Thread Indirect Branch Predictors
	TBM                                 // AMD Trailing Bit Manipulation
	TSXLDTRK                            // Intel TSX Suspend Load Address Tracking
	VAES                                // Vector AES
	VMX                                 // Virtual Machine Extensions
	VPCLMULQDQ                          // Carry-Less Multiplication Quadword
	WAITPKG                             // TPAUSE, UMONITOR, UMWAIT
	WBNOINVD                            // Write Back and Do Not Invalidate Cache
	XOP                                 // Bulldozer XOP functions

	// ARM features:
	AESARM   // AES instructions
	ARMCPUID // Some CPU ID registers readable at user-level
	ASIMD    // Advanced SIMD
	ASIMDDP  // SIMD Dot Product
	ASIMDHP  // Advanced SIMD half-precision floating point
	ASIMDRDM // Rounding Double Multiply Accumulate/Subtract (SQRDMLAH/SQRDMLSH)
	ATOMICS  // Large System Extensions (LSE)
	CRC32    // CRC32/CRC32C instructions
	DCPOP    // Data cache clean to Point of Persistence (DC CVAP)
	EVTSTRM  // Generic timer
	FCMA     // Floating point complex number addition and multiplication
	FP       // Single-precision and double-precision floating point
	FPHP     // Half-precision floating point
	GPA      // Generic Pointer Authentication
	JSCVT    // Javascript-style double->int convert (FJCVTZS)
	LRCPC    // Weaker release consistency (LDAPR, etc)
	PMULL    // Polynomial Multiply instructions (PMULL/PMULL2)
	SHA1     // SHA-1 instructions (SHA1C, etc)
	SHA2     // SHA-2 instructions (SHA256H, etc)
	SHA3     // SHA-3 instructions (EOR3, RAXI, XAR, BCAX)
	SHA512   // SHA512 instructions
	SM3      // SM3 instructions
	SM4      // SM4 instructions
	SVE      // Scalable Vector Extension

	// Keep it last. It automatically defines the size of []flagSet
	lastID

	// firstID is the lowest ID scanned by ParseFeature and flagSet.Strings.
	// It evaluates to 0 (UNKNOWN + 1), which is one below ADX.
	firstID FeatureID = UNKNOWN + 1
)
184
// CPUInfo contains information about the detected system CPU.
//
// The detected feature bits are kept in the unexported featureSet field and
// are queried through Supports, Has and FeatureSet.
type CPUInfo struct {
	BrandName      string  // Brand name reported by the CPU
	VendorID       Vendor  // Comparable CPU vendor ID
	VendorString   string  // Raw vendor string.
	featureSet     flagSet // Features of the CPU
	PhysicalCores  int     // Number of physical processor cores in your CPU. Will be 0 if undetectable.
	ThreadsPerCore int     // Number of threads per physical core. Will be 1 if undetectable.
	LogicalCores   int     // Number of physical cores times threads that can run on each core through the use of hyperthreading. Will be 0 if undetectable.
	Family         int     // CPU family number
	Model          int     // CPU model number
	CacheLine      int     // Cache line size in bytes. Will be 0 if undetectable.
	Hz             int64   // Clock speed, if known, 0 otherwise
	// Cache holds the detected cache sizes. Detect and cacheSize reset every
	// entry to -1 before detection runs.
	Cache struct {
		L1I int // L1 Instruction Cache (per core or shared). Will be -1 if undetected
		L1D int // L1 Data Cache (per core or shared). Will be -1 if undetected
		L2  int // L2 Cache (per core or shared). Will be -1 if undetected
		L3  int // L3 Cache (per core, per ccx or shared). Will be -1 if undetected
	}
	SGX SGXSupport // SGX capability details; see SGXSupport.
	// maxFunc is compared against 1 by LogicalCPU before issuing CPUID leaf 1.
	// NOTE(review): presumably the highest standard CPUID leaf; it is assigned
	// outside this file section - confirm.
	maxFunc uint32
	// NOTE(review): presumably the highest extended CPUID leaf; it is not
	// read or written in this file section - confirm.
	maxExFunc uint32
}
208
// Low-level instruction hooks. They are declared as function variables and
// are nil in this file; NOTE(review): presumably initCPU installs the
// platform specific implementations before Detect runs - confirm.
var (
	// cpuid executes CPUID for leaf op.
	cpuid func(op uint32) (eax, ebx, ecx, edx uint32)
	// cpuidex executes CPUID for leaf op and sub-leaf op2.
	cpuidex func(op, op2 uint32) (eax, ebx, ecx, edx uint32)
	// xgetbv reads the extended control register selected by index.
	xgetbv func(index uint32) (eax, edx uint32)
	// rdtscpAsm executes RDTSCP and returns the raw register values.
	rdtscpAsm func() (eax, ebx, ecx, edx uint32)
	// darwinHasAVX512 reports false unless it is replaced elsewhere.
	darwinHasAVX512 = func() bool { return false }
)
214
// CPU contains information about the CPU as detected on startup,
// or when Detect last was called.
//
// Use this as the primary entry point to your data.
var CPU CPUInfo
220
// init runs the initial detection when the package is loaded, so the exported
// CPU variable is populated before any user code runs.
func init() {
	// NOTE(review): initCPU is defined outside this file section; presumably
	// it installs the cpuid/xgetbv hooks that Detect relies on - confirm.
	initCPU()
	Detect()
}
225
226// Detect will re-detect current CPU info.
227// This will replace the content of the exported CPU variable.
228//
229// Unless you expect the CPU to change while you are running your program
230// you should not need to call this function.
231// If you call this, you must ensure that no other goroutine is accessing the
232// exported CPU variable.
233func Detect() {
234	// Set defaults
235	CPU.ThreadsPerCore = 1
236	CPU.Cache.L1I = -1
237	CPU.Cache.L1D = -1
238	CPU.Cache.L2 = -1
239	CPU.Cache.L3 = -1
240	safe := true
241	if detectArmFlag != nil {
242		safe = !*detectArmFlag
243	}
244	addInfo(&CPU, safe)
245	if displayFeats != nil && *displayFeats {
246		fmt.Println("cpu features:", strings.Join(CPU.FeatureSet(), ","))
247		// Exit with non-zero so tests will print value.
248		os.Exit(1)
249	}
250	if disableFlag != nil {
251		s := strings.Split(*disableFlag, ",")
252		for _, feat := range s {
253			feat := ParseFeature(strings.TrimSpace(feat))
254			if feat != UNKNOWN {
255				CPU.featureSet.unset(feat)
256			}
257		}
258	}
259}
260
// DetectARM will detect ARM64 features.
// This is NOT done automatically since it can potentially crash
// if the OS does not handle the command.
// If in the future this can be done safely this function may not
// do anything.
//
// It updates the exported CPU variable in place, so the same concurrency
// caveat as for Detect applies.
func DetectARM() {
	// The second argument is the "safe" switch; false requests the
	// potentially unsafe detection path.
	addInfo(&CPU, false)
}
269
// Command line flag values. All three stay nil until Flags is called, which
// is why Detect checks each one for nil before dereferencing it.
var (
	detectArmFlag *bool   // cpu.arm
	displayFeats  *bool   // cpu.features
	disableFlag   *string // cpu.disable
)
273
// Flags will enable flags.
// This must be called *before* flag.Parse AND
// Detect must be called after the flags have been parsed.
// Note that this means that any detection used in init() functions
// will not contain these flags.
//
// The flags are registered on the default command line flag set of the
// standard flag package:
//
//	cpu.disable   comma separated list of features to disable
//	cpu.features  print the detected features and exit
//	cpu.arm       allow the potentially crashing ARM detection
func Flags() {
	disableFlag = flag.String("cpu.disable", "", "disable cpu features; comma separated list")
	displayFeats = flag.Bool("cpu.features", false, "lists cpu features and exits")
	detectArmFlag = flag.Bool("cpu.arm", false, "allow ARM features to be detected; can potentially crash")
}
284
285// Supports returns whether the CPU supports all of the requested features.
286func (c CPUInfo) Supports(ids ...FeatureID) bool {
287	for _, id := range ids {
288		if !c.featureSet.inSet(id) {
289			return false
290		}
291	}
292	return true
293}
294
// Has allows for checking a single feature.
// Should be inlined by the compiler.
//
// It returns true when id is set in the feature set of c.
func (c CPUInfo) Has(id FeatureID) bool {
	return c.featureSet.inSet(id)
}
300
301// Disable will disable one or several features.
302func (c *CPUInfo) Disable(ids ...FeatureID) bool {
303	for _, id := range ids {
304		c.featureSet.unset(id)
305	}
306	return true
307}
308
// Enable will enable one or several features even if they were undetected.
// This is of course not recommended for obvious reasons.
//
// The return value is always true.
func (c *CPUInfo) Enable(ids ...FeatureID) bool {
	for _, id := range ids {
		c.featureSet.set(id)
	}
	return true
}
317
// IsVendor returns true if the detected vendor ID of c equals v.
func (c CPUInfo) IsVendor(v Vendor) bool {
	return c.VendorID == v
}
322
323func (c CPUInfo) FeatureSet() []string {
324	s := make([]string, 0)
325	for _, f := range c.featureSet.Strings() {
326		s = append(s, f)
327	}
328	return s
329}
330
331// RTCounter returns the 64-bit time-stamp counter
332// Uses the RDTSCP instruction. The value 0 is returned
333// if the CPU does not support the instruction.
334func (c CPUInfo) RTCounter() uint64 {
335	if !c.Supports(RDTSCP) {
336		return 0
337	}
338	a, _, _, d := rdtscpAsm()
339	return uint64(a) | (uint64(d) << 32)
340}
341
342// Ia32TscAux returns the IA32_TSC_AUX part of the RDTSCP.
343// This variable is OS dependent, but on Linux contains information
344// about the current cpu/core the code is running on.
345// If the RDTSCP instruction isn't supported on the CPU, the value 0 is returned.
346func (c CPUInfo) Ia32TscAux() uint32 {
347	if !c.Supports(RDTSCP) {
348		return 0
349	}
350	_, _, ecx, _ := rdtscpAsm()
351	return ecx
352}
353
354// LogicalCPU will return the Logical CPU the code is currently executing on.
355// This is likely to change when the OS re-schedules the running thread
356// to another CPU.
357// If the current core cannot be detected, -1 will be returned.
358func (c CPUInfo) LogicalCPU() int {
359	if c.maxFunc < 1 {
360		return -1
361	}
362	_, ebx, _, _ := cpuid(1)
363	return int(ebx >> 24)
364}
365
366// hertz tries to compute the clock speed of the CPU. If leaf 15 is
367// supported, use it, otherwise parse the brand string. Yes, really.
368func hertz(model string) int64 {
369	mfi := maxFunctionID()
370	if mfi >= 0x15 {
371		eax, ebx, ecx, _ := cpuid(0x15)
372		if eax != 0 && ebx != 0 && ecx != 0 {
373			return int64((int64(ecx) * int64(ebx)) / int64(eax))
374		}
375	}
376	// computeHz determines the official rated speed of a CPU from its brand
377	// string. This insanity is *actually the official documented way to do
378	// this according to Intel*, prior to leaf 0x15 existing. The official
379	// documentation only shows this working for exactly `x.xx` or `xxxx`
380	// cases, e.g., `2.50GHz` or `1300MHz`; this parser will accept other
381	// sizes.
382	hz := strings.LastIndex(model, "Hz")
383	if hz < 3 {
384		return 0
385	}
386	var multiplier int64
387	switch model[hz-1] {
388	case 'M':
389		multiplier = 1000 * 1000
390	case 'G':
391		multiplier = 1000 * 1000 * 1000
392	case 'T':
393		multiplier = 1000 * 1000 * 1000 * 1000
394	}
395	if multiplier == 0 {
396		return 0
397	}
398	freq := int64(0)
399	divisor := int64(0)
400	decimalShift := int64(1)
401	var i int
402	for i = hz - 2; i >= 0 && model[i] != ' '; i-- {
403		if model[i] >= '0' && model[i] <= '9' {
404			freq += int64(model[i]-'0') * decimalShift
405			decimalShift *= 10
406		} else if model[i] == '.' {
407			if divisor != 0 {
408				return 0
409			}
410			divisor = decimalShift
411		} else {
412			return 0
413		}
414	}
415	// we didn't find a space
416	if i < 0 {
417		return 0
418	}
419	if divisor != 0 {
420		return (freq * multiplier) / divisor
421	}
422	return freq * multiplier
423}
424
425// VM Will return true if the cpu id indicates we are in
426// a virtual machine.
427func (c CPUInfo) VM() bool {
428	return CPU.featureSet.inSet(HYPERVISOR)
429}
430
// flags contains detected cpu features and characteristics.
// Each value is one 64-bit word of a flagSet.
type flags uint64

// log2(bits_in_uint64)
const flagBitsLog2 = 6

// flagBits is the number of feature bits stored per flags word (64).
const flagBits = 1 << flagBitsLog2

// flagMask extracts the bit position inside a word from a FeatureID.
const flagMask = flagBits - 1

// flagSet contains detected cpu features and characteristics in an array of flags.
// The array length is lastID divided by flagBits, rounded up, so every
// FeatureID in [0, lastID) has a bit. Feature f is stored in word
// f>>flagBitsLog2 at bit f&flagMask.
type flagSet [(lastID + flagMask) / flagBits]flags
441
442func (s flagSet) inSet(feat FeatureID) bool {
443	return s[feat>>flagBitsLog2]&(1<<(feat&flagMask)) != 0
444}
445
446func (s *flagSet) set(feat FeatureID) {
447	s[feat>>flagBitsLog2] |= 1 << (feat & flagMask)
448}
449
450// setIf will set a feature if boolean is true.
451func (s *flagSet) setIf(cond bool, features ...FeatureID) {
452	if cond {
453		for _, offset := range features {
454			s[offset>>flagBitsLog2] |= 1 << (offset & flagMask)
455		}
456	}
457}
458
459func (s *flagSet) unset(offset FeatureID) {
460	bit := flags(1 << (offset & flagMask))
461	s[offset>>flagBitsLog2] = s[offset>>flagBitsLog2] & ^bit
462}
463
464// or with another flagset.
465func (s *flagSet) or(other flagSet) {
466	for i, v := range other[:] {
467		s[i] |= v
468	}
469}
470
471// ParseFeature will parse the string and return the ID of the matching feature.
472// Will return UNKNOWN if not found.
473func ParseFeature(s string) FeatureID {
474	s = strings.ToUpper(s)
475	for i := firstID; i < lastID; i++ {
476		if i.String() == s {
477			return i
478		}
479	}
480	return UNKNOWN
481}
482
483// Strings returns an array of the detected features for FlagsSet.
484func (s flagSet) Strings() []string {
485	if len(s) == 0 {
486		return []string{""}
487	}
488	r := make([]string, 0)
489	for i := firstID; i < lastID; i++ {
490		if s.inSet(i) {
491			r = append(r, i.String())
492		}
493	}
494	return r
495}
496
// maxExtendedFunction returns EAX of CPUID leaf 0x80000000. Callers in this
// file compare it against an extended leaf number before querying that leaf.
func maxExtendedFunction() uint32 {
	eax, _, _, _ := cpuid(0x80000000)
	return eax
}
501
// maxFunctionID returns EAX of CPUID leaf 0. Callers in this file compare it
// against a standard leaf number before querying that leaf.
func maxFunctionID() uint32 {
	a, _, _, _ := cpuid(0)
	return a
}
506
507func brandName() string {
508	if maxExtendedFunction() >= 0x80000004 {
509		v := make([]uint32, 0, 48)
510		for i := uint32(0); i < 3; i++ {
511			a, b, c, d := cpuid(0x80000002 + i)
512			v = append(v, a, b, c, d)
513		}
514		return strings.Trim(string(valAsString(v...)), " ")
515	}
516	return "unknown"
517}
518
519func threadsPerCore() int {
520	mfi := maxFunctionID()
521	vend, _ := vendorID()
522
523	if mfi < 0x4 || (vend != Intel && vend != AMD) {
524		return 1
525	}
526
527	if mfi < 0xb {
528		if vend != Intel {
529			return 1
530		}
531		_, b, _, d := cpuid(1)
532		if (d & (1 << 28)) != 0 {
533			// v will contain logical core count
534			v := (b >> 16) & 255
535			if v > 1 {
536				a4, _, _, _ := cpuid(4)
537				// physical cores
538				v2 := (a4 >> 26) + 1
539				if v2 > 0 {
540					return int(v) / int(v2)
541				}
542			}
543		}
544		return 1
545	}
546	_, b, _, _ := cpuidex(0xb, 0)
547	if b&0xffff == 0 {
548		if vend == AMD {
549			// Workaround for AMD returning 0, assume 2 if >= Zen 2
550			// It will be more correct than not.
551			fam, _ := familyModel()
552			_, _, _, d := cpuid(1)
553			if (d&(1<<28)) != 0 && fam >= 23 {
554				return 2
555			}
556		}
557		return 1
558	}
559	return int(b & 0xffff)
560}
561
562func logicalCores() int {
563	mfi := maxFunctionID()
564	v, _ := vendorID()
565	switch v {
566	case Intel:
567		// Use this on old Intel processors
568		if mfi < 0xb {
569			if mfi < 1 {
570				return 0
571			}
572			// CPUID.1:EBX[23:16] represents the maximum number of addressable IDs (initial APIC ID)
573			// that can be assigned to logical processors in a physical package.
574			// The value may not be the same as the number of logical processors that are present in the hardware of a physical package.
575			_, ebx, _, _ := cpuid(1)
576			logical := (ebx >> 16) & 0xff
577			return int(logical)
578		}
579		_, b, _, _ := cpuidex(0xb, 1)
580		return int(b & 0xffff)
581	case AMD, Hygon:
582		_, b, _, _ := cpuid(1)
583		return int((b >> 16) & 0xff)
584	default:
585		return 0
586	}
587}
588
589func familyModel() (int, int) {
590	if maxFunctionID() < 0x1 {
591		return 0, 0
592	}
593	eax, _, _, _ := cpuid(1)
594	family := ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff)
595	model := ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0)
596	return int(family), int(model)
597}
598
599func physicalCores() int {
600	v, _ := vendorID()
601	switch v {
602	case Intel:
603		return logicalCores() / threadsPerCore()
604	case AMD, Hygon:
605		lc := logicalCores()
606		tpc := threadsPerCore()
607		if lc > 0 && tpc > 0 {
608			return lc / tpc
609		}
610
611		// The following is inaccurate on AMD EPYC 7742 64-Core Processor
612		if maxExtendedFunction() >= 0x80000008 {
613			_, _, c, _ := cpuid(0x80000008)
614			if c&0xff > 0 {
615				return int(c&0xff) + 1
616			}
617		}
618	}
619	return 0
620}
621
// vendorMapping translates the 12-byte vendor string returned by CPUID leaf 0
// into a Vendor value. Strings that are not listed map to VendorUnknown in
// vendorID.
//
// Excerpt from http://en.wikipedia.org/wiki/CPUID#EAX.3D0:_Get_vendor_ID
var vendorMapping = map[string]Vendor{
	"AMDisbetter!": AMD,
	"AuthenticAMD": AMD,
	"CentaurHauls": VIA,
	"GenuineIntel": Intel,
	"TransmetaCPU": Transmeta,
	"GenuineTMx86": Transmeta,
	"Geode by NSC": NSC,
	"VIA VIA VIA ": VIA,
	"KVMKVMKVMKVM": KVM,
	"Microsoft Hv": MSVM,
	"VMwareVMware": VMware,
	"XenVMMXenVMM": XenHVM,
	"bhyve bhyve ": Bhyve,
	"HygonGenuine": Hygon,
	"Vortex86 SoC": SiS,
	"SiS SiS SiS ": SiS,
	"RiseRiseRise": SiS,
	"Genuine  RDC": RDC,
}
643
644func vendorID() (Vendor, string) {
645	_, b, c, d := cpuid(0)
646	v := string(valAsString(b, d, c))
647	vend, ok := vendorMapping[v]
648	if !ok {
649		return VendorUnknown, v
650	}
651	return vend, v
652}
653
654func cacheLine() int {
655	if maxFunctionID() < 0x1 {
656		return 0
657	}
658
659	_, ebx, _, _ := cpuid(1)
660	cache := (ebx & 0xff00) >> 5 // cflush size
661	if cache == 0 && maxExtendedFunction() >= 0x80000006 {
662		_, _, ecx, _ := cpuid(0x80000006)
663		cache = ecx & 0xff // cacheline size
664	}
665	// TODO: Read from Cache and TLB Information
666	return int(cache)
667}
668
669func (c *CPUInfo) cacheSize() {
670	c.Cache.L1D = -1
671	c.Cache.L1I = -1
672	c.Cache.L2 = -1
673	c.Cache.L3 = -1
674	vendor, _ := vendorID()
675	switch vendor {
676	case Intel:
677		if maxFunctionID() < 4 {
678			return
679		}
680		for i := uint32(0); ; i++ {
681			eax, ebx, ecx, _ := cpuidex(4, i)
682			cacheType := eax & 15
683			if cacheType == 0 {
684				break
685			}
686			cacheLevel := (eax >> 5) & 7
687			coherency := int(ebx&0xfff) + 1
688			partitions := int((ebx>>12)&0x3ff) + 1
689			associativity := int((ebx>>22)&0x3ff) + 1
690			sets := int(ecx) + 1
691			size := associativity * partitions * coherency * sets
692			switch cacheLevel {
693			case 1:
694				if cacheType == 1 {
695					// 1 = Data Cache
696					c.Cache.L1D = size
697				} else if cacheType == 2 {
698					// 2 = Instruction Cache
699					c.Cache.L1I = size
700				} else {
701					if c.Cache.L1D < 0 {
702						c.Cache.L1I = size
703					}
704					if c.Cache.L1I < 0 {
705						c.Cache.L1I = size
706					}
707				}
708			case 2:
709				c.Cache.L2 = size
710			case 3:
711				c.Cache.L3 = size
712			}
713		}
714	case AMD, Hygon:
715		// Untested.
716		if maxExtendedFunction() < 0x80000005 {
717			return
718		}
719		_, _, ecx, edx := cpuid(0x80000005)
720		c.Cache.L1D = int(((ecx >> 24) & 0xFF) * 1024)
721		c.Cache.L1I = int(((edx >> 24) & 0xFF) * 1024)
722
723		if maxExtendedFunction() < 0x80000006 {
724			return
725		}
726		_, _, ecx, _ = cpuid(0x80000006)
727		c.Cache.L2 = int(((ecx >> 16) & 0xFFFF) * 1024)
728
729		// CPUID Fn8000_001D_EAX_x[N:0] Cache Properties
730		if maxExtendedFunction() < 0x8000001D {
731			return
732		}
733		for i := uint32(0); i < math.MaxUint32; i++ {
734			eax, ebx, ecx, _ := cpuidex(0x8000001D, i)
735
736			level := (eax >> 5) & 7
737			cacheNumSets := ecx + 1
738			cacheLineSize := 1 + (ebx & 2047)
739			cachePhysPartitions := 1 + ((ebx >> 12) & 511)
740			cacheNumWays := 1 + ((ebx >> 22) & 511)
741
742			typ := eax & 15
743			size := int(cacheNumSets * cacheLineSize * cachePhysPartitions * cacheNumWays)
744			if typ == 0 {
745				return
746			}
747
748			switch level {
749			case 1:
750				switch typ {
751				case 1:
752					// Data cache
753					c.Cache.L1D = size
754				case 2:
755					// Inst cache
756					c.Cache.L1I = size
757				default:
758					if c.Cache.L1D < 0 {
759						c.Cache.L1I = size
760					}
761					if c.Cache.L1I < 0 {
762						c.Cache.L1I = size
763					}
764				}
765			case 2:
766				c.Cache.L2 = size
767			case 3:
768				c.Cache.L3 = size
769			}
770		}
771	}
772
773	return
774}
775
// SGXEPCSection describes one EPC section as reported by a sub-leaf of CPUID
// leaf 0x12 (see hasSGX).
type SGXEPCSection struct {
	BaseAddress uint64 // Base address assembled from EAX[31:12] and EBX[19:0] of the sub-leaf
	EPCSize     uint64 // Size assembled from ECX[31:12] and EDX[19:0] of the sub-leaf
}

// SGXSupport holds the SGX details collected by hasSGX. When Available is
// false all other fields keep their zero values.
type SGXSupport struct {
	Available           bool            // Copied from the "available" argument of hasSGX
	LaunchControl       bool            // Copied from the "lc" argument of hasSGX
	SGX1Supported       bool            // Bit 0 of EAX from CPUID leaf 0x12, sub-leaf 0
	SGX2Supported       bool            // Bit 1 of EAX from CPUID leaf 0x12, sub-leaf 0
	MaxEnclaveSizeNot64 int64           // 1 << EDX[7:0] of CPUID leaf 0x12, sub-leaf 0
	MaxEnclaveSize64    int64           // 1 << EDX[15:8] of CPUID leaf 0x12, sub-leaf 0
	EPCSections         []SGXEPCSection // EPC sections found in sub-leaves 2 and up
}
790
791func hasSGX(available, lc bool) (rval SGXSupport) {
792	rval.Available = available
793
794	if !available {
795		return
796	}
797
798	rval.LaunchControl = lc
799
800	a, _, _, d := cpuidex(0x12, 0)
801	rval.SGX1Supported = a&0x01 != 0
802	rval.SGX2Supported = a&0x02 != 0
803	rval.MaxEnclaveSizeNot64 = 1 << (d & 0xFF)     // pow 2
804	rval.MaxEnclaveSize64 = 1 << ((d >> 8) & 0xFF) // pow 2
805	rval.EPCSections = make([]SGXEPCSection, 0)
806
807	for subleaf := uint32(2); subleaf < 2+8; subleaf++ {
808		eax, ebx, ecx, edx := cpuidex(0x12, subleaf)
809		leafType := eax & 0xf
810
811		if leafType == 0 {
812			// Invalid subleaf, stop iterating
813			break
814		} else if leafType == 1 {
815			// EPC Section subleaf
816			baseAddress := uint64(eax&0xfffff000) + (uint64(ebx&0x000fffff) << 32)
817			size := uint64(ecx&0xfffff000) + (uint64(edx&0x000fffff) << 32)
818
819			section := SGXEPCSection{BaseAddress: baseAddress, EPCSize: size}
820			rval.EPCSections = append(rval.EPCSections, section)
821		}
822	}
823
824	return
825}
826
// support builds the set of detected x86 features.
//
// It queries CPUID leaf 1, leaf 7 (sub-leaves 0 and 1) and the extended
// leaves 0x80000001, 0x80000008 and 0x8000001b, each only when the reported
// maximum leaf allows it. AVX and AVX-512 features are additionally gated on
// the operating system state reported by XGETBV(0). An empty set is returned
// when leaf 1 is not available.
func support() flagSet {
	var fs flagSet
	mfi := maxFunctionID()
	vend, _ := vendorID()
	if mfi < 0x1 {
		return fs
	}
	family, model := familyModel()

	// CPUID leaf 1: c is ECX, d is EDX.
	_, _, c, d := cpuid(1)
	fs.setIf((d&(1<<15)) != 0, CMOV)
	fs.setIf((d&(1<<23)) != 0, MMX)
	// EDX bit 25 sets both MMXEXT and SSE.
	fs.setIf((d&(1<<25)) != 0, MMXEXT)
	fs.setIf((d&(1<<25)) != 0, SSE)
	fs.setIf((d&(1<<26)) != 0, SSE2)
	fs.setIf((c&1) != 0, SSE3)
	fs.setIf((c&(1<<5)) != 0, VMX)
	fs.setIf((c&0x00000200) != 0, SSSE3)
	fs.setIf((c&0x00080000) != 0, SSE4)
	fs.setIf((c&0x00100000) != 0, SSE42)
	fs.setIf((c&(1<<25)) != 0, AESNI)
	fs.setIf((c&(1<<1)) != 0, CLMUL)
	fs.setIf(c&(1<<23) != 0, POPCNT)
	fs.setIf(c&(1<<30) != 0, RDRAND)

	// This bit has been reserved by Intel & AMD for use by hypervisors,
	// and indicates the presence of a hypervisor.
	fs.setIf(c&(1<<31) != 0, HYPERVISOR)
	fs.setIf(c&(1<<29) != 0, F16C)
	fs.setIf(c&(1<<13) != 0, CX16)

	// HTT is only reported when EDX bit 28 is set and more than one thread
	// per core is actually detected. The Intel and AMD checks are identical.
	if vend == Intel && (d&(1<<28)) != 0 && mfi >= 4 {
		fs.setIf(threadsPerCore() > 1, HTT)
	}
	if vend == AMD && (d&(1<<28)) != 0 && mfi >= 4 {
		fs.setIf(threadsPerCore() > 1, HTT)
	}
	// Check XGETBV/XSAVE (26), OXSAVE (27) and AVX (28) bits
	const avxCheck = 1<<26 | 1<<27 | 1<<28
	if c&avxCheck == avxCheck {
		// Check for OS support
		eax, _ := xgetbv(0)
		if (eax & 0x6) == 0x6 {
			fs.set(AVX)
			switch vend {
			case Intel:
				// Older than Haswell.
				fs.setIf(family == 6 && model < 60, AVXSLOW)
			case AMD:
				// Older than Zen 2
				fs.setIf(family < 23 || (family == 23 && model < 49), AVXSLOW)
			}
		}
	}
	// FMA3 can be used with SSE registers, so no OS support is strictly needed.
	// fma3 and OSXSAVE needed.
	const fma3Check = 1<<12 | 1<<27
	fs.setIf(c&fma3Check == fma3Check, FMA3)

	// Check AVX2, AVX2 requires OS support, but BMI1/2 don't.
	if mfi >= 7 {
		_, ebx, ecx, edx := cpuidex(7, 0)
		eax1, _, _, _ := cpuidex(7, 1)
		if fs.inSet(AVX) && (ebx&0x00000020) != 0 {
			fs.set(AVX2)
		}
		// CPUID.(EAX=7, ECX=0).EBX
		// BMI2 is only reported together with BMI1.
		if (ebx & 0x00000008) != 0 {
			fs.set(BMI1)
			fs.setIf((ebx&0x00000100) != 0, BMI2)
		}
		fs.setIf(ebx&(1<<2) != 0, SGX)
		fs.setIf(ebx&(1<<4) != 0, HLE)
		fs.setIf(ebx&(1<<9) != 0, ERMS)
		fs.setIf(ebx&(1<<11) != 0, RTM)
		fs.setIf(ebx&(1<<14) != 0, MPX)
		fs.setIf(ebx&(1<<18) != 0, RDSEED)
		fs.setIf(ebx&(1<<19) != 0, ADX)
		fs.setIf(ebx&(1<<29) != 0, SHA)
		// CPUID.(EAX=7, ECX=0).ECX
		fs.setIf(ecx&(1<<5) != 0, WAITPKG)
		fs.setIf(ecx&(1<<25) != 0, CLDEMOTE)
		fs.setIf(ecx&(1<<27) != 0, MOVDIRI)
		fs.setIf(ecx&(1<<28) != 0, MOVDIR64B)
		fs.setIf(ecx&(1<<29) != 0, ENQCMD)
		fs.setIf(ecx&(1<<30) != 0, SGXLC)
		// CPUID.(EAX=7, ECX=0).EDX
		fs.setIf(edx&(1<<14) != 0, SERIALIZE)
		fs.setIf(edx&(1<<16) != 0, TSXLDTRK)
		fs.setIf(edx&(1<<26) != 0, IBPB)
		fs.setIf(edx&(1<<27) != 0, STIBP)

		// Only detect AVX-512 features if XGETBV is supported
		if c&((1<<26)|(1<<27)) == (1<<26)|(1<<27) {
			// Check for OS support
			eax, _ := xgetbv(0)

			// Verify that XCR0[7:5] = '111b' (OPMASK state, upper 256-bit of ZMM0-ZMM15 and
			// ZMM16-ZMM31 state are enabled by OS)
			// and that XCR0[2:1] = '11b' (XMM state and YMM state are enabled by OS).
			hasAVX512 := (eax>>5)&7 == 7 && (eax>>1)&3 == 3
			// On darwin the XCR0 result is replaced by the AVX bit combined
			// with the darwinHasAVX512 hook.
			if runtime.GOOS == "darwin" {
				hasAVX512 = fs.inSet(AVX) && darwinHasAVX512()
			}
			if hasAVX512 {
				fs.setIf(ebx&(1<<16) != 0, AVX512F)
				fs.setIf(ebx&(1<<17) != 0, AVX512DQ)
				fs.setIf(ebx&(1<<21) != 0, AVX512IFMA)
				fs.setIf(ebx&(1<<26) != 0, AVX512PF)
				fs.setIf(ebx&(1<<27) != 0, AVX512ER)
				fs.setIf(ebx&(1<<28) != 0, AVX512CD)
				fs.setIf(ebx&(1<<30) != 0, AVX512BW)
				fs.setIf(ebx&(1<<31) != 0, AVX512VL)
				// ecx
				fs.setIf(ecx&(1<<1) != 0, AVX512VBMI)
				fs.setIf(ecx&(1<<6) != 0, AVX512VBMI2)
				fs.setIf(ecx&(1<<8) != 0, GFNI)
				fs.setIf(ecx&(1<<9) != 0, VAES)
				fs.setIf(ecx&(1<<10) != 0, VPCLMULQDQ)
				fs.setIf(ecx&(1<<11) != 0, AVX512VNNI)
				fs.setIf(ecx&(1<<12) != 0, AVX512BITALG)
				fs.setIf(ecx&(1<<14) != 0, AVX512VPOPCNTDQ)
				// edx
				fs.setIf(edx&(1<<8) != 0, AVX512VP2INTERSECT)
				fs.setIf(edx&(1<<22) != 0, AMXBF16)
				fs.setIf(edx&(1<<24) != 0, AMXTILE)
				fs.setIf(edx&(1<<25) != 0, AMXINT8)
				// eax1 = CPUID.(EAX=7, ECX=1).EAX
				fs.setIf(eax1&(1<<5) != 0, AVX512BF16)
			}
		}
	}

	// Extended leaf 0x80000001: c is ECX, d is EDX (shadowing the leaf 1
	// values inside this block).
	if maxExtendedFunction() >= 0x80000001 {
		_, _, c, d := cpuid(0x80000001)
		// ECX bit 5 sets LZCNT and also POPCNT.
		if (c & (1 << 5)) != 0 {
			fs.set(LZCNT)
			fs.set(POPCNT)
		}
		fs.setIf((c&(1<<10)) != 0, IBS)
		fs.setIf((d&(1<<31)) != 0, AMD3DNOW)
		fs.setIf((d&(1<<30)) != 0, AMD3DNOWEXT)
		fs.setIf((d&(1<<23)) != 0, MMX)
		fs.setIf((d&(1<<22)) != 0, MMXEXT)
		fs.setIf((c&(1<<6)) != 0, SSE4A)
		fs.setIf(d&(1<<20) != 0, NX)
		fs.setIf(d&(1<<27) != 0, RDTSCP)

		/* XOP and FMA4 use the AVX instruction coding scheme, so they can't be
		 * used unless the OS has AVX support. */
		if fs.inSet(AVX) {
			fs.setIf((c&0x00000800) != 0, XOP)
			fs.setIf((c&0x00010000) != 0, FMA4)
		}

	}
	if maxExtendedFunction() >= 0x80000008 {
		_, b, _, _ := cpuid(0x80000008)
		fs.setIf((b&(1<<9)) != 0, WBNOINVD)
	}

	// The IBS sub-features are only read when IBS itself was detected above.
	if maxExtendedFunction() >= 0x8000001b && fs.inSet(IBS) {
		eax, _, _, _ := cpuid(0x8000001b)
		fs.setIf((eax>>0)&1 == 1, IBSFFV)
		fs.setIf((eax>>1)&1 == 1, IBSFETCHSAM)
		fs.setIf((eax>>2)&1 == 1, IBSOPSAM)
		fs.setIf((eax>>3)&1 == 1, IBSRDWROPCNT)
		fs.setIf((eax>>4)&1 == 1, IBSOPCNT)
		fs.setIf((eax>>5)&1 == 1, IBSBRNTRGT)
		fs.setIf((eax>>6)&1 == 1, IBSOPCNTEXT)
		fs.setIf((eax>>7)&1 == 1, IBSRIPINVALIDCHK)
	}

	return fs
}
1002
// valAsString converts register values into the bytes they contain. Each
// value contributes four bytes, least significant byte first. The result ends
// just before the first zero byte, or after the last value if there is none.
func valAsString(values ...uint32) []byte {
	out := make([]byte, 0, 4*len(values))
	for _, v := range values {
		for shift := uint(0); shift < 32; shift += 8 {
			b := byte(v >> shift)
			if b == 0 {
				// NUL terminates the string.
				return out
			}
			out = append(out, b)
		}
	}
	return out
}
1024